QUDA v0.7.0
A library for QCD on GPUs
reduce_quda.cu
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 
5 #if (__COMPUTE_CAPABILITY__ >= 130)
6 #define QudaSumFloat double
7 #define QudaSumFloat2 double2
8 #define QudaSumFloat3 double3
9 #else
10 #define QudaSumFloat doublesingle
11 #define QudaSumFloat2 doublesingle2
12 #define QudaSumFloat3 doublesingle3
13 #include <double_single.h>
14 #endif
15 
16 #define REDUCE_MAX_BLOCKS 65536
17 
18 #define checkSpinor(a, b) \
19  { \
20  if (a.Precision() != b.Precision()) \
21  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
22  if (a.Length() != b.Length()) \
23  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
24  if (a.Stride() != b.Stride()) \
25  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
26  }
27 
28 #define checkLength(a, b) \
29  { \
30  if (a.Length() != b.Length()) \
31  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
32  if (a.Stride() != b.Stride()) \
33  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
34  }
35 
36 static struct {
37  const char *vol_str;
38  const char *aux_str;
39  char aux_tmp[quda::TuneKey::aux_n];
40 } blasStrings;
41 
42 // These are used for reduction kernels
43 static QudaSumFloat *d_reduce=0;
44 static QudaSumFloat *h_reduce=0;
45 static QudaSumFloat *hd_reduce=0;
46 static cudaEvent_t reduceEnd;
47 
48 namespace quda {
49 
50  cudaStream_t* getBlasStream();
51 
52  void initReduce()
53  {
54 
55  const int MaxReduce = 12;
56  // reduction buffer size
57  size_t bytes = MaxReduce*3*REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat); // Factor of N for composite reductions
58 
59 
60  if (!d_reduce) d_reduce = (QudaSumFloat *) device_malloc(bytes);
61 
62  // these arrays are actually oversized currently (only needs to be QudaSumFloat3)
63 
64  // if the device supports host-mapped memory then use a host-mapped array for the reduction
65  if (!h_reduce) {
66  // only use zero copy reductions when using 64-bit
67 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
68  if(deviceProp.canMapHostMemory) {
69  h_reduce = (QudaSumFloat *) mapped_malloc(bytes);
70  cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
71  } else
72 #endif
73  {
74  h_reduce = (QudaSumFloat *) pinned_malloc(bytes);
75  hd_reduce = d_reduce;
76  }
77  memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report that h_reduce is uninitialised
78  }
79 
80  cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
81 
82  checkCudaError();
83  }
84 
85  void endReduce(void)
86  {
87  if (d_reduce) {
88  device_free(d_reduce);
89  d_reduce = 0;
90  }
91  if (h_reduce) {
92  host_free(h_reduce);
93  h_reduce = 0;
94  }
95  hd_reduce = 0;
96 
97  cudaEventDestroy(reduceEnd);
98  }
99 
100  namespace reduce {
101 
102 #include <texture.h>
103 #include <reduce_core.h>
104 #include <reduce_mixed_core.h>
105 
106  } // namespace reduce
107 
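108  /**
109  Base class from which all reduction functors should derive.
110  */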
111  template <typename ReduceType, typename Float2, typename FloatN>
112  struct ReduceFunctor {
113 
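114  //! pre-computation routine called before the "M-loop"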
115  virtual __device__ void pre() { ; }
116 
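117  //! where the reduction is usually computed and any auxiliary operations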
118  virtual __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
119  FloatN &z, FloatN &w, FloatN &v) = 0;
120 
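121  //! post-computation routine called after the "M-loop"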
122  virtual __device__ void post(ReduceType &sum) { ; }
123 
124  };
125 
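126  /**
127  Return the L2 norm of x
128  */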
129  __device__ double norm2_(const double2 &a) { return a.x*a.x + a.y*a.y; }
130  __device__ float norm2_(const float2 &a) { return a.x*a.x + a.y*a.y; }
131  __device__ float norm2_(const float4 &a) { return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }
132 
133  template <typename ReduceType, typename Float2, typename FloatN>
134 #if (__COMPUTE_CAPABILITY__ >= 200)
135  struct Norm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
136 #else
137  struct Norm2 {
138 #endif
139  Norm2(const Float2 &a, const Float2 &b) { ; }
140  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,FloatN &w, FloatN &v) { sum += norm2_(x); }
141  static int streams() { return 1; }
142  static int flops() { return 2; }
143  };
144 
145  double normCuda(const cudaColorSpinorField &x) {
146  cudaColorSpinorField &y = const_cast<cudaColorSpinorField&>(x); // cast away constness for the reduce interface
147  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0,0,0,false>
148  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
149  }
150 
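151  /**
152  Return the real dot product of x and y
153  */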
154  __device__ double dot_(const double2 &a, const double2 &b) { return a.x*b.x + a.y*b.y; }
155  __device__ float dot_(const float2 &a, const float2 &b) { return a.x*b.x + a.y*b.y; }
156  __device__ float dot_(const float4 &a, const float4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }
157 
158  template <typename ReduceType, typename Float2, typename FloatN>
159 #if (__COMPUTE_CAPABILITY__ >= 200)
160  struct Dot : public ReduceFunctor<ReduceType, Float2, FloatN> {
161 #else
162  struct Dot {
163 #endif
164  Dot(const Float2 &a, const Float2 &b) { ; }
165  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += dot_(x,y); }
166  static int streams() { return 2; }
167  static int flops() { return 2; }
168  };
169 
170  double reDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
171  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
172  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
173  }
174 
175 
176  void reDotProductCuda(double* result, std::vector<cudaColorSpinorField*>& x, std::vector<cudaColorSpinorField*>& y){
177 #ifndef SSTEP
178  errorQuda("S-step code not built\n");
179 #else
180  switch(x.size()){
181  case 1:
182  reduce::multiReduceCuda<1,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
183  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
184  break;
185  case 2:
186  reduce::multiReduceCuda<2,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
187  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
188  break;
189  case 3:
190  reduce::multiReduceCuda<3,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
191  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
192  break;
193  case 4:
194  reduce::multiReduceCuda<4,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
195  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
196  break;
197  case 5:
198  reduce::multiReduceCuda<5,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
199  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
200  break;
201  case 6:
202  reduce::multiReduceCuda<6,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
203  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
204  break;
205  case 7:
206  reduce::multiReduceCuda<7,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
207  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
208  break;
209  case 8:
210  reduce::multiReduceCuda<8,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
211  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
212  break;
213  case 9:
214  reduce::multiReduceCuda<9,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
215  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
216  break;
217  case 10:
218  reduce::multiReduceCuda<10,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
219  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
220  break;
221  case 11:
222  reduce::multiReduceCuda<11,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
223  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
224  break;
225  case 12:
226  reduce::multiReduceCuda<12,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
227  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
228  break;
229  case 13:
230  reduce::multiReduceCuda<13,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
231  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
232  break;
233  case 14:
234  reduce::multiReduceCuda<14,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
235  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
236  break;
237  case 15:
238  reduce::multiReduceCuda<15,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
239  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
240  break;
241  case 16:
242  reduce::multiReduceCuda<16,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
243  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
244  break;
245  case 17:
246  reduce::multiReduceCuda<17,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
247  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
248  break;
249  case 18:
250  reduce::multiReduceCuda<18,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
251  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
252  break;
253  case 19:
254  reduce::multiReduceCuda<19,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
255  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
256  break;
257  case 20:
258  reduce::multiReduceCuda<20,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
259  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
260  break;
261  default:
262  errorQuda("Unsupported vector size");
263  break;
264  }
265 #endif // SSTEP
266  }
267 
268 
269  /*
270  returns the real component of the dot product of a and b
271  and the norm of a
272  */
273  __device__ double2 dotNormA_(const double2 &a, const double2 &b)
274  { return make_double2(a.x*b.x + a.y*b.y, a.x*a.x + a.y*a.y); }
275 
276  __device__ double2 dotNormA_(const float2 &a, const float2 &b)
277  { return make_double2(a.x*b.x + a.y*b.y, a.x*a.x + a.y*a.y); }
278 
279 
280  __device__ double2 dotNormA_(const float4 &a, const float4 & b)
281  { return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
282 
283 
284 
285  template <typename ReduceType, typename Float2, typename FloatN>
286 #if (__COMPUTE_CAPABILITY__ >= 200)
287  struct DotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
288 #else
289  struct DotNormA {
290 #endif
291  DotNormA(const Float2 &a, const Float2 &b){}
292  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v){sum += dotNormA_(x,y);}
293  static int streams() { return 2; }
294  static int flops() { return 4; }
295  };
296 
297  double2 reDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
298  return reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,DotNormA,0,0,0,0,0,false>
299  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
300  }
301 
302 
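303  /**
304  First performs the operation y[i] += a*x[i]
305  Second returns the norm of y
306  */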
307  template <typename ReduceType, typename Float2, typename FloatN>
308 #if (__COMPUTE_CAPABILITY__ >= 200)
309  struct axpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
310 #else
311  struct axpyNorm2 {
312 #endif
313  Float2 a;
314  axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
315  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
316  y += a.x*x; sum += norm2_(y); }
317  static int streams() { return 3; }
318  static int flops() { return 4; }
319  };
320 
321  double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
322  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
323  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
324  }
325 
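326  /**
327  First performs the operation y[i] = x[i] - y[i]
328  Second returns the norm of y
329  */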
330  template <typename ReduceType, typename Float2, typename FloatN>
331 #if (__COMPUTE_CAPABILITY__ >= 200)
332  struct xmyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
333 #else
334  struct xmyNorm2 {
335 #endif
336  xmyNorm2(const Float2 &a, const Float2 &b) { ; }
337  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
338  y = x - y; sum += norm2_(y); }
339  static int streams() { return 3; }
340  static int flops() { return 3; }
341  };
342 
343  double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
344  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
345  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
346  }
347 
348 
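349  /**
350  Functor to perform the operation y += a*x (complex-valued)
351  */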
353  __device__ void Caxpy_(const float2 &a, const float4 &x, float4 &y) {
354  y.x += a.x*x.x; y.x -= a.y*x.y;
355  y.y += a.y*x.x; y.y += a.x*x.y;
356  y.z += a.x*x.z; y.z -= a.y*x.w;
357  y.w += a.y*x.z; y.w += a.x*x.w;
358  }
359 
360  __device__ void Caxpy_(const float2 &a, const float2 &x, float2 &y) {
361  y.x += a.x*x.x; y.x -= a.y*x.y;
362  y.y += a.y*x.x; y.y += a.x*x.y;
363  }
364 
365  __device__ void Caxpy_(const double2 &a, const double2 &x, double2 &y) {
366  y.x += a.x*x.x; y.x -= a.y*x.y;
367  y.y += a.y*x.x; y.y += a.x*x.y;
368  }
369 
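370  /**
371  First performs the operation y[i] = a*x[i] + y[i] (complex-valued)
372  Second returns the norm of y
373  */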
374  template <typename ReduceType, typename Float2, typename FloatN>
375 #if (__COMPUTE_CAPABILITY__ >= 200)
376  struct caxpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
377 #else
378  struct caxpyNorm2 {
379 #endif
380  Float2 a;
381  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
382  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
383  Caxpy_(a, x, y); sum += norm2_(y); }
384  static int streams() { return 3; }
385  static int flops() { return 6; }
386  };
387 
388  double caxpyNormCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
389  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
390  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);
391  }
392 
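393  /**
394  First performs the operation y[i] = a*x[i] + y[i] (complex-valued)
395  Second performs the operation x[i] -= a.x*z[i]
396  Third returns the norm of x
397  */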
400  template <typename ReduceType, typename Float2, typename FloatN>
401 #if (__COMPUTE_CAPABILITY__ >= 200)
402  struct caxpyxmaznormx : public ReduceFunctor<ReduceType, Float2, FloatN> {
403 #else
404  struct caxpyxmaznormx {
405 #endif
406  Float2 a;
407  caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
408  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, y); x-= a.x*z; sum += norm2_(x); }
409  static int streams() { return 5; }
410  static int flops() { return 10; }
411  };
412 
413  double caxpyXmazNormXCuda(const Complex &a, cudaColorSpinorField &x,
414  cudaColorSpinorField &y, cudaColorSpinorField &z) {
415  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
416  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
417  }
418 
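419  /**
420  First performs the operation x[i] *= a
421  Second performs the operation y[i] += b*x[i] (complex-valued)
422  Third returns the norm of y
423  */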
426  template <typename ReduceType, typename Float2, typename FloatN>
427 #if (__COMPUTE_CAPABILITY__ >= 200)
428  struct cabxpyaxnorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
429 #else
430  struct cabxpyaxnorm {
431 #endif
432  Float2 a;
433  Float2 b;
434  cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
435  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { x *= a.x; Caxpy_(b, x, y); sum += norm2_(y); }
436  static int streams() { return 4; }
437  static int flops() { return 10; }
438  };
439 
440  double cabxpyAxNormCuda(const double &a, const Complex &b,
441  cudaColorSpinorField &x, cudaColorSpinorField &y) {
442  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
443  (make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, x, x, x);
444  }
445 
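446  /**
447  Returns the complex dot product of x and y
448  */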
449  __device__ double2 cdot_(const double2 &a, const double2 &b)
450  { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
451  __device__ double2 cdot_(const float2 &a, const float2 &b)
452  { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
453  __device__ double2 cdot_(const float4 &a, const float4 &b)
454  { return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }
455 
456  template <typename ReduceType, typename Float2, typename FloatN>
457 #if (__COMPUTE_CAPABILITY__ >= 200)
458  struct Cdot : public ReduceFunctor<ReduceType, Float2, FloatN> {
459 #else
460  struct Cdot {
461 #endif
462  Cdot(const Float2 &a, const Float2 &b) { ; }
463  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdot_(x,y); }
464  static int streams() { return 2; }
465  static int flops() { return 4; }
466  };
467 
468  Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
469  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
470  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
471  return Complex(cdot.x, cdot.y);
472  }
473 
474  void cDotProductCuda(Complex* result, std::vector<cudaColorSpinorField*>& x, std::vector<cudaColorSpinorField*>& y){
475 #ifndef SSTEP
476  errorQuda("S-step code not built\n");
477 #else
478  double2* cdot = new double2[x.size()];
479 
480  switch(x.size()){
481  case 1:
482  reduce::multiReduceCuda<1,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
483  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
484  break;
485  case 6:
486  reduce::multiReduceCuda<6,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
487  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
488  break;
489  case 10:
490  reduce::multiReduceCuda<10,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
491  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
492  break;
493  case 14:
494  reduce::multiReduceCuda<14,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
495  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
496  break;
497  case 18:
498  reduce::multiReduceCuda<18,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
499  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
500  break;
501  case 22:
502  reduce::multiReduceCuda<22,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
503  (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
504  break;
505  default:
506  errorQuda("Unsupported vector size\n");
507  break;
508  }
509 
510  for(int i=0; i<x.size(); ++i) result[i] = Complex(cdot[i].x,cdot[i].y);
511  delete[] cdot;
512 #endif
513  }
514 
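515  /**
516  First performs the operation y = x + a*y
517  Second returns the complex dot product (z,y)
518  */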
521  template <typename ReduceType, typename Float2, typename FloatN>
522 #if (__COMPUTE_CAPABILITY__ >= 200)
523  struct xpaycdotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
524 #else
525  struct xpaycdotzy {
526 #endif
527  Float2 a;
528  xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
529  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { y = x + a.x*y; sum += cdot_(z,y); }
530  static int streams() { return 4; }
531  static int flops() { return 6; }
532  };
533 
534  Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z) {
535  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0,0,0,false>
536  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
537  return Complex(cdot.x, cdot.y);
538  }
539 
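540  /**
541  First performs the operation y[i] = a*x[i] + y[i] (complex-valued)
542  Second returns the complex dot product (z,y)
543  */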
546  template <typename ReduceType, typename Float2, typename FloatN>
547 #if (__COMPUTE_CAPABILITY__ >= 200)
548  struct caxpydotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
549 #else
550  struct caxpydotzy {
551 #endif
552  Float2 a;
553  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
554  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, y); sum += cdot_(z,y); }
555  static int streams() { return 4; }
556  static int flops() { return 8; }
557  };
558 
559  Complex caxpyDotzyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y,
560  cudaColorSpinorField &z) {
561  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0,0,0,false>
562  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
563  return Complex(cdot.x, cdot.y);
564  }
565 
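566  /**
567  First returns the complex dot product (x,y)
568  Second returns the norm of x
569  */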
570  __device__ double3 cdotNormA_(const double2 &a, const double2 &b)
571  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
572  __device__ double3 cdotNormA_(const float2 &a, const float2 &b)
573  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
574  __device__ double3 cdotNormA_(const float4 &a, const float4 &b)
575  { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
576  a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
577  a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
578 
579  template <typename ReduceType, typename Float2, typename FloatN>
580 #if (__COMPUTE_CAPABILITY__ >= 200)
581  struct CdotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
582 #else
583  struct CdotNormA {
584 #endif
585  CdotNormA(const Float2 &a, const Float2 &b) { ; }
586  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdotNormA_(x,y); }
587  static int streams() { return 2; }
588  static int flops() { return 6; }
589  };
590 
591  double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
592  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0,0,0,false>
593  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
594  }
595 
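596  /**
597  First returns the complex dot product (x,y)
598  Second returns the norm of y
599  */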
600  __device__ double3 cdotNormB_(const double2 &a, const double2 &b)
601  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
602  __device__ double3 cdotNormB_(const float2 &a, const float2 &b)
603  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
604  __device__ double3 cdotNormB_(const float4 &a, const float4 &b)
605  { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
606  b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }
607 
608  template <typename ReduceType, typename Float2, typename FloatN>
609 #if (__COMPUTE_CAPABILITY__ >= 200)
610  struct CdotNormB : public ReduceFunctor<ReduceType, Float2, FloatN> {
611 #else
612  struct CdotNormB {
613 #endif
614  CdotNormB(const Float2 &a, const Float2 &b) { ; }
615  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdotNormB_(x,y); }
616  static int streams() { return 2; }
617  static int flops() { return 6; }
618  };
619 
620  double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
621  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0,0,0,false>
622  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
623  }
624 
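625  /**
626  This convoluted kernel does the following:
627  z += a*x + b*y, y -= b*w, norm = (y,y), dot = (u,y)
628  */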
629  template <typename ReduceType, typename Float2, typename FloatN>
630 #if (__COMPUTE_CAPABILITY__ >= 200)
631  struct caxpbypzYmbwcDotProductUYNormY : public ReduceFunctor<ReduceType, Float2, FloatN> {
632 #else
633  struct caxpbypzYmbwcDotProductUYNormY {
634 #endif
635  Float2 a;
636  Float2 b;
637  caxpbypzYmbwcDotProductUYNormY(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
638  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, z); Caxpy_(b, y, z); Caxpy_(-b, w, y); sum += cdotNormB_(v,y); }
639  static int streams() { return 7; }
640  static int flops() { return 18; }
641  };
642 
643  double3 caxpbypzYmbwcDotProductUYNormYCuda(const Complex &a, cudaColorSpinorField &x,
644  const Complex &b, cudaColorSpinorField &y,
645  cudaColorSpinorField &z, cudaColorSpinorField &w,
646  cudaColorSpinorField &u) {
647  if (x.Precision() != z.Precision()) {
648  return reduce::mixed::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
649  (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
650 
651  } else {
652  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
653  (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
654  }
655  }
656 
657 
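658  /**
659  First performs the operation y[i] += a*x[i]
660  Second returns the norm of the updated y together with the
661  real dot product (y_new, y_new - y_old)
662  */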
664  template <typename ReduceType, typename Float2, typename FloatN>
665 #if (__COMPUTE_CAPABILITY__ >= 200)
666  struct axpyCGNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
667 #else
668  struct axpyCGNorm2 {
669 #endif
670  Float2 a;
671  axpyCGNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
672  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
673  FloatN y_new = y + a.x*x;
674  sum.x += norm2_(y_new);
675  sum.y += dot_(y_new, y_new-y);
676  y = y_new;
677  }
678  static int streams() { return 3; }
679  static int flops() { return 6; }
680  };
681 
682  Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
683  double2 cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,axpyCGNorm2,0,1,0,0,0,false>
684  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
685  return Complex(cg_norm.x, cg_norm.y);
686  }
687 
688 #if (__COMPUTE_CAPABILITY__ >= 200)
689 
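690  /**
691  Functor to compute the heavy quark residual norm: accumulates
692  the norm of the solution x, the norm of the residual r, and the
693  site-wise ratio |r|^2 / |x|^2; the caller normalises the ratio
694  by the lattice volume.
695  */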
702  template <typename ReduceType, typename Float2, typename FloatN>
703  struct HeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
704  Float2 a;
705  Float2 b;
706  ReduceType aux;
707  HeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
708 
709  __device__ void pre() { aux.x = 0; aux.y = 0; }
710 
711  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { aux.x += norm2_(x); aux.y += norm2_(y); }
712 
714  __device__ void post(ReduceType &sum)
715  {
716  sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
717  }
718 
719  static int streams() { return 2; }
720  static int flops() { return 4; }
721  };
722 
723  double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r) {
724  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,HeavyQuarkResidualNorm,0,0,0,0,0,true>
725  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
726 #ifdef MULTI_GPU
727  rtn.z /= (x.Volume()*comm_size());
728 #else
729  rtn.z /= x.Volume();
730 #endif
731  return rtn;
732  }
733 
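734  /**
735  Variant of the HeavyQuarkResidualNorm kernel: the solution is
736  formed from the sum x + y, with z the residual vector.
737  */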
741  template <typename ReduceType, typename Float2, typename FloatN>
742  struct xpyHeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
743  Float2 a;
744  Float2 b;
745  ReduceType aux;
746  xpyHeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
747 
748  __device__ void pre() { aux.x = 0; aux.y = 0; }
749 
750  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
751  { aux.x += norm2_(x + y); aux.y += norm2_(z); }
752 
754  __device__ void post(ReduceType &sum)
755  {
756  sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
757  }
758 
759  static int streams() { return 3; }
760  static int flops() { return 5; }
761  };
762 
763  double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
764  cudaColorSpinorField &r) {
765  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,xpyHeavyQuarkResidualNorm,0,0,0,0,0,true>
766  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
767 #ifdef MULTI_GPU
768  rtn.z /= (x.Volume()*comm_size());
769 #else
770  rtn.z /= x.Volume();
771 #endif
772  return rtn;
773  }
774 
775 #else
776 
777  double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r) {
778  errorQuda("Not supported on pre-Fermi architectures");
779  return make_double3(0.0,0.0,0.0);
780  }
781 
782  double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
783  cudaColorSpinorField &r) {
784  errorQuda("Not supported on pre-Fermi architectures");
785  return make_double3(0.0,0.0,0.0);
786  }
787 
788 #endif
789 
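790  /**
791  First returns the norm of x
792  Second returns the norm of y
793  Third returns the real dot product (y,z)
794  */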
798  template <typename ReduceType, typename Float2, typename FloatN>
799 #if (__COMPUTE_CAPABILITY__ >= 200)
800  struct tripleCGReduction : public ReduceFunctor<ReduceType, Float2, FloatN> {
801 #else
802  struct tripleCGReduction {
803 #endif
804  tripleCGReduction(const Float2 &a, const Float2 &b) { ; }
805  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
806  { sum.x += norm2_(x); sum.y += norm2_(y); sum.z += dot_(y,z); }
807  static int streams() { return 3; }
808  static int flops() { return 6; }
809  };
810 
811  double3 tripleCGReductionCuda(cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z) {
812  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,tripleCGReduction,0,0,0,0,0,false>
813  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
814  }
815 
816 } // namespace quda