QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
reduce_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 
5 #if (__COMPUTE_CAPABILITY__ >= 130)
6 #define QudaSumFloat double
7 #define QudaSumFloat2 double2
8 #define QudaSumFloat3 double3
9 #else
10 #define QudaSumFloat doublesingle
11 #define QudaSumFloat2 doublesingle2
12 #define QudaSumFloat3 doublesingle3
13 #include <double_single.h>
14 #endif
15 
16 #define REDUCE_MAX_BLOCKS 65536
17 
18 #define checkSpinor(a, b) \
19  { \
20  if (a.Precision() != b.Precision()) \
21  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
22  if (a.Length() != b.Length()) \
23  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
24  if (a.Stride() != b.Stride()) \
25  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
26  }
27 
28 static struct {
30  int stride;
31 } blasConstants;
32 
33 // These are used for reduction kernels
34 static QudaSumFloat *d_reduce=0;
35 static QudaSumFloat *h_reduce=0;
36 static QudaSumFloat *hd_reduce=0;
37 static cudaEvent_t reduceEnd;
38 
39 namespace quda {
40 
43  cudaStream_t* getBlasStream();
44 
45  void initReduce()
46  {
47  // reduction buffer size
48  size_t bytes = 3*REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat);
49 
50  if (!d_reduce) d_reduce = (QudaSumFloat *) device_malloc(bytes);
51 
52  // these arrays are actually oversized currently (only needs to be QudaSumFloat3)
53 
54  // if the device supports host-mapped memory then use a host-mapped array for the reduction
55  if (!h_reduce) {
56  // only use zero copy reductions when using 64-bit
57 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
58  if(deviceProp.canMapHostMemory) {
59  h_reduce = (QudaSumFloat *) mapped_malloc(bytes);
60  cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
61  } else
62 #endif
63  {
64  h_reduce = (QudaSumFloat *) pinned_malloc(bytes);
65  hd_reduce = d_reduce;
66  }
67  memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report h_reduce is unitialised
68  }
69 
70  cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
71 
73  }
74 
75  void endReduce(void)
76  {
77  if (d_reduce) {
78  device_free(d_reduce);
79  d_reduce = 0;
80  }
81  if (h_reduce) {
82  host_free(h_reduce);
83  h_reduce = 0;
84  }
85  hd_reduce = 0;
86 
87  cudaEventDestroy(reduceEnd);
88  }
89 
90  namespace reduce {
91 
92 #include <texture.h>
93 #include <reduce_core.h>
94 
95  } // namespace reduce
96 
100  template <typename ReduceType, typename Float2, typename FloatN>
101  struct ReduceFunctor {
102 
104  virtual __device__ void pre() { ; }
105 
107  virtual __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
108  FloatN &z, FloatN &w, FloatN &v) = 0;
109 
111  virtual __device__ void post(ReduceType &sum) { ; }
112 
113  };
114 
118  __device__ double norm2_(const double2 &a) { return a.x*a.x + a.y*a.y; }
119  __device__ float norm2_(const float2 &a) { return a.x*a.x + a.y*a.y; }
120  __device__ float norm2_(const float4 &a) { return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }
121 
122  template <typename ReduceType, typename Float2, typename FloatN>
123 #if (__COMPUTE_CAPABILITY__ >= 200)
124  struct Norm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
125 #else
126  struct Norm2 {
127 #endif
128  Norm2(const Float2 &a, const Float2 &b) { ; }
129  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,FloatN &w, FloatN &v) { sum += norm2_(x); }
130  static int streams() { return 1; }
131  static int flops() { return 2; }
132  };
133 
134  double normCuda(const cudaColorSpinorField &x) {
135  cudaColorSpinorField &y = (cudaColorSpinorField&)x; // FIXME
136  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0,0,0,false>
137  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
138  }
139 
143  __device__ double dot_(const double2 &a, const double2 &b) { return a.x*b.x + a.y*b.y; }
144  __device__ float dot_(const float2 &a, const float2 &b) { return a.x*b.x + a.y*b.y; }
145  __device__ float dot_(const float4 &a, const float4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }
146 
147  template <typename ReduceType, typename Float2, typename FloatN>
148 #if (__COMPUTE_CAPABILITY__ >= 200)
149  struct Dot : public ReduceFunctor<ReduceType, Float2, FloatN> {
150 #else
151  struct Dot {
152 #endif
153  Dot(const Float2 &a, const Float2 &b) { ; }
154  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += dot_(x,y); }
155  static int streams() { return 2; }
156  static int flops() { return 2; }
157  };
158 
160  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
161  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
162  }
163 
168  template <typename ReduceType, typename Float2, typename FloatN>
169 #if (__COMPUTE_CAPABILITY__ >= 200)
170  struct axpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
171 #else
172  struct axpyNorm2 {
173 #endif
174  Float2 a;
175  axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
176  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
177  y += a.x*x; sum += norm2_(y); }
178  static int streams() { return 3; }
179  static int flops() { return 4; }
180  };
181 
182  double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
183  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
184  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
185  }
186 
191  template <typename ReduceType, typename Float2, typename FloatN>
192 #if (__COMPUTE_CAPABILITY__ >= 200)
193  struct xmyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
194 #else
195  struct xmyNorm2 {
196 #endif
197  xmyNorm2(const Float2 &a, const Float2 &b) { ; }
198  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
199  y = x - y; sum += norm2_(y); }
200  static int streams() { return 3; }
201  static int flops() { return 3; }
202  };
203 
205  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
206  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
207  }
208 
209 
214  __device__ void Caxpy_(const float2 &a, const float4 &x, float4 &y) {
215  y.x += a.x*x.x; y.x -= a.y*x.y;
216  y.y += a.y*x.x; y.y += a.x*x.y;
217  y.z += a.x*x.z; y.z -= a.y*x.w;
218  y.w += a.y*x.z; y.w += a.x*x.w;
219  }
220 
221  __device__ void Caxpy_(const float2 &a, const float2 &x, float2 &y) {
222  y.x += a.x*x.x; y.x -= a.y*x.y;
223  y.y += a.y*x.x; y.y += a.x*x.y;
224  }
225 
226  __device__ void Caxpy_(const double2 &a, const double2 &x, double2 &y) {
227  y.x += a.x*x.x; y.x -= a.y*x.y;
228  y.y += a.y*x.x; y.y += a.x*x.y;
229  }
230 
235  template <typename ReduceType, typename Float2, typename FloatN>
236 #if (__COMPUTE_CAPABILITY__ >= 200)
237  struct caxpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
238 #else
239  struct caxpyNorm2 {
240 #endif
241  Float2 a;
242  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
243  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
244  Caxpy_(a, x, y); sum += norm2_(y); }
245  static int streams() { return 3; }
246  static int flops() { return 6; }
247  };
248 
250  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
251  (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, x, x, x);
252  }
253 
261  template <typename ReduceType, typename Float2, typename FloatN>
262 #if (__COMPUTE_CAPABILITY__ >= 200)
263  struct caxpyxmaznormx : public ReduceFunctor<ReduceType, Float2, FloatN> {
264 #else
265  struct caxpyxmaznormx {
266 #endif
267  Float2 a;
268  caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
269  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, y); x-= a.x*z; sum += norm2_(x); }
270  static int streams() { return 5; }
271  static int flops() { return 10; }
272  };
273 
276  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
277  (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
278  }
279 
287  template <typename ReduceType, typename Float2, typename FloatN>
288 #if (__COMPUTE_CAPABILITY__ >= 200)
289  struct cabxpyaxnorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
290 #else
291  struct cabxpyaxnorm {
292 #endif
293  Float2 a;
294  Float2 b;
295  cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
296  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { x *= a.x; Caxpy_(b, x, y); sum += norm2_(y); }
297  static int streams() { return 4; }
298  static int flops() { return 10; }
299  };
300 
301  double cabxpyAxNormCuda(const double &a, const Complex &b,
303  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
304  (make_double2(a, 0.0), make_double2(b.real(), b.imag()), x, y, x, x, x);
305  }
306 
310  __device__ double2 cdot_(const double2 &a, const double2 &b)
311  { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
312  __device__ double2 cdot_(const float2 &a, const float2 &b)
313  { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
314  __device__ double2 cdot_(const float4 &a, const float4 &b)
315  { return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }
316 
317  template <typename ReduceType, typename Float2, typename FloatN>
318 #if (__COMPUTE_CAPABILITY__ >= 200)
319  struct Cdot : public ReduceFunctor<ReduceType, Float2, FloatN> {
320 #else
321  struct Cdot {
322 #endif
323  Cdot(const Float2 &a, const Float2 &b) { ; }
324  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdot_(x,y); }
325  static int streams() { return 2; }
326  static int flops() { return 4; }
327  };
328 
330  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
331  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
332  return Complex(cdot.x, cdot.y);
333  }
334 
341  template <typename ReduceType, typename Float2, typename FloatN>
342 #if (__COMPUTE_CAPABILITY__ >= 200)
343  struct xpaycdotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
344 #else
345  struct xpaycdotzy {
346 #endif
347  Float2 a;
348  xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
349  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { y = x + a.x*y; sum += cdot_(z,y); }
350  static int streams() { return 4; }
351  static int flops() { return 6; }
352  };
353 
355  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0,0,0,false>
356  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
357  return Complex(cdot.x, cdot.y);
358  }
359 
366  template <typename ReduceType, typename Float2, typename FloatN>
367 #if (__COMPUTE_CAPABILITY__ >= 200)
368  struct caxpydotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
369 #else
370  struct caxpydotzy {
371 #endif
372  Float2 a;
373  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
374  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, y); sum += cdot_(z,y); }
375  static int streams() { return 4; }
376  static int flops() { return 8; }
377  };
378 
381  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0,0,0,false>
382  (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
383  return Complex(cdot.x, cdot.y);
384  }
385 
390  __device__ double3 cdotNormA_(const double2 &a, const double2 &b)
391  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
392  __device__ double3 cdotNormA_(const float2 &a, const float2 &b)
393  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
394  __device__ double3 cdotNormA_(const float4 &a, const float4 &b)
395  { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
396  a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
397  a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
398 
399  template <typename ReduceType, typename Float2, typename FloatN>
400 #if (__COMPUTE_CAPABILITY__ >= 200)
401  struct CdotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
402 #else
403  struct CdotNormA {
404 #endif
405  CdotNormA(const Float2 &a, const Float2 &b) { ; }
406  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdotNormA_(x,y); }
407  static int streams() { return 2; }
408  static int flops() { return 6; }
409  };
410 
412  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0,0,0,false>
413  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
414  }
415 
420  __device__ double3 cdotNormB_(const double2 &a, const double2 &b)
421  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
422  __device__ double3 cdotNormB_(const float2 &a, const float2 &b)
423  { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
424  __device__ double3 cdotNormB_(const float4 &a, const float4 &b)
425  { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
426  b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }
427 
428  template <typename ReduceType, typename Float2, typename FloatN>
429 #if (__COMPUTE_CAPABILITY__ >= 200)
430  struct CdotNormB : public ReduceFunctor<ReduceType, Float2, FloatN> {
431 #else
432  struct CdotNormB {
433 #endif
434  CdotNormB(const Float2 &a, const Float2 &b) { ; }
435  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { sum += cdotNormB_(x,y); }
436  static int streams() { return 2; }
437  static int flops() { return 6; }
438  };
439 
441  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0,0,0,false>
442  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
443  }
444 
449  template <typename ReduceType, typename Float2, typename FloatN>
450 #if (__COMPUTE_CAPABILITY__ >= 200)
451  struct caxpbypzYmbwcDotProductUYNormY : public ReduceFunctor<ReduceType, Float2, FloatN> {
452 #else
454 #endif
455  Float2 a;
456  Float2 b;
457  caxpbypzYmbwcDotProductUYNormY(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
458  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, z); Caxpy_(b, y, z); Caxpy_(-b, w, y); sum += cdotNormB_(v,y); }
459  static int streams() { return 7; }
460  static int flops() { return 18; }
461  };
462 
464  const Complex &b, cudaColorSpinorField &y,
467  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
468  (make_double2(a.real(), a.imag()), make_double2(b.real(), b.imag()), x, y, z, w, u);
469  }
470 
471 
478  template <typename ReduceType, typename Float2, typename FloatN>
479 #if (__COMPUTE_CAPABILITY__ >= 200)
480  struct axpyCGNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
481 #else
482  struct axpyCGNorm2 {
483 #endif
484  Float2 a;
485  axpyCGNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
486  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
487  FloatN y_new = y + a.x*x;
488  sum.x += norm2_(y_new);
489  sum.y += dot_(y_new, y_new-y);
490  y = y_new;
491  }
492  static int streams() { return 3; }
493  static int flops() { return 6; }
494  };
495 
497  double2 cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,axpyCGNorm2,0,1,0,0,0,false>
498  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
499  return Complex(cg_norm.x, cg_norm.y);
500  }
501 
502 #if (__COMPUTE_CAPABILITY__ >= 200)
503 
516  template <typename ReduceType, typename Float2, typename FloatN>
517  struct HeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
518  Float2 a;
519  Float2 b;
520  ReduceType aux;
521  HeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
522 
523  __device__ void pre() { aux.x = 0; aux.y = 0; }
524 
525  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { aux.x += norm2_(x); aux.y += norm2_(y); }
526 
528  __device__ void post(ReduceType &sum)
529  {
530  sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
531  }
532 
533  static int streams() { return 2; }
534  static int flops() { return 4; }
535  };
536 
537  double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r) {
538  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,HeavyQuarkResidualNorm,0,0,0,0,0,true>
539  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
540 #ifdef MULTI_GPU
541  rtn.z /= (x.Volume()*comm_size());
542 #else
543  rtn.z /= x.Volume();
544 #endif
545  return rtn;
546  }
547 
555  template <typename ReduceType, typename Float2, typename FloatN>
556  struct xpyHeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
557  Float2 a;
558  Float2 b;
559  ReduceType aux;
560  xpyHeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
561 
562  __device__ void pre() { aux.x = 0; aux.y = 0; }
563 
564  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
565  { aux.x += norm2_(x + y); aux.y += norm2_(z); }
566 
568  __device__ void post(ReduceType &sum)
569  {
570  sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
571  }
572 
573  static int streams() { return 3; }
574  static int flops() { return 5; }
575  };
576 
577  double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
578  cudaColorSpinorField &r) {
579  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,xpyHeavyQuarkResidualNorm,0,0,0,0,0,true>
580  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
581 #ifdef MULTI_GPU
582  rtn.z /= (x.Volume()*comm_size());
583 #else
584  rtn.z /= x.Volume();
585 #endif
586  return rtn;
587  }
588 
589 #else
590 
592  errorQuda("Not supported on pre-Fermi architectures");
593  return make_double3(0.0,0.0,0.0);
594  }
595 
598  errorQuda("Not supported on pre-Fermi architectures");
599  return make_double3(0.0,0.0,0.0);
600  }
601 
602 #endif
603 
612  template <typename ReduceType, typename Float2, typename FloatN>
613 #if (__COMPUTE_CAPABILITY__ >= 200)
614  struct tripleCGReduction : public ReduceFunctor<ReduceType, Float2, FloatN> {
615 #else
617 #endif
618  tripleCGReduction(const Float2 &a, const Float2 &b) { ; }
619  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
620  { sum.x += norm2_(x); sum.y += norm2_(y); sum.z += dot_(y,z); }
621  static int streams() { return 3; }
622  static int flops() { return 6; }
623  };
624 
626  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,tripleCGReduction,0,0,0,0,0,false>
627  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
628  }
629 
630 } // namespace quda