QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
reduce_core.cuh
Go to the documentation of this file.
1 #pragma once
2 
4 
5 #include <blas_helper.cuh>
6 #include <cub_helper.cuh>
7 
8 namespace quda
9 {
10 
11  namespace blas
12  {
13 
14 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
15 #include <texture.h>
16 
17  template <typename ReduceType, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW,
18  typename SpinorV, typename Reducer>
19  struct ReductionArg : public ReduceArg<ReduceType> {
20  SpinorX X;
21  SpinorY Y;
22  SpinorZ Z;
23  SpinorW W;
24  SpinorV V;
25  Reducer r;
26  const int length;
27  ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length) :
28  X(X),
29  Y(Y),
30  Z(Z),
31  W(W),
32  V(V),
33  r(r),
34  length(length)
35  {
36  ;
37  }
38  };
39 
43  template <int block_size, typename ReduceType, typename FloatN, int M, typename Arg>
44  __global__ void reduceKernel(Arg arg)
45  {
46  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
47  unsigned int parity = blockIdx.y;
48  unsigned int gridSize = gridDim.x * blockDim.x;
49 
50  ReduceType sum;
51  ::quda::zero(sum);
52 
53  while (i < arg.length) {
54  FloatN x[M], y[M], z[M], w[M], v[M];
55  arg.X.load(x, i, parity);
56  arg.Y.load(y, i, parity);
57  arg.Z.load(z, i, parity);
58  arg.W.load(w, i, parity);
59  arg.V.load(v, i, parity);
60 
61  arg.r.pre();
62 
63 #pragma unroll
64  for (int j = 0; j < M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);
65 
66  arg.r.post(sum);
67 
68  arg.X.save(x, i, parity);
69  arg.Y.save(y, i, parity);
70  arg.Z.save(z, i, parity);
71  arg.W.save(w, i, parity);
72  arg.V.save(v, i, parity);
73 
74  i += gridSize;
75  }
76 
77  ::quda::reduce<block_size, ReduceType>(arg, sum, parity);
78  }
79 
83  template <typename ReduceType, typename Float2, typename FloatN> struct ReduceFunctor {
84 
86  virtual __device__ __host__ void pre() { ; }
87 
89  virtual __device__ __host__ __host__ void operator()(
90  ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
91  = 0;
92 
94  virtual __device__ __host__ void post(ReduceType &sum) { ; }
95  };
96 
100  template <typename ReduceType> __device__ __host__ ReduceType norm1_(const double2 &a)
101  {
102  return (ReduceType)sqrt(a.x * a.x + a.y * a.y);
103  }
104 
105  template <typename ReduceType> __device__ __host__ ReduceType norm1_(const float2 &a)
106  {
107  return (ReduceType)sqrt(a.x * a.x + a.y * a.y);
108  }
109 
110  template <typename ReduceType> __device__ __host__ ReduceType norm1_(const float4 &a)
111  {
112  return (ReduceType)sqrt(a.x * a.x + a.y * a.y) + (ReduceType)sqrt(a.z * a.z + a.w * a.w);
113  }
114 
115  template <typename ReduceType, typename Float2, typename FloatN>
116  struct Norm1 : public ReduceFunctor<ReduceType, Float2, FloatN> {
117  Norm1(const Float2 &a, const Float2 &b) { ; }
118  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
119  {
120  sum += norm1_<ReduceType>(x);
121  }
122  static int streams() { return 1; }
123  static int flops() { return 2; }
124  };
125 
129  template <typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
130  {
131  sum += (ReduceType)a.x * (ReduceType)a.x;
132  sum += (ReduceType)a.y * (ReduceType)a.y;
133  }
134 
135  template <typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const float2 &a)
136  {
137  sum += (ReduceType)a.x * (ReduceType)a.x;
138  sum += (ReduceType)a.y * (ReduceType)a.y;
139  }
140 
141  template <typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const float4 &a)
142  {
143  sum += (ReduceType)a.x * (ReduceType)a.x;
144  sum += (ReduceType)a.y * (ReduceType)a.y;
145  sum += (ReduceType)a.z * (ReduceType)a.z;
146  sum += (ReduceType)a.w * (ReduceType)a.w;
147  }
148 
149  template <typename ReduceType, typename Float2, typename FloatN>
150  struct Norm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
151  Norm2(const Float2 &a, const Float2 &b) { ; }
152  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
153  {
154  norm2_<ReduceType>(sum, x);
155  }
156  static int streams() { return 1; }
157  static int flops() { return 2; }
158  };
159 
163  template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
164  {
165  sum += (ReduceType)a.x * (ReduceType)b.x;
166  sum += (ReduceType)a.y * (ReduceType)b.y;
167  }
168 
169  template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
170  {
171  sum += (ReduceType)a.x * (ReduceType)b.x;
172  sum += (ReduceType)a.y * (ReduceType)b.y;
173  }
174 
175  template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
176  {
177  sum += (ReduceType)a.x * (ReduceType)b.x;
178  sum += (ReduceType)a.y * (ReduceType)b.y;
179  sum += (ReduceType)a.z * (ReduceType)b.z;
180  sum += (ReduceType)a.w * (ReduceType)b.w;
181  }
182 
183  template <typename ReduceType, typename Float2, typename FloatN>
184  struct Dot : public ReduceFunctor<ReduceType, Float2, FloatN> {
185  Dot(const Float2 &a, const Float2 &b) { ; }
186  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
187  {
188  dot_<ReduceType>(sum, x, y);
189  }
190  static int streams() { return 2; }
191  static int flops() { return 2; }
192  };
193 
198  template <typename ReduceType, typename Float2, typename FloatN>
199  struct axpbyzNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
200  Float2 a;
201  Float2 b;
202  axpbyzNorm2(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
203  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
204  {
205  z = a.x * x + b.x * y;
206  norm2_<ReduceType>(sum, z);
207  }
208  static int streams() { return 3; }
209  static int flops() { return 4; }
210  };
211 
216  template <typename ReduceType, typename Float2, typename FloatN>
217  struct AxpyReDot : public ReduceFunctor<ReduceType, Float2, FloatN> {
218  Float2 a;
219  AxpyReDot(const Float2 &a, const Float2 &b) : a(a) { ; }
220  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
221  {
222  y += a.x * x;
223  dot_<ReduceType>(sum, x, y);
224  }
225  static int streams() { return 3; }
226  static int flops() { return 4; }
227  };
228 
232  __device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
233  {
234  y.x += a.x * x.x;
235  y.x -= a.y * x.y;
236  y.y += a.y * x.x;
237  y.y += a.x * x.y;
238  }
239  __device__ __host__ void Caxpy_(const float2 &a, const float2 &x, float2 &y)
240  {
241  y.x += a.x * x.x;
242  y.x -= a.y * x.y;
243  y.y += a.y * x.x;
244  y.y += a.x * x.y;
245  }
246  __device__ __host__ void Caxpy_(const float2 &a, const float4 &x, float4 &y)
247  {
248  y.x += a.x * x.x;
249  y.x -= a.y * x.y;
250  y.y += a.y * x.x;
251  y.y += a.x * x.y;
252  y.z += a.x * x.z;
253  y.z -= a.y * x.w;
254  y.w += a.y * x.z;
255  y.w += a.x * x.w;
256  }
257 
262  template <typename ReduceType, typename Float2, typename FloatN>
263  struct caxpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
264  Float2 a;
265  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
266  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
267  {
268  Caxpy_(a, x, y);
269  norm2_<ReduceType>(sum, y);
270  }
271  static int streams() { return 3; }
272  static int flops() { return 6; }
273  };
274 
281  template <typename ReduceType, typename Float2, typename FloatN>
282  struct caxpyxmaznormx : public ReduceFunctor<ReduceType, Float2, FloatN> {
283  Float2 a;
284  caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
285  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
286  {
287  Caxpy_(a, x, y);
288  Caxpy_(-a, z, x);
289  norm2_<ReduceType>(sum, x);
290  }
291  static int streams() { return 5; }
292  static int flops() { return 10; }
293  };
294 
301  template <typename ReduceType, typename Float2, typename FloatN>
302  struct cabxpyzaxnorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
303  Float2 a;
304  Float2 b;
305  cabxpyzaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
306  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
307  {
308  x *= a.x;
309  Caxpy_(b, x, y);
310  z = y;
311  norm2_<ReduceType>(sum, z);
312  }
313  static int streams() { return 4; }
314  static int flops() { return 10; }
315  };
316 
320  template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
321  {
322  typedef typename scalar<ReduceType>::type scalar;
323  sum.x += (scalar)a.x * (scalar)b.x;
324  sum.x += (scalar)a.y * (scalar)b.y;
325  sum.y += (scalar)a.x * (scalar)b.y;
326  sum.y -= (scalar)a.y * (scalar)b.x;
327  }
328 
329  template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b)
330  {
331  typedef typename scalar<ReduceType>::type scalar;
332  sum.x += (scalar)a.x * (scalar)b.x;
333  sum.x += (scalar)a.y * (scalar)b.y;
334  sum.y += (scalar)a.x * (scalar)b.y;
335  sum.y -= (scalar)a.y * (scalar)b.x;
336  }
337 
338  template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b)
339  {
340  typedef typename scalar<ReduceType>::type scalar;
341  sum.x += (scalar)a.x * (scalar)b.x;
342  sum.x += (scalar)a.y * (scalar)b.y;
343  sum.x += (scalar)a.z * (scalar)b.z;
344  sum.x += (scalar)a.w * (scalar)b.w;
345  sum.y += (scalar)a.x * (scalar)b.y;
346  sum.y -= (scalar)a.y * (scalar)b.x;
347  sum.y += (scalar)a.z * (scalar)b.w;
348  sum.y -= (scalar)a.w * (scalar)b.z;
349  }
350 
351  template <typename ReduceType, typename Float2, typename FloatN>
352  struct Cdot : public ReduceFunctor<ReduceType, Float2, FloatN> {
353  Cdot(const Float2 &a, const Float2 &b) { ; }
354  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
355  {
356  cdot_<ReduceType>(sum, x, y);
357  }
358  static int streams() { return 2; }
359  static int flops() { return 4; }
360  };
361 
367  template <typename ReduceType, typename Float2, typename FloatN>
368  struct caxpydotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
369  Float2 a;
370  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
371  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
372  {
373  Caxpy_(a, x, y);
374  cdot_<ReduceType>(sum, z, y);
375  }
376  static int streams() { return 4; }
377  static int flops() { return 8; }
378  };
379 
384  template <typename ReduceType, typename InputType>
385  __device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b)
386  {
387  typedef typename scalar<ReduceType>::type scalar;
388  typedef typename vector<scalar, 2>::type vec2;
389  cdot_<ReduceType>(sum, a, b);
390  norm2_<scalar>(sum.z, a);
391  }
392 
397  template <typename ReduceType, typename InputType>
398  __device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b)
399  {
400  typedef typename scalar<ReduceType>::type scalar;
401  typedef typename vector<scalar, 2>::type vec2;
402  cdot_<ReduceType>(sum, a, b);
403  norm2_<scalar>(sum.z, b);
404  }
405 
406  template <typename ReduceType, typename Float2, typename FloatN>
407  struct CdotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
408  CdotNormA(const Float2 &a, const Float2 &b) { ; }
409  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
410  {
411  cdotNormA_<ReduceType>(sum, x, y);
412  }
413  static int streams() { return 2; }
414  static int flops() { return 6; }
415  };
416 
421  template <typename ReduceType, typename Float2, typename FloatN>
422  struct caxpbypzYmbwcDotProductUYNormY_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
423  Float2 a;
424  Float2 b;
425  caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
426  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
427  {
428  Caxpy_(a, x, z);
429  Caxpy_(b, y, z);
430  Caxpy_(-b, w, y);
431  cdotNormB_<ReduceType>(sum, v, y);
432  }
433  static int streams() { return 7; }
434  static int flops() { return 18; }
435  };
436 
443  template <typename ReduceType, typename Float2, typename FloatN>
444  struct axpyCGNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
445  Float2 a;
446  axpyCGNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
447  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
448  {
449  typedef typename scalar<ReduceType>::type scalar;
450  FloatN z_new = z + a.x * x;
451  norm2_<scalar>(sum.x, z_new);
452  dot_<scalar>(sum.y, z_new, z_new - z);
453  z = z_new;
454  }
455  static int streams() { return 3; }
456  static int flops() { return 6; }
457  };
458 
470  template <typename ReduceType, typename Float2, typename FloatN>
471  struct HeavyQuarkResidualNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
472  typedef typename scalar<ReduceType>::type real;
473  Float2 a;
474  Float2 b;
475  ReduceType aux;
476  HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b) : a(a), b(b), aux {} { ; }
477 
478  __device__ __host__ void pre()
479  {
480  aux.x = 0;
481  aux.y = 0;
482  }
483 
484  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
485  {
486  norm2_<real>(aux.x, x);
487  norm2_<real>(aux.y, y);
488  }
489 
491  __device__ __host__ void post(ReduceType &sum)
492  {
493  sum.x += aux.x;
494  sum.y += aux.y;
495  sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
496  }
497 
498  static int streams() { return 2; }
499  static int flops() { return 4; }
500  };
501 
509  template <typename ReduceType, typename Float2, typename FloatN>
510  struct xpyHeavyQuarkResidualNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
511  typedef typename scalar<ReduceType>::type real;
512  Float2 a;
513  Float2 b;
514  ReduceType aux;
515  xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b) : a(a), b(b), aux {} { ; }
516 
517  __device__ __host__ void pre()
518  {
519  aux.x = 0;
520  aux.y = 0;
521  }
522 
523  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
524  {
525  norm2_<real>(aux.x, x + y);
526  norm2_<real>(aux.y, z);
527  }
528 
530  __device__ __host__ void post(ReduceType &sum)
531  {
532  sum.x += aux.x;
533  sum.y += aux.y;
534  sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
535  }
536 
537  static int streams() { return 3; }
538  static int flops() { return 5; }
539  };
540 
547  template <typename ReduceType, typename Float2, typename FloatN>
548  struct tripleCGReduction_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
549  tripleCGReduction_(const Float2 &a, const Float2 &b) { ; }
550  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
551  {
552  typedef typename scalar<ReduceType>::type scalar;
553  norm2_<scalar>(sum.x, x);
554  norm2_<scalar>(sum.y, y);
555  dot_<scalar>(sum.z, y, z);
556  }
557  static int streams() { return 3; }
558  static int flops() { return 6; }
559  };
560 
568  template <typename ReduceType, typename Float2, typename FloatN>
569  struct quadrupleCGReduction_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
570  quadrupleCGReduction_(const Float2 &a, const Float2 &b) { ; }
571  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
572  {
573  typedef typename scalar<ReduceType>::type scalar;
574  norm2_<scalar>(sum.x, x);
575  norm2_<scalar>(sum.y, y);
576  dot_<scalar>(sum.z, y, z);
577  norm2_<scalar>(sum.w, w);
578  }
579  static int streams() { return 3; }
580  static int flops() { return 8; }
581  };
582 
591  template <typename ReduceType, typename Float2, typename FloatN>
592  struct quadrupleCG3InitNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
593  Float2 a;
594  quadrupleCG3InitNorm_(const Float2 &a, const Float2 &b) : a(a) { ; }
595  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
596  {
597  z = x;
598  w = y;
599  x += a.x * y;
600  y -= a.x * v;
601  norm2_<ReduceType>(sum, y);
602  }
603  static int streams() { return 6; }
604  static int flops() { return 6; }
605  };
606 
617  template <typename ReduceType, typename Float2, typename FloatN>
618  struct quadrupleCG3UpdateNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
619  Float2 a, b;
620  quadrupleCG3UpdateNorm_(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
621  FloatN tmpx {}, tmpy {};
622  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
623  {
624  tmpx = x;
625  tmpy = y;
626  x = b.x * (x + a.x * y) + b.y * z;
627  y = b.x * (y - a.x * v) + b.y * w;
628  z = tmpx;
629  w = tmpy;
630  norm2_<ReduceType>(sum, y);
631  }
632  static int streams() { return 7; }
633  static int flops() { return 16; }
634  };
635 
642  template <typename ReduceType, typename Float2, typename FloatN>
643  struct doubleCG3InitNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
644  Float2 a;
645  doubleCG3InitNorm_(const Float2 &a, const Float2 &b) : a(a) { ; }
646  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
647  {
648  y = x;
649  x -= a.x * z;
650  norm2_<ReduceType>(sum, x);
651  }
652  static int streams() { return 3; }
653  static int flops() { return 5; }
654  };
655 
663  template <typename ReduceType, typename Float2, typename FloatN>
664  struct doubleCG3UpdateNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
665  Float2 a, b;
666  doubleCG3UpdateNorm_(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
667  FloatN tmp {};
668  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
669  {
670  tmp = x;
671  x = b.x * (x - a.x * z) + b.y * y;
672  y = tmp;
673  norm2_<ReduceType>(sum, x);
674  }
675  static int streams() { return 4; }
676  static int flops() { return 9; }
677  };
678 
679  } // namespace blas
680 
681 } // namespace quda
AxpyReDot(const Float2 &a, const Float2 &b)
scalar< ReduceType >::type real
static int flops()
total number of input and output streams
__device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
caxpyNorm2(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
Definition: reduce_core.cuh:86
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
caxpydotzy(const Float2 &a, const Float2 &b)
static int streams()
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ ReduceType norm1_(const double2 &a)
Norm2(const Float2 &a, const Float2 &b)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
doubleCG3InitNorm_(const Float2 &a, const Float2 &b)
Cdot(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int streams()
static int flops()
total number of input and output streams
tripleCGReduction_(const Float2 &a, const Float2 &b)
CdotNormA(const Float2 &a, const Float2 &b)
__host__ __device__ void sum(double &a, double &b)
Definition: blas_helper.cuh:62
scalar< ReduceType >::type real
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
Norm1(const Float2 &a, const Float2 &b)
cabxpyzaxnorm(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__global__ void reduceKernel(Arg arg)
Definition: reduce_core.cuh:44
ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length)
Definition: reduce_core.cuh:27
static int flops()
total number of input and output streams
Dot(const Float2 &a, const Float2 &b)
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
Definition: reduce_core.cuh:94
static int streams()
doubleCG3UpdateNorm_(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:472
static int flops()
total number of input and output streams
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
axpbyzNorm2(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b)
quadrupleCG3InitNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int streams()
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
Definition: spinor_noise.cu:23
quadrupleCGReduction_(const Float2 &a, const Float2 &b)
caxpyxmaznormx(const Float2 &a, const Float2 &b)
axpyCGNorm2(const Float2 &a, const Float2 &b)
quadrupleCG3UpdateNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
QudaParity parity
Definition: covdev_test.cpp:54
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams