#if (__COMPUTE_CAPABILITY__ >= 130)
#define QudaSumFloat double
#define QudaSumFloat2 double2
#define QudaSumFloat3 double3
#else
#define QudaSumFloat doublesingle
#define QudaSumFloat2 doublesingle2
#define QudaSumFloat3 doublesingle3
#endif
#define REDUCE_MAX_BLOCKS 65536
#define checkSpinor(a, b)                                               \
  {                                                                     \
    if (a.Precision() != b.Precision())                                 \
      errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
    if (a.Length() != b.Length())                                       \
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    if (a.Stride() != b.Stride())                                       \
      errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
  }
#define checkLength(a, b)                                               \
  {                                                                     \
    if (a.Length() != b.Length())                                       \
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    if (a.Stride() != b.Stride())                                       \
      errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
  }
static cudaEvent_t reduceEnd;

const int MaxReduce = 12;
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
  // zero-copy is only usable in a 64-bit address space: alias the
  // page-locked host buffer h_reduce into the device address space
  cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0);
#endif

  memset(h_reduce, 0, bytes); // ensure the buffer holds zeros before first use

  // event used to order host reads after reduction-kernel completion; timing
  // is disabled since the event is never used for measurement
  cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);

  cudaEventDestroy(reduceEnd);
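// Design note: h_reduce is page-locked and device-mapped (hd_reduce is its
// device-side alias), so the last stage of a reduction kernel can write its
// partial sums straight to host memory, and the host finishes the final few
// additions after waiting on reduceEnd. A hedged sketch of that host-side
// tail, with hypothetical names (reduceKernel, nBlocks, blockSize, stream):
//
//   reduceKernel<<<nBlocks, blockSize, 0, stream>>>(arg); // writes hd_reduce
//   cudaEventRecord(reduceEnd, stream);
//   while (cudaEventQuery(reduceEnd) == cudaErrorNotReady) { ; } // spin-wait
//   double sum = 0.0;
//   for (int i = 0; i < nBlocks; i++) sum += ((QudaSumFloat*)h_reduce)[i];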
template <typename ReduceType, typename Float2, typename FloatN>
struct ReduceFunctor {

  //! pre-computation routine called before the "M-loop"
  virtual __device__ void pre() { ; }

  //! where the reduction is usually computed and any auxiliary operations
  virtual __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                                     FloatN &z, FloatN &w, FloatN &v) = 0;

  //! post-computation routine called after the "M-loop"
  virtual __device__ void post(ReduceType &sum) { ; }
};
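// How reduceCuda drives a ReduceFunctor, as a hedged sketch (Reducer, load,
// save, and zero are hypothetical stand-ins; the real kernel also handles
// strides, textures, and the inter-block tree reduction; M is the number of
// FloatN chunks making up one site):
//
//   Reducer r(a, b);
//   ReduceType sum;
//   zero(sum);
//   for (unsigned i = tid; i < length; i += gridSize) {
//     FloatN x[M], y[M], z[M], w[M], v[M];
//     load(x, y, z, w, v, i);
//     r.pre();                                  // reset per-site state
//     for (int j = 0; j < M; j++) r(sum, x[j], y[j], z[j], w[j], v[j]);
//     r.post(sum);                              // fold per-site state into sum
//     save(x, y, z, w, v, i);                   // write back modified operands
//   }
//   // ...shared-memory and inter-block reduction of sum follows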
__device__ double norm2_(const double2 &a) { return a.x*a.x + a.y*a.y; }
__device__ float norm2_(const float2 &a) { return a.x*a.x + a.y*a.y; }
__device__ float norm2_(const float4 &a) { return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }
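// A float4 packs two complex numbers (two re/im pairs), so norm2_ is just the
// squared 2-norm of the packed real components, e.g. for (3, 4, 0, 5) it
// returns 9 + 16 + 0 + 25 = 50. The per-site squares are formed in the
// field's own precision; double (or double-single) accuracy comes from the
// QudaSumFloat accumulator that these values are summed into.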
// pre-Fermi devices cannot use virtual device functions, so the common base
// class is only inherited when __COMPUTE_CAPABILITY__ >= 200
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct Norm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct Norm2 {
#endif
  Norm2(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += norm2_(x); }
};
double normCuda(const cudaColorSpinorField &x) {
  cudaColorSpinorField &y = (cudaColorSpinorField&)x; // reduceCuda expects non-const fields
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
}
__device__ double dot_(const double2 &a, const double2 &b) { return a.x*b.x + a.y*b.y; }
__device__ float dot_(const float2 &a, const float2 &b) { return a.x*b.x + a.y*b.y; }
__device__ float dot_(const float4 &a, const float4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct Dot : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct Dot {
#endif
  Dot(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += dot_(x,y); }
};
double reDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
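// Only x and y carry data for Dot; the remaining operand slots are padded
// with x so every functor can share the common five-field reduceCuda
// interface. Hedged usage sketch (hypothetical fields p and q):
//
//   double r = reDotProductCuda(p, q);  // r = Re(p, q) summed over the lattice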
void reDotProductCuda(double *result, std::vector<cudaColorSpinorField*> &x,
                      std::vector<cudaColorSpinorField*> &y) {
  switch (x.size()) {
  case 1:
    reduce::multiReduceCuda<1,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 2:
    reduce::multiReduceCuda<2,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 3:
    reduce::multiReduceCuda<3,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 4:
    reduce::multiReduceCuda<4,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 5:
    reduce::multiReduceCuda<5,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 6:
    reduce::multiReduceCuda<6,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 7:
    reduce::multiReduceCuda<7,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 8:
    reduce::multiReduceCuda<8,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 9:
    reduce::multiReduceCuda<9,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 10:
    reduce::multiReduceCuda<10,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 11:
    reduce::multiReduceCuda<11,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 12:
    reduce::multiReduceCuda<12,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 13:
    reduce::multiReduceCuda<13,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 14:
    reduce::multiReduceCuda<14,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 15:
    reduce::multiReduceCuda<15,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 16:
    reduce::multiReduceCuda<16,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 17:
    reduce::multiReduceCuda<17,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 18:
    reduce::multiReduceCuda<18,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 19:
    reduce::multiReduceCuda<19,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 20:
    reduce::multiReduceCuda<20,double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  default:
    errorQuda("Unsupported vector size");
    break;
  }
}
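// The case fan-out exists because the number of field pairs is a compile-time
// template parameter of multiReduceCuda, letting the kernel unroll across the
// whole vector of fields; the switch simply maps the runtime size onto an
// instantiation. Hedged usage sketch (hypothetical fields):
//
//   std::vector<cudaColorSpinorField*> p(4), q(4);  // filled in elsewhere
//   double res[4];
//   reDotProductCuda(res, p, q);  // res[i] = Re(p[i], q[i]), one kernel pass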
__device__ double2 dotNormA_(const double2 &a, const double2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*a.x + a.y*a.y); }

__device__ double2 dotNormA_(const float2 &a, const float2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*a.x + a.y*a.y); }

__device__ double2 dotNormA_(const float4 &a, const float4 &b)
{ return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
                      a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct DotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct DotNormA {
#endif
  DotNormA(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += dotNormA_(x,y); }
};

double2 reDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,DotNormA,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct axpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct axpyNorm2 {
#endif
  Float2 a;
  axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { y += a.x*x; sum += norm2_(y); }
};

double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
    (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
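// Reading the reduceCuda template arguments: after the result type and the
// two accumulator types come the functor, then five write flags, one per
// operand slot (x, y, z, w, v), and a final site-unroll flag. axpyNorm2
// overwrites y (y += a*x), hence <...,axpyNorm2,0,1,0,0,0,false>, while the
// pure reductions above leave every flag at 0. This reading is inferred from
// the call sites in this file rather than stated by them.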
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct xmyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct xmyNorm2 {
#endif
  xmyNorm2(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { y = x - y; sum += norm2_(y); }
};

double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
__device__ void Caxpy_(const float2 &a, const float4 &x, float4 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
  y.z += a.x*x.z; y.z -= a.y*x.w;
  y.w += a.y*x.z; y.w += a.x*x.w;
}

__device__ void Caxpy_(const float2 &a, const float2 &x, float2 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
}

__device__ void Caxpy_(const double2 &a, const double2 &x, double2 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
}
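// Caxpy_ is the component form of y += a*x for complex a and complex elements
// stored as (re, im) pairs. Expanding (a.x + i a.y)(x.x + i x.y):
//
//   re: y.x += a.x*x.x - a.y*x.y
//   im: y.y += a.y*x.x + a.x*x.y
//
// which is exactly the pairs of updates above; the float4 overload applies
// the same rotation to the second packed complex number held in (z, w).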
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct caxpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct caxpyNorm2 {
#endif
  Float2 a;
  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { Caxpy_(a, x, y); sum += norm2_(y); }
};

double caxpyNormCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
    (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct caxpyxmaznormx : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct caxpyxmaznormx {
#endif
  Float2 a;
  caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { Caxpy_(a, x, y); x -= a.x*z; sum += norm2_(x); }
};

double caxpyXmazNormXCuda(const Complex &a, cudaColorSpinorField &x,
                          cudaColorSpinorField &y, cudaColorSpinorField &z) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
    (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct cabxpyaxnorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct cabxpyaxnorm {
#endif
  Float2 a;
  Float2 b;
  cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { x *= a.x; Caxpy_(b, x, y); sum += norm2_(y); }
};

double cabxpyAxNormCuda(const double &a, const Complex &b,
                        cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
    (make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, x, x, x);
}
__device__ double2 cdot_(const double2 &a, const double2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }

__device__ double2 cdot_(const float2 &a, const float2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }

__device__ double2 cdot_(const float4 &a, const float4 &b)
{ return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
                      a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }
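// cdot_ returns the conjugated inner product conj(a)*b as a (re, im) pair:
//
//   conj(a)*b = (a.x - i a.y)(b.x + i b.y)
//             = (a.x*b.x + a.y*b.y) + i (a.x*b.y - a.y*b.x)
//
// matching the two components built above. For the float overloads the
// per-site products are formed in single precision and only then widened to
// double2; extended-precision accumulation happens in the QudaSumFloat2 sum.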
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct Cdot : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct Cdot {
#endif
  Cdot(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += cdot_(x,y); }
};

Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
  return Complex(cdot.x, cdot.y);
}
void cDotProductCuda(Complex *result, std::vector<cudaColorSpinorField*> &x,
                     std::vector<cudaColorSpinorField*> &y) {
  double2 *cdot = new double2[x.size()];

  switch (x.size()) {
  case 1:
    reduce::multiReduceCuda<1,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 6:
    reduce::multiReduceCuda<6,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 10:
    reduce::multiReduceCuda<10,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 14:
    reduce::multiReduceCuda<14,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 18:
    reduce::multiReduceCuda<18,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  case 22:
    reduce::multiReduceCuda<22,double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
      (cdot, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
    break;
  default:
    errorQuda("Unsupported vector size");
    break;
  }

  for (int i=0; i<x.size(); ++i) result[i] = Complex(cdot[i].x, cdot[i].y);
  delete[] cdot;
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct xpaycdotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct xpaycdotzy {
#endif
  Float2 a;
  xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { y = x + a.x*y; sum += cdot_(z,y); }
};

Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a,
                       cudaColorSpinorField &y, cudaColorSpinorField &z) {
  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0,0,0,false>
    (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
  return Complex(cdot.x, cdot.y);
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct caxpydotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct caxpydotzy {
#endif
  Float2 a;
  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { Caxpy_(a, x, y); sum += cdot_(z,y); }
};

Complex caxpyDotzyCuda(const Complex &a, cudaColorSpinorField &x,
                       cudaColorSpinorField &y, cudaColorSpinorField &z) {
  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0,0,0,false>
    (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
  return Complex(cdot.x, cdot.y);
}
__device__ double3 cdotNormA_(const double2 &a, const double2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }

__device__ double3 cdotNormA_(const float2 &a, const float2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }

__device__ double3 cdotNormA_(const float4 &a, const float4 &b)
{ return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
                      a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
                      a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct CdotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct CdotNormA {
#endif
  CdotNormA(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += cdotNormA_(x,y); }
};

double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
__device__ double3 cdotNormB_(const double2 &a, const double2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }

__device__ double3 cdotNormB_(const float2 &a, const float2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }

__device__ double3 cdotNormB_(const float4 &a, const float4 &b)
{ return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
                      a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
                      b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct CdotNormB : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct CdotNormB {
#endif
  CdotNormB(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { sum += cdotNormB_(x,y); }
};

double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct caxpbypzYmbwcDotProductUYNormY : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct caxpbypzYmbwcDotProductUYNormY {
#endif
  Float2 a;
  Float2 b;
  caxpbypzYmbwcDotProductUYNormY(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) {
    Caxpy_(a, x, z); Caxpy_(b, y, z); Caxpy_(-b, w, y); sum += cdotNormB_(v,y); }
};

double3 caxpbypzYmbwcDotProductUYNormYCuda(const Complex &a, cudaColorSpinorField &x,
                                           const Complex &b, cudaColorSpinorField &y,
                                           cudaColorSpinorField &z, cudaColorSpinorField &w,
                                           cudaColorSpinorField &u) {
  if (x.Precision() != z.Precision()) {
    return reduce::mixed::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
      (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
  } else {
    return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
      (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
  }
}
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct axpyCGNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct axpyCGNorm2 {
#endif
  Float2 a;
  axpyCGNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) {
    FloatN y_new = y + a.x*x;
    sum.x += norm2_(y_new);
    sum.y += dot_(y_new, y_new-y);
    y = y_new;
  }
};

Complex axpyCGNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  double2 cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,axpyCGNorm2,0,1,0,0,0,false>
    (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
  return Complex(cg_norm.x, cg_norm.y);
}
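// axpyCGNorm2 fuses the solver update y += a*x with the two scalars returned
// to the caller: |y_new|^2 in the real part and Re(y_new, y_new - y) in the
// imaginary part of the returned Complex. Fusing them means x and y are read
// once rather than three times over, and the second quantity gives the CG
// solver a direct measure of how much the residual changed this iteration.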
#if (__COMPUTE_CAPABILITY__ >= 200)
template <typename ReduceType, typename Float2, typename FloatN>
struct HeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
  Float2 a;
  Float2 b;
  ReduceType aux;
  HeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }

  __device__ void pre() { aux.x = 0; aux.y = 0; }

  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) { aux.x += norm2_(x); aux.y += norm2_(y); }

  // fold this site's norms into the sum, plus the per-site residual ratio
  __device__ void post(ReduceType &sum) {
    sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
  }

  static int streams() { return 2; } //! total number of input and output streams
  static int flops() { return 4; }
};

double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r) {
  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,HeavyQuarkResidualNorm,0,0,0,0,0,true>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
  rtn.z /= (x.Volume()*comm_size()); // average the per-site ratio over the global volume
  return rtn;
}
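// The heavy-quark residual is the volume average of the per-site ratio
//
//   h = (1/V) * sum_x |r(x)|^2 / |psi(x)|^2
//
// pre() zeroes the per-site accumulators, the M-loop gathers |psi(x)|^2 and
// |r(x)|^2 into aux, and post() adds their ratio to the third component of
// the sum. The final template flag (true only for these heavy-quark
// reductions) plausibly forces whole-site processing within a single thread,
// which the per-site ratio requires to be well defined.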
template <typename ReduceType, typename Float2, typename FloatN>
struct xpyHeavyQuarkResidualNorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
  Float2 a;
  Float2 b;
  ReduceType aux;
  xpyHeavyQuarkResidualNorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }

  __device__ void pre() { aux.x = 0; aux.y = 0; }

  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) {
    x += y; aux.x += norm2_(x); aux.y += norm2_(z); }

  __device__ void post(ReduceType &sum) {
    sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
  }

  static int streams() { return 3; } //! total number of input and output streams
  static int flops() { return 5; }
};

double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
                                      cudaColorSpinorField &r) {
  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,xpyHeavyQuarkResidualNorm,0,0,0,0,0,true>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
  rtn.z /= (x.Volume()*comm_size()); // average the per-site ratio over the global volume
  return rtn;
}
#else

double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r) {
  errorQuda("Not supported on pre-Fermi architectures");
  return make_double3(0.0,0.0,0.0);
}

double3 xpyHeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
                                      cudaColorSpinorField &r) {
  errorQuda("Not supported on pre-Fermi architectures");
  return make_double3(0.0,0.0,0.0);
}

#endif
template <typename ReduceType, typename Float2, typename FloatN>
#if (__COMPUTE_CAPABILITY__ >= 200)
struct tripleCGReduction : public ReduceFunctor<ReduceType, Float2, FloatN> {
#else
struct tripleCGReduction {
#endif
  tripleCGReduction(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,
                             FloatN &w, FloatN &v) {
    sum.x += norm2_(x); sum.y += norm2_(y); sum.z += dot_(y,z); }
};

double3 tripleCGReductionCuda(cudaColorSpinorField &x, cudaColorSpinorField &y,
                              cudaColorSpinorField &z) {
  return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,tripleCGReduction,0,0,0,0,0,false>
    (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
}
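// tripleCGReductionCuda returns three scalars from a single pass over the
// fields, packed into a double3: |x|^2, |y|^2, and Re(y, z). Compared with
// calling normCuda twice and reDotProductCuda once, the fused kernel reads
// each spinor field only once, which matters because these reductions are
// bandwidth bound.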