5 #if (__COMPUTE_CAPABILITY__ >= 130)
6 #define QudaSumFloat double
7 #define QudaSumFloat2 double2
8 #define QudaSumFloat3 double3
10 #define QudaSumFloat doublesingle
11 #define QudaSumFloat2 doublesingle2
12 #define QudaSumFloat3 doublesingle3
16 #define REDUCE_MAX_BLOCKS 65536
18 #define checkSpinor(a, b) \
20 if (a.Precision() != b.Precision()) \
21 errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
22 if (a.Length() != b.Length()) \
23 errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
24 if (a.Stride() != b.Stride()) \
25 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
37 static cudaEvent_t reduceEnd;
57 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
60 cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0);
67 memset(h_reduce, 0, bytes);
70 cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
87 cudaEventDestroy(reduceEnd);
100 template <
typename ReduceType,
typename Float2,
typename FloatN>
104 virtual __device__
void pre() { ; }
111 virtual __device__
void post(ReduceType &sum) { ; }
118 __device__
double norm2_(
const double2 &a) {
return a.x*a.x + a.y*a.y; }
119 __device__
float norm2_(
const float2 &a) {
return a.x*a.x + a.y*a.y; }
120 __device__
float norm2_(
const float4 &a) {
return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }
122 template <
typename ReduceType,
typename Float2,
typename FloatN>
123 #if (__COMPUTE_CAPABILITY__ >= 200)
124 struct Norm2 :
public ReduceFunctor<ReduceType, Float2, FloatN> {
128 Norm2(
const Float2 &a,
const Float2 &b) { ; }
136 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0,0,0,false>
137 (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
143 __device__
double dot_(
const double2 &a,
const double2 &b) {
return a.x*b.x + a.y*b.y; }
144 __device__
float dot_(
const float2 &a,
const float2 &b) {
return a.x*b.x + a.y*b.y; }
145 __device__
float dot_(
const float4 &a,
const float4 &b) {
return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }
147 template <
typename ReduceType,
typename Float2,
typename FloatN>
148 #if (__COMPUTE_CAPABILITY__ >= 200)
149 struct Dot :
public ReduceFunctor<ReduceType, Float2, FloatN> {
153 Dot(
const Float2 &a,
const Float2 &b) { ; }
160 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0,0,0,false>
161 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
168 template <
typename ReduceType,
typename Float2,
typename FloatN>
169 #if (__COMPUTE_CAPABILITY__ >= 200)
170 struct axpyNorm2 :
public ReduceFunctor<ReduceType, Float2, FloatN> {
175 axpyNorm2(
const Float2 &a,
const Float2 &b) : a(a) { ; }
177 y += a.x*
x; sum +=
norm2_(y); }
183 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
184 (make_double2(a, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
191 template <
typename ReduceType,
typename Float2,
typename FloatN>
192 #if (__COMPUTE_CAPABILITY__ >= 200)
193 struct xmyNorm2 :
public ReduceFunctor<ReduceType, Float2, FloatN> {
199 y = x - y; sum +=
norm2_(y); }
205 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
206 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
214 __device__
void Caxpy_(
const float2 &a,
const float4 &
x, float4 &y) {
215 y.x += a.x*x.x; y.x -= a.y*x.y;
216 y.y += a.y*x.x; y.y += a.x*x.y;
217 y.z += a.x*x.z; y.z -= a.y*x.w;
218 y.w += a.y*x.z; y.w += a.x*x.w;
221 __device__
void Caxpy_(
const float2 &a,
const float2 &
x, float2 &y) {
222 y.x += a.x*x.x; y.x -= a.y*x.y;
223 y.y += a.y*x.x; y.y += a.x*x.y;
226 __device__
void Caxpy_(
const double2 &a,
const double2 &
x, double2 &y) {
227 y.x += a.x*x.x; y.x -= a.y*x.y;
228 y.y += a.y*x.x; y.y += a.x*x.y;
235 template <
typename ReduceType,
typename Float2,
typename FloatN>
236 #if (__COMPUTE_CAPABILITY__ >= 200)
237 struct caxpyNorm2 :
public ReduceFunctor<ReduceType, Float2, FloatN> {
250 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
251 (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
261 template <
typename ReduceType,
typename Float2,
typename FloatN>
262 #if (__COMPUTE_CAPABILITY__ >= 200)
263 struct caxpyxmaznormx :
public ReduceFunctor<ReduceType, Float2, FloatN> {
276 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
277 (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0),
x, y, z,
x,
x);
287 template <
typename ReduceType,
typename Float2,
typename FloatN>
288 #if (__COMPUTE_CAPABILITY__ >= 200)
289 struct cabxpyaxnorm :
public ReduceFunctor<ReduceType, Float2, FloatN> {
303 return reduce::reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
304 (make_double2(a, 0.0), make_double2(b.real(), b.imag()), x, y, x, x, x);
310 __device__ double2
cdot_(
const double2 &a,
const double2 &b)
311 {
return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
312 __device__ double2
cdot_(
const float2 &a,
const float2 &b)
313 {
return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
314 __device__ double2
cdot_(
const float4 &a,
const float4 &b)
315 {
return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }
317 template <
typename ReduceType,
typename Float2,
typename FloatN>
318 #if (__COMPUTE_CAPABILITY__ >= 200)
319 struct Cdot :
public ReduceFunctor<ReduceType, Float2, FloatN> {
323 Cdot(
const Float2 &a,
const Float2 &b) { ; }
330 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0,0,0,false>
331 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
332 return Complex(cdot.x, cdot.y);
341 template <
typename ReduceType,
typename Float2,
typename FloatN>
342 #if (__COMPUTE_CAPABILITY__ >= 200)
343 struct xpaycdotzy :
public ReduceFunctor<ReduceType, Float2, FloatN> {
355 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0,0,0,false>
356 (make_double2(a, 0.0), make_double2(0.0, 0.0),
x, y, z,
x,
x);
357 return Complex(cdot.x, cdot.y);
366 template <
typename ReduceType,
typename Float2,
typename FloatN>
367 #if (__COMPUTE_CAPABILITY__ >= 200)
368 struct caxpydotzy :
public ReduceFunctor<ReduceType, Float2, FloatN> {
381 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0,0,0,false>
382 (make_double2(a.real(), a.imag()), make_double2(0.0, 0.0),
x, y, z,
x,
x);
383 return Complex(cdot.x, cdot.y);
390 __device__ double3
cdotNormA_(
const double2 &a,
const double2 &b)
391 {
return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
392 __device__ double3
cdotNormA_(
const float2 &a,
const float2 &b)
393 {
return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
394 __device__ double3
cdotNormA_(
const float4 &a,
const float4 &b)
395 {
return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
396 a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
397 a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
399 template <
typename ReduceType,
typename Float2,
typename FloatN>
400 #if (__COMPUTE_CAPABILITY__ >= 200)
401 struct CdotNormA :
public ReduceFunctor<ReduceType, Float2, FloatN> {
412 return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0,0,0,false>
413 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
420 __device__ double3
cdotNormB_(
const double2 &a,
const double2 &b)
421 {
return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
422 __device__ double3
cdotNormB_(
const float2 &a,
const float2 &b)
423 {
return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
424 __device__ double3
cdotNormB_(
const float4 &a,
const float4 &b)
425 {
return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
426 b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }
428 template <
typename ReduceType,
typename Float2,
typename FloatN>
429 #if (__COMPUTE_CAPABILITY__ >= 200)
430 struct CdotNormB :
public ReduceFunctor<ReduceType, Float2, FloatN> {
441 return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0,0,0,false>
442 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
449 template <
typename ReduceType,
typename Float2,
typename FloatN>
450 #if (__COMPUTE_CAPABILITY__ >= 200)
451 struct caxpbypzYmbwcDotProductUYNormY :
public ReduceFunctor<ReduceType, Float2, FloatN> {
458 __device__
void operator()(ReduceType &sum,
FloatN &
x,
FloatN &y,
FloatN &z,
FloatN &w,
FloatN &v) {
Caxpy_(a, x, z);
Caxpy_(b, y, z);
Caxpy_(-b, w, y); sum +=
cdotNormB_(v,y); }
467 return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1,0,0,false>
468 (make_double2(a.real(), a.imag()), make_double2(b.real(), b.imag()), x, y, z, w, u);
478 template <
typename ReduceType,
typename Float2,
typename FloatN>
479 #if (__COMPUTE_CAPABILITY__ >= 200)
480 struct axpyCGNorm2 :
public ReduceFunctor<ReduceType, Float2, FloatN> {
489 sum.y +=
dot_(y_new, y_new-y);
497 double2 cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,QudaSumFloat,axpyCGNorm2,0,1,0,0,0,false>
498 (make_double2(a, 0.0), make_double2(0.0, 0.0),
x, y,
x,
x,
x);
499 return Complex(cg_norm.x, cg_norm.y);
502 #if (__COMPUTE_CAPABILITY__ >= 200)
516 template <
typename ReduceType,
typename Float2,
typename FloatN>
523 __device__
void pre() { aux.x = 0; aux.y = 0; }
528 __device__
void post(ReduceType &sum)
530 sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
533 static int streams() {
return 2; }
534 static int flops() {
return 4; }
538 double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,HeavyQuarkResidualNorm,0,0,0,0,0,true>
539 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, r, r, r, r);
555 template <
typename ReduceType,
typename Float2,
typename FloatN>
556 struct xpyHeavyQuarkResidualNorm :
public ReduceFunctor<ReduceType, Float2, FloatN> {
560 xpyHeavyQuarkResidualNorm(
const Float2 &a,
const Float2 &b) : a(a), b(b) { ; }
562 __device__
void pre() { aux.x = 0; aux.y = 0; }
568 __device__
void post(ReduceType &sum)
570 sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : 1.0;
573 static int streams() {
return 3; }
574 static int flops() {
return 5; }
578 cudaColorSpinorField &r) {
579 double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,xpyHeavyQuarkResidualNorm,0,0,0,0,0,true>
580 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y, r, r, r);
592 errorQuda(
"Not supported on pre-Fermi architectures");
593 return make_double3(0.0,0.0,0.0);
598 errorQuda(
"Not supported on pre-Fermi architectures");
599 return make_double3(0.0,0.0,0.0);
612 template <
typename ReduceType,
typename Float2,
typename FloatN>
613 #if (__COMPUTE_CAPABILITY__ >= 200)
614 struct tripleCGReduction :
public ReduceFunctor<ReduceType, Float2, FloatN> {
626 return reduce::reduceCuda<double3,QudaSumFloat3,QudaSumFloat,tripleCGReduction,0,0,0,0,0,false>
627 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, y, z,
x,
x);