/**
   Primary template mapping a scalar type to its two-component vector
   counterpart.  Deliberately empty: only explicit specializations
   (presumably defined elsewhere in the file — not visible here) are usable.
*/
template <typename> struct Vec2Type {
};
24 #define QudaSumFloat doubledouble 25 #define QudaSumFloat2 doubledouble2 26 #define QudaSumFloat3 doubledouble3 33 #define QudaSumFloat double 34 #define QudaSumFloat2 double2 35 #define QudaSumFloat3 double3 36 #define QudaSumFloat4 double4 41 if (
a.Precision() !=
b.Precision())
42 errorQuda(
"precisions do not match: %d %d",
a.Precision(),
b.Precision());
43 if (
a.Length() !=
b.Length())
44 errorQuda(
"lengths do not match: %lu %lu",
a.Length(),
b.Length());
45 if (
a.Stride() !=
b.Stride())
46 errorQuda(
"strides do not match: %d %d",
a.Stride(),
b.Stride());
50 if (
a.Length() !=
b.Length())
51 errorQuda(
"lengths do not match: %lu %lu",
a.Length(),
b.Length());
52 if (
a.Stride() !=
b.Stride())
53 errorQuda(
"strides do not match: %d %d",
a.Stride(),
b.Stride());
98 const int max_reduce_blocks = 2*
deviceProp.multiProcessorCount;
100 const int max_reduce = 2 * max_reduce_blocks * 4 *
sizeof(
QudaSumFloat);
103 const int max_generic_blocks = 65336;
107 size_t bytes = std::max(std::max(max_reduce, max_multi_reduce), max_generic_reduce);
116 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) 129 cudaEventCreateWithFlags(&
reduceEnd, cudaEventDisableTiming);
/**
   Base class from which all reduction functors should derive.
   Fix: the pure-virtual operator() carried a duplicated __host__
   qualifier (`__device__ __host__ __host__`); the redundant token is
   removed so all three members use the same qualifier set.
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct ReduceFunctor {

  //! pre-computation routine called before the "M-loop"
  virtual __device__ __host__ void pre() { ; }

  //! where the reduction is usually computed and any auxiliary operations
  virtual __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                                              FloatN &z, FloatN &w, FloatN &v) = 0;

  //! post-computation routine called after the "M-loop"
  virtual __device__ __host__ void post(ReduceType &sum) { ; }
};
/**
   L1-norm contribution of one double2 element: |a.x| + |a.y|,
   accumulated in the reduction precision ReduceType.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const double2 &a)
{
  return static_cast<ReduceType>(fabs(a.x)) + static_cast<ReduceType>(fabs(a.y));
}
/**
   L1-norm contribution of one float2 element: |a.x| + |a.y|,
   accumulated in the reduction precision ReduceType.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const float2 &a)
{
  return static_cast<ReduceType>(fabs(a.x)) + static_cast<ReduceType>(fabs(a.y));
}
/**
   L1-norm contribution of one float4 element (all four components),
   accumulated in the reduction precision ReduceType.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const float4 &a)
{
  return static_cast<ReduceType>(fabs(a.x)) + static_cast<ReduceType>(fabs(a.y))
       + static_cast<ReduceType>(fabs(a.z)) + static_cast<ReduceType>(fabs(a.w));
}
191 template <
typename ReduceType,
typename Float2,
typename FloatN>
194 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z,FloatN &
w, FloatN &v)
195 {
sum += norm1_<ReduceType>(
x); }
203 return reduce::reduceCuda<double,QudaSumFloat,Norm1,0,0,0,0,0,false>
204 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
y,
y,
y,
y,
y);
206 errorQuda(
"L1 norm kernel only built when HOST_DEBUG is enabled");
/**
   Accumulate the squared L2-norm contribution of a double2 element
   into sum: sum += a.x^2 + a.y^2, with each product formed in
   ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
{
  sum += (ReduceType)a.x * (ReduceType)a.x; // real-part square
  sum += (ReduceType)a.y * (ReduceType)a.y; // imaginary-part square
}
/**
   Accumulate the squared L2-norm contribution of a float2 element
   into sum: sum += a.x^2 + a.y^2, with each product formed in
   ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const float2 &a)
{
  sum += (ReduceType)a.x * (ReduceType)a.x; // real-part square
  sum += (ReduceType)a.y * (ReduceType)a.y; // imaginary-part square
}
/**
   Accumulate the squared L2-norm contribution of a float4 element
   (all four components) into sum, with each product formed in
   ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const float4 &a)
{
  sum += (ReduceType)a.x * (ReduceType)a.x;
  sum += (ReduceType)a.y * (ReduceType)a.y;
  sum += (ReduceType)a.z * (ReduceType)a.z;
  sum += (ReduceType)a.w * (ReduceType)a.w;
}
232 template <
typename ReduceType,
typename Float2,
typename FloatN>
235 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z,FloatN &
w, FloatN &v)
236 { norm2_<ReduceType>(
sum,
x); }
243 return reduce::reduceCuda<double,QudaSumFloat,Norm2,0,0,0,0,0,false>
244 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
y,
y,
y,
y,
y);
/**
   Accumulate the real dot-product contribution of two double2
   elements into sum: sum += a.x*b.x + a.y*b.y, with each product
   formed in ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
{
  sum += (ReduceType)a.x * (ReduceType)b.x;
  sum += (ReduceType)a.y * (ReduceType)b.y;
}
/**
   Accumulate the real dot-product contribution of two float2
   elements into sum: sum += a.x*b.x + a.y*b.y, with each product
   formed in ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
{
  sum += (ReduceType)a.x * (ReduceType)b.x;
  sum += (ReduceType)a.y * (ReduceType)b.y;
}
/**
   Accumulate the real dot-product contribution of two float4
   elements (all four components) into sum, with each product formed
   in ReduceType precision.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
{
  sum += (ReduceType)a.x * (ReduceType)b.x;
  sum += (ReduceType)a.y * (ReduceType)b.y;
  sum += (ReduceType)a.z * (ReduceType)b.z;
  sum += (ReduceType)a.w * (ReduceType)b.w;
}
268 template <
typename ReduceType,
typename Float2,
typename FloatN>
270 Dot(
const Float2 &
a,
const Float2 &
b) { ; }
271 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
272 { dot_<ReduceType>(
sum,
x,
y); }
278 return reduce::reduceCuda<double,QudaSumFloat,Dot,0,0,0,0,0,false>
279 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
287 template<
typename ReduceType,
typename InputType>
288 __device__ __host__ ReduceType
dotNormA_(
const InputType &
a,
const InputType &
b) {
291 dot_<scalar>(
c.x,
a,
b);
292 norm2_<scalar>(
c.y,
a);
296 template <
typename ReduceType,
typename Float2,
typename FloatN>
299 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
300 {
sum += dotNormA_<ReduceType,FloatN>(
x,
y);}
306 return reduce::reduceCuda<double2,QudaSumFloat2,DotNormA,0,0,0,0,0,false>
307 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
315 template <
typename ReduceType,
typename Float2,
typename FloatN>
319 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
320 y +=
a.x*
x; norm2_<ReduceType>(
sum,
y); }
326 return reduce::reduceCuda<double,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
327 (make_double2(
a, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
335 template <
typename ReduceType,
typename Float2,
typename FloatN>
339 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
340 y +=
a.x*
x; dot_<ReduceType>(
sum,
x,
y); }
346 return reduce::reduceCuda<double,QudaSumFloat,AxpyReDot,0,1,0,0,0,false>
347 (make_double2(
a, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
355 template <
typename ReduceType,
typename Float2,
typename FloatN>
358 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
359 y =
x -
y; norm2_<ReduceType>(
sum,
y); }
365 return reduce::reduceCuda<double,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
366 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
/**
   Complex axpy helper on double2 operands: y += a * x, where each
   double2 packs one complex number as (real, imag).  Statement order
   is significant for floating-point rounding and is preserved.
*/
__device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
{
  // real part: += Re(a)Re(x) - Im(a)Im(x)
  y.x += a.x * x.x;
  y.x -= a.y * x.y;
  // imaginary part: += Im(a)Re(x) + Re(a)Im(x)
  y.y += a.y * x.x;
  y.y += a.x * x.y;
}
/**
   Complex axpy helper on float2 operands: y += a * x, where each
   float2 packs one complex number as (real, imag).  Statement order
   is significant for floating-point rounding and is preserved.
*/
__device__ __host__ void Caxpy_(const float2 &a, const float2 &x, float2 &y)
{
  // real part: += Re(a)Re(x) - Im(a)Im(x)
  y.x += a.x * x.x;
  y.x -= a.y * x.y;
  // imaginary part: += Im(a)Re(x) + Re(a)Im(x)
  y.y += a.y * x.x;
  y.y += a.x * x.y;
}
/**
   Complex axpy helper with a float2 scalar and float4 vector data:
   y += a * x, where the float4 packs two complex numbers,
   (x,y) and (z,w).  Statement order is significant for
   floating-point rounding and is preserved.
*/
__device__ __host__ void Caxpy_(const float2 &a, const float4 &x, float4 &y)
{
  // first complex component (x,y)
  y.x += a.x * x.x;
  y.x -= a.y * x.y;
  y.y += a.y * x.x;
  y.y += a.x * x.y;
  // second complex component (z,w)
  y.z += a.x * x.z;
  y.z -= a.y * x.w;
  y.w += a.y * x.z;
  y.w += a.x * x.w;
}
392 template <
typename ReduceType,
typename Float2,
typename FloatN>
396 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
403 return reduce::reduceCuda<double,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
404 (make_double2(
REAL(
a),
IMAG(
a)), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
414 template <
typename ReduceType,
typename Float2,
typename FloatN>
418 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
426 return reduce::reduceCuda<double,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
427 (make_double2(
REAL(
a),
IMAG(
a)), make_double2(0.0, 0.0),
x,
y,
z,
x,
x);
437 template <
typename ReduceType,
typename Float2,
typename FloatN>
442 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
451 return reduce::reduceCuda<double,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
459 template<
typename ReduceType>
460 __device__ __host__
void cdot_(ReduceType &
sum,
const double2 &
a,
const double2 &
b) {
468 template<
typename ReduceType>
469 __device__ __host__
void cdot_(ReduceType &
sum,
const float2 &
a,
const float2 &
b) {
477 template<
typename ReduceType>
478 __device__ __host__
void cdot_(ReduceType &
sum,
const float4 &
a,
const float4 &
b) {
490 template <
typename ReduceType,
typename Float2,
typename FloatN>
492 Cdot(
const Float2 &
a,
const Float2 &
b) { ; }
493 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
494 { cdot_<ReduceType>(
sum,
x,
y); }
501 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,Cdot,0,0,0,0,0,false>
502 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
503 return Complex(cdot.x, cdot.y);
511 template <
typename ReduceType,
typename Float2,
typename FloatN>
515 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
516 {
y =
x +
a.x*
y; cdot_<ReduceType>(
sum,
z,
y); }
522 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,xpaycdotzy,0,1,0,0,0,false>
523 (make_double2(
a, 0.0), make_double2(0.0, 0.0),
x,
y,
z,
x,
x);
524 return Complex(cdot.x, cdot.y);
533 template <
typename ReduceType,
typename Float2,
typename FloatN>
537 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
545 double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,caxpydotzy,0,1,0,0,0,false>
546 (make_double2(
REAL(
a),
IMAG(
a)), make_double2(0.0, 0.0),
x,
y,
z,
x,
x);
547 return Complex(cdot.x, cdot.y);
555 template<
typename ReduceType,
typename InputType>
556 __device__ __host__
void cdotNormA_(ReduceType &
sum,
const InputType &
a,
const InputType &
b) {
559 cdot_<ReduceType>(
sum,
a,
b);
560 norm2_<scalar>(
sum.z,
a);
563 template <
typename ReduceType,
typename Float2,
typename FloatN>
566 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
567 { cdotNormA_<ReduceType>(
sum,
x,
y); }
573 return reduce::reduceCuda<double3,QudaSumFloat3,CdotNormA,0,0,0,0,0,false>
574 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
582 template<
typename ReduceType,
typename InputType>
583 __device__ __host__
void cdotNormB_(ReduceType &
sum,
const InputType &
a,
const InputType &
b) {
586 cdot_<ReduceType>(
sum,
a,
b);
587 norm2_<scalar>(
sum.z,
b);
590 template <
typename ReduceType,
typename Float2,
typename FloatN>
593 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v)
594 { cdotNormB_<ReduceType>(
sum,
x,
y); }
600 return reduce::reduceCuda<double3,QudaSumFloat3,CdotNormB,0,0,0,0,0,false>
601 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
x,
x);
609 template <
typename ReduceType,
typename Float2,
typename FloatN>
614 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
Caxpy_(
a,
x,
z);
Caxpy_(
b,
y,
z);
Caxpy_(-
b,
w,
y); cdotNormB_<ReduceType>(
sum,v,
y); }
623 if (
x.Precision() !=
z.Precision()) {
624 return reduce::mixed::reduceCuda<double3,QudaSumFloat3,caxpbypzYmbwcDotProductUYNormY_,0,1,1,0,0,false>
627 return reduce::reduceCuda<double3,QudaSumFloat3,caxpbypzYmbwcDotProductUYNormY_,0,1,1,0,0,false>
639 template <
typename ReduceType,
typename Float2,
typename FloatN>
643 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
645 FloatN z_new =
z +
a.x*
x;
646 norm2_<scalar>(
sum.x,z_new);
647 dot_<scalar>(
sum.y,z_new,z_new-
z);
657 if (
x.Precision() !=
y.Precision()) {
658 cg_norm = reduce::mixed::reduceCuda<double2,QudaSumFloat2,axpyCGNorm2,0,0,1,0,0,false>
659 (make_double2(
a, 0.0), make_double2(0.0, 0.0),
x,
x,
y,
x,
x);
661 cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,axpyCGNorm2,0,0,1,0,0,false>
662 (make_double2(
a, 0.0), make_double2(0.0, 0.0),
x,
x,
y,
x,
x);
664 return Complex(cg_norm.x, cg_norm.y);
679 template <
typename ReduceType,
typename Float2,
typename FloatN>
687 __device__ __host__
void pre() {
aux.x = 0;
aux.y = 0; }
689 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
690 norm2_<real>(
aux.x,
x); norm2_<real>(
aux.y,
y);
694 __device__ __host__
void post(ReduceType &
sum)
704 double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,HeavyQuarkResidualNorm_,0,0,0,0,0,true>
705 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x, r, r, r, r);
718 template <
typename ReduceType,
typename Float2,
typename FloatN>
726 __device__ __host__
void pre() {
aux.x = 0;
aux.y = 0; }
728 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
729 norm2_<real>(
aux.x,
x +
y); norm2_<real>(
aux.y,
z);
733 __device__ __host__
void post(ReduceType &
sum)
744 double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,xpyHeavyQuarkResidualNorm_,0,0,0,0,0,true>
745 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y, r, r, r);
756 template <
typename ReduceType,
typename Float2,
typename FloatN>
759 __device__ __host__
void operator()(ReduceType &
sum, FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w, FloatN &v) {
761 norm2_<scalar>(
sum.x,
x); norm2_<scalar>(
sum.y,
y); dot_<scalar>(
sum.z,
y,
z);
768 return reduce::reduceCuda<double3,QudaSumFloat3,tripleCGReduction_,0,0,0,0,0,false>
769 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
z,
x,
x);
/**
   Reduction functor computing the four partial results
   (||x||^2, ||y||^2, re<y,z>, ||w||^2) in the components of sum.
   The constructor parameters are unused (required by the generic
   reduction interface).
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct quadrupleCGReduction_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
  quadrupleCGReduction_(const Float2 &a, const Float2 &b) { ; }
  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
  {
    // NOTE(review): this typedef line was lost in extraction and is
    // reconstructed from the norm2_<scalar>/dot_<scalar> usage — confirm
    // against the upstream source.
    typedef typename scalar<ReduceType>::type scalar;
    norm2_<scalar>(sum.x, x);
    norm2_<scalar>(sum.y, y);
    dot_<scalar>(sum.z, y, z);
    norm2_<scalar>(sum.w, w);
  }
  static int streams() { return 3; } //! total number of input and output streams
  static int flops() { return 8; }   //! flops per element
};
793 return reduce::reduceCuda<double4,QudaSumFloat4,quadrupleCGReduction_,0,0,0,0,0,false>
794 (make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
z,
x,
x);
AxpyReDot(const Float2 &a, const Float2 &b)
scalar< ReduceType >::type real
__device__ __host__ ReduceType dotNormA_(const InputType &a, const InputType &b)
static struct @8 blasStrings
__device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
#define pinned_malloc(size)
void * getHostReduceBuffer()
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
caxpyNorm2(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
cudaDeviceProp deviceProp
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
char aux_tmp[quda::TuneKey::aux_n]
__device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
double norm2(const ColorSpinorField &a)
static int flops()
total number of input and output streams
axpyNorm2(const Float2 &a, const Float2 &b)
caxpydotzy(const Float2 &a, const Float2 &b)
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
std::complex< double > Complex
static int flops()
total number of input and output streams
__device__ __host__ ReduceType norm1_(const double2 &a)
Norm2(const Float2 &a, const Float2 &b)
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
Cdot(const Float2 &a, const Float2 &b)
double axpyNorm(const double &a, ColorSpinorField &x, ColorSpinorField &y)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
void * getMappedHostReduceBuffer()
static int flops()
total number of input and output streams
tripleCGReduction_(const Float2 &a, const Float2 &b)
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
CdotNormA(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
scalar< ReduceType >::type real
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
cudaStream_t * getStream()
static cudaEvent_t reduceEnd
Norm1(const Float2 &a, const Float2 &b)
double cabxpyAxNorm(const double &a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
__host__ __device__ void sum(double &a, double &b)
static int flops()
total number of input and output streams
xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
Complex axpyCGNorm(const double &a, ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
double4 quadrupleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Dot(const Float2 &a, const Float2 &b)
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
cudaEvent_t * getReduceEvent()
double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
static int flops()
total number of input and output streams
CdotNormB(const Float2 &a, const Float2 &b)
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
void * memset(void *__b, int __c, size_t __len)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &u)
cabxpyaxnorm(const Float2 &a, const Float2 &b)
__device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
double norm1(const ColorSpinorField &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)=0
where the reduction is usually computed and any auxiliary operations
double axpyReDot(const double &a, ColorSpinorField &x, ColorSpinorField &y)
xmyNorm2(const Float2 &a, const Float2 &b)
caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b)
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
static QudaSumFloat * h_reduce
xpaycdotzy(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Complex xpaycDotzy(ColorSpinorField &x, const double &a, ColorSpinorField &y, ColorSpinorField &z)
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
void * getDeviceReduceBuffer()
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
DotNormA(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
#define device_malloc(size)
caxpyxmaznormx(const Float2 &a, const Float2 &b)
static QudaSumFloat * d_reduce
axpyCGNorm2(const Float2 &a, const Float2 &b)
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
#define mapped_malloc(size)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
double3 cDotProductNormB(ColorSpinorField &a, ColorSpinorField &b)
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static QudaSumFloat * hd_reduce
double2 reDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)