14 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 17 template <
typename ReduceType,
typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
18 typename SpinorV,
typename Reducer>
27 ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
int length) :
43 template <
int block_size,
typename ReduceType,
typename FloatN,
int M,
typename Arg>
46 unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
47 unsigned int parity = blockIdx.y;
48 unsigned int gridSize = gridDim.x * blockDim.x;
53 while (i < arg.length) {
54 FloatN x[M], y[M], z[M], w[M], v[M];
55 arg.X.load(x, i, parity);
56 arg.Y.load(y, i, parity);
57 arg.Z.load(z, i, parity);
58 arg.W.load(w, i, parity);
59 arg.
V.load(v, i, parity);
64 for (
int j = 0; j < M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);
68 arg.X.save(x, i, parity);
69 arg.Y.save(y, i, parity);
70 arg.Z.save(z, i, parity);
71 arg.W.save(w, i, parity);
72 arg.
V.save(v, i, parity);
77 ::quda::reduce<block_size, ReduceType>(
arg,
sum,
parity);
/**
   Base class from which all reduction functors should derive.
   The kernel's M-loop calls pre() once before, operator() once per
   vector component bundle, and post() once after the loop.
   Fix: the original declared operator() as
   `virtual __device__ __host__ __host__` — the duplicated `__host__`
   execution-space qualifier is redundant and removed here.
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct ReduceFunctor {

  //! pre-computation routine called before the "M-loop"
  virtual __device__ __host__ void pre() { ; }

  //! where the reduction is usually computed and any auxiliary operations
  virtual __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) = 0;

  //! post-computation routine called after the "M-loop"
  virtual __device__ __host__ void post(ReduceType &sum) { ; }
};
/**
   L1-norm contribution of one complex number stored as (re, im) in a
   double2: |a| = sqrt(re^2 + im^2), cast to the reduction type.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const double2 &a)
{
  const double abs2 = a.x * a.x + a.y * a.y; // squared magnitude
  return (ReduceType)sqrt(abs2);
}
/**
   L1-norm contribution of one complex number stored as (re, im) in a
   float2: |a| = sqrt(re^2 + im^2), cast to the reduction type.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const float2 &a)
{
  const float abs2 = a.x * a.x + a.y * a.y; // squared magnitude (float precision, as original)
  return (ReduceType)sqrt(abs2);
}
/**
   L1-norm contribution of a float4, which packs two complex numbers
   as (x,y) and (z,w): returns |first| + |second|.
*/
template <typename ReduceType>
__device__ __host__ ReduceType norm1_(const float4 &a)
{
  const ReduceType first = (ReduceType)sqrt(a.x * a.x + a.y * a.y);
  const ReduceType second = (ReduceType)sqrt(a.z * a.z + a.w * a.w);
  return first + second;
}
// Functor fragment: reduces the L1 norm of the x spinor via norm1_.
// NOTE(review): the enclosing `struct Norm1 ...` header, member braces and the
// closing `};` are not visible in this extract — presumably lost in extraction;
// confirm against the full source before relying on structure.
115 template <
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the scalar arguments a and b are unused by this reduction.
117 Norm1(
const Float2 &a,
const Float2 &b) { ; }
118 __device__ __host__
void operator()(ReduceType &
sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
120 sum += norm1_<ReduceType>(x); // only x contributes; y, z, w, v are ignored
/**
   Accumulate the squared L2 norm of a double2 (one complex number)
   into sum, one component at a time (accumulation order preserved).
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
{
  const ReduceType re = (ReduceType)a.x;
  const ReduceType im = (ReduceType)a.y;
  sum += re * re;
  sum += im * im;
}
/**
   Accumulate the squared L2 norm of a float2 (one complex number)
   into sum, one component at a time (accumulation order preserved).
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const float2 &a)
{
  const ReduceType re = (ReduceType)a.x;
  const ReduceType im = (ReduceType)a.y;
  sum += re * re;
  sum += im * im;
}
/**
   Accumulate the squared L2 norm of a float4 (two packed complex
   numbers) into sum, one component at a time (accumulation order
   preserved).
*/
template <typename ReduceType>
__device__ __host__ void norm2_(ReduceType &sum, const float4 &a)
{
  const ReduceType v0 = (ReduceType)a.x;
  const ReduceType v1 = (ReduceType)a.y;
  const ReduceType v2 = (ReduceType)a.z;
  const ReduceType v3 = (ReduceType)a.w;
  sum += v0 * v0;
  sum += v1 * v1;
  sum += v2 * v2;
  sum += v3 * v3;
}
// Functor fragment: reduces the squared L2 norm of the x spinor via norm2_.
// NOTE(review): the enclosing `struct Norm2 ...` header, member braces and the
// closing `};` are not visible in this extract — presumably lost in extraction;
// confirm against the full source.
149 template <
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the scalar arguments a and b are unused by this reduction.
151 Norm2(
const Float2 &a,
const Float2 &b) { ; }
152 __device__ __host__
void operator()(ReduceType &
sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
154 norm2_<ReduceType>(
sum, x); // only x contributes; y, z, w, v are ignored
/**
   Accumulate the real dot product of two double2 values into sum,
   term by term (accumulation order preserved).
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
{
  const ReduceType ax = (ReduceType)a.x, bx = (ReduceType)b.x;
  const ReduceType ay = (ReduceType)a.y, by = (ReduceType)b.y;
  sum += ax * bx;
  sum += ay * by;
}
/**
   Accumulate the real dot product of two float2 values into sum,
   term by term (accumulation order preserved).
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
{
  const ReduceType ax = (ReduceType)a.x, bx = (ReduceType)b.x;
  const ReduceType ay = (ReduceType)a.y, by = (ReduceType)b.y;
  sum += ax * bx;
  sum += ay * by;
}
/**
   Accumulate the real dot product of two float4 values into sum,
   term by term (accumulation order preserved).
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
{
  const ReduceType ax = (ReduceType)a.x, bx = (ReduceType)b.x;
  const ReduceType ay = (ReduceType)a.y, by = (ReduceType)b.y;
  const ReduceType az = (ReduceType)a.z, bz = (ReduceType)b.z;
  const ReduceType aw = (ReduceType)a.w, bw = (ReduceType)b.w;
  sum += ax * bx;
  sum += ay * by;
  sum += az * bz;
  sum += aw * bw;
}
// Functor fragment: reduces the real dot product of the x and y spinors via dot_.
// NOTE(review): the enclosing `struct Dot ...` header, member braces and the
// closing `};` are not visible in this extract — presumably lost in extraction;
// confirm against the full source.
183 template <
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the scalar arguments a and b are unused by this reduction.
185 Dot(
const Float2 &a,
const Float2 &b) { ; }
186 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
188 dot_<ReduceType>(
sum, x, y); // z, w, v are ignored
198 template <
typename ReduceType,
typename Float2,
typename FloatN>
203 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
205 z = a.x * x + b.x * y;
206 norm2_<ReduceType>(
sum, z);
216 template <
typename ReduceType,
typename Float2,
typename FloatN>
219 AxpyReDot(
const Float2 &a,
const Float2 &b) : a(a) { ; }
220 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
223 dot_<ReduceType>(
sum, x, y);
232 __device__ __host__
void Caxpy_(
const double2 &a,
const double2 &x, double2 &y)
239 __device__ __host__
void Caxpy_(
const float2 &a,
const float2 &x, float2 &y)
246 __device__ __host__
void Caxpy_(
const float2 &a,
const float4 &x, float4 &y)
262 template <
typename ReduceType,
typename Float2,
typename FloatN>
266 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
269 norm2_<ReduceType>(
sum, y);
281 template <
typename ReduceType,
typename Float2,
typename FloatN>
285 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
289 norm2_<ReduceType>(
sum, x);
301 template <
typename ReduceType,
typename Float2,
typename FloatN>
306 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
311 norm2_<ReduceType>(
sum, z);
// Accumulate the complex inner product conj(a) * b into sum
// (sum.x = real part, sum.y = imaginary part).
// NOTE(review): the `scalar` casts below rely on a typedef such as
// `typedef typename scalar<ReduceType>::type scalar;` that is not visible in this
// extract — presumably lost in extraction; confirm against the full source.
320 template <
typename ReduceType> __device__ __host__
void cdot_(ReduceType &sum,
const double2 &a,
const double2 &b)
323 sum.x += (scalar)a.x * (scalar)b.x; // Re += Re(a)Re(b)
324 sum.x += (scalar)a.y * (scalar)b.y; // Re += Im(a)Im(b)  (first argument conjugated)
325 sum.y += (scalar)a.x * (scalar)b.y; // Im += Re(a)Im(b)
326 sum.y -= (scalar)a.y * (scalar)b.x; // Im -= Im(a)Re(b)
// Accumulate the complex inner product conj(a) * b into sum
// (sum.x = real part, sum.y = imaginary part), float2 overload.
// NOTE(review): the `scalar` casts below rely on a typedef such as
// `typedef typename scalar<ReduceType>::type scalar;` that is not visible in this
// extract — presumably lost in extraction; confirm against the full source.
329 template <
typename ReduceType> __device__ __host__
void cdot_(ReduceType &sum,
const float2 &a,
const float2 &b)
332 sum.x += (scalar)a.x * (scalar)b.x; // Re += Re(a)Re(b)
333 sum.x += (scalar)a.y * (scalar)b.y; // Re += Im(a)Im(b)  (first argument conjugated)
334 sum.y += (scalar)a.x * (scalar)b.y; // Im += Re(a)Im(b)
335 sum.y -= (scalar)a.y * (scalar)b.x; // Im -= Im(a)Re(b)
// Accumulate the complex inner product conj(a) * b into sum for a float4,
// which packs two complex numbers as (x,y) and (z,w); both pairs contribute
// to the same (sum.x = real, sum.y = imaginary) accumulators.
// NOTE(review): the `scalar` casts below rely on a typedef such as
// `typedef typename scalar<ReduceType>::type scalar;` that is not visible in this
// extract — presumably lost in extraction; confirm against the full source.
338 template <
typename ReduceType> __device__ __host__
void cdot_(ReduceType &sum,
const float4 &a,
const float4 &b)
341 sum.x += (scalar)a.x * (scalar)b.x; // first complex pair, real part
342 sum.x += (scalar)a.y * (scalar)b.y;
343 sum.x += (scalar)a.z * (scalar)b.z; // second complex pair, real part
344 sum.x += (scalar)a.w * (scalar)b.w;
345 sum.y += (scalar)a.x * (scalar)b.y; // first complex pair, imaginary part
346 sum.y -= (scalar)a.y * (scalar)b.x;
347 sum.y += (scalar)a.z * (scalar)b.w; // second complex pair, imaginary part
348 sum.y -= (scalar)a.w * (scalar)b.z;
// Functor fragment: reduces the complex dot product conj(x).y via cdot_.
// NOTE(review): the enclosing `struct Cdot ...` header, member braces and the
// closing `};` are not visible in this extract — presumably lost in extraction;
// confirm against the full source.
351 template <
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the scalar arguments a and b are unused by this reduction.
353 Cdot(
const Float2 &a,
const Float2 &b) { ; }
354 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
356 cdot_<ReduceType>(
sum, x, y); // z, w, v are ignored
367 template <
typename ReduceType,
typename Float2,
typename FloatN>
371 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
374 cdot_<ReduceType>(
sum, z, y);
// Accumulate the complex dot product conj(a).b into (sum.x, sum.y) and the
// squared norm of the FIRST argument into sum.z.
// NOTE(review): the `scalar` name used below (in the vec2 typedef and the
// norm2_ call) relies on a typedef such as
// `typedef typename scalar<ReduceType>::type scalar;` that is not visible in
// this extract — presumably lost in extraction; confirm against the full source.
384 template <
typename ReduceType,
typename InputType>
385 __device__ __host__
void cdotNormA_(ReduceType &sum,
const InputType &a,
const InputType &b)
388 typedef typename vector<scalar, 2>::type vec2; // NOTE(review): vec2 appears unused in the visible code
389 cdot_<ReduceType>(
sum, a, b); // fills sum.x (real) and sum.y (imaginary)
390 norm2_<scalar>(sum.z, a); // |a|^2 accumulated into the third component
// Accumulate the complex dot product conj(a).b into (sum.x, sum.y) and the
// squared norm of the SECOND argument into sum.z.
// NOTE(review): the `scalar` name used below (in the vec2 typedef and the
// norm2_ call) relies on a typedef such as
// `typedef typename scalar<ReduceType>::type scalar;` that is not visible in
// this extract — presumably lost in extraction; confirm against the full source.
397 template <
typename ReduceType,
typename InputType>
398 __device__ __host__
void cdotNormB_(ReduceType &sum,
const InputType &a,
const InputType &b)
401 typedef typename vector<scalar, 2>::type vec2; // NOTE(review): vec2 appears unused in the visible code
402 cdot_<ReduceType>(
sum, a, b); // fills sum.x (real) and sum.y (imaginary)
403 norm2_<scalar>(sum.z, b); // |b|^2 accumulated into the third component
406 template <
typename ReduceType,
typename Float2,
typename FloatN>
409 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
411 cdotNormA_<ReduceType>(
sum, x, y);
421 template <
typename ReduceType,
typename Float2,
typename FloatN>
426 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
431 cdotNormB_<ReduceType>(
sum, v, y);
443 template <
typename ReduceType,
typename Float2,
typename FloatN>
447 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
450 FloatN z_new = z + a.x * x;
451 norm2_<scalar>(sum.x, z_new);
452 dot_<scalar>(sum.y, z_new, z_new - z);
470 template <
typename ReduceType,
typename Float2,
typename FloatN>
478 __device__ __host__
void pre()
484 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
486 norm2_<real>(aux.x, x);
487 norm2_<real>(aux.y, y);
491 __device__ __host__
void post(ReduceType &sum)
495 sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
509 template <
typename ReduceType,
typename Float2,
typename FloatN>
517 __device__ __host__
void pre()
523 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
525 norm2_<real>(aux.x, x + y);
526 norm2_<real>(aux.y, z);
530 __device__ __host__
void post(ReduceType &sum)
534 sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
547 template <
typename ReduceType,
typename Float2,
typename FloatN>
550 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
553 norm2_<scalar>(sum.x, x);
554 norm2_<scalar>(sum.y, y);
555 dot_<scalar>(sum.z, y, z);
568 template <
typename ReduceType,
typename Float2,
typename FloatN>
571 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
574 norm2_<scalar>(sum.x, x);
575 norm2_<scalar>(sum.y, y);
576 dot_<scalar>(sum.z, y, z);
577 norm2_<scalar>(sum.w, w);
591 template <
typename ReduceType,
typename Float2,
typename FloatN>
595 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
601 norm2_<ReduceType>(
sum, y);
617 template <
typename ReduceType,
typename Float2,
typename FloatN>
621 FloatN tmpx {}, tmpy {};
622 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
626 x = b.x * (x + a.x * y) + b.y * z;
627 y = b.x * (y - a.x * v) + b.y * w;
630 norm2_<ReduceType>(
sum, y);
642 template <
typename ReduceType,
typename Float2,
typename FloatN>
646 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
650 norm2_<ReduceType>(
sum, x);
663 template <
typename ReduceType,
typename Float2,
typename FloatN>
668 __device__ __host__
void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
671 x = b.
x * (x - a.x * z) + b.y * y;
673 norm2_<ReduceType>(sum, x);
AxpyReDot(const Float2 &a, const Float2 &b)
scalar< ReduceType >::type real
static int flops()
total number of input and output streams
__device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
caxpyNorm2(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
caxpydotzy(const Float2 &a, const Float2 &b)
__host__ __device__ ValueType sqrt(ValueType x)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ ReduceType norm1_(const double2 &a)
Norm2(const Float2 &a, const Float2 &b)
cudaColorSpinorField * tmp
doubleCG3InitNorm_(const Float2 &a, const Float2 &b)
Cdot(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
tripleCGReduction_(const Float2 &a, const Float2 &b)
CdotNormA(const Float2 &a, const Float2 &b)
__host__ __device__ void sum(double &a, double &b)
scalar< ReduceType >::type real
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
Norm1(const Float2 &a, const Float2 &b)
cabxpyzaxnorm(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__global__ void reduceKernel(Arg arg)
ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length)
static int flops()
total number of input and output streams
Dot(const Float2 &a, const Float2 &b)
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
doubleCG3UpdateNorm_(const Float2 &a, const Float2 &b)
static int flops()
total number of input and output streams
void zero(ColorSpinorField &a)
static int flops()
total number of input and output streams
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
axpbyzNorm2(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b)
quadrupleCG3InitNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
quadrupleCGReduction_(const Float2 &a, const Float2 &b)
caxpyxmaznormx(const Float2 &a, const Float2 &b)
axpyCGNorm2(const Float2 &a, const Float2 &b)
quadrupleCG3UpdateNorm_(const Float2 &a, const Float2 &b)
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams