12 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 18 template <
typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Functor>
27 BlasArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Functor f,
int length) :
45 unsigned int i = blockIdx.x * (blockDim.x) + threadIdx.x;
46 unsigned int parity = blockIdx.y;
47 unsigned int gridSize = gridDim.x * blockDim.x;
51 while (i < arg.length) {
52 FloatN x[M], y[M], z[M], w[M], v[M];
60 for (
int j = 0; j < M; j++) arg.f(x[j], y[j], z[j], w[j], v[j]);
71 template <
typename Float2,
typename FloatN>
struct BlasFunctor {
74 virtual __device__ __host__
void init() { ; }
77 virtual __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) = 0;
83 template <
typename Float2,
typename FloatN>
struct axpbyz_ :
public BlasFunctor<Float2, FloatN> {
86 axpbyz_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
87 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
89 v = a.x * x + b.x * y;
92 static int flops() {
return 3; }
98 template <
typename Float2,
typename FloatN>
struct ax_ :
public BlasFunctor<Float2, FloatN> {
100 ax_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
101 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { x *= a.x; }
110 __device__ __host__
void _caxpy(
const float2 &a,
const float4 &x, float4 &y)
122 __device__ __host__
void _caxpy(
const float2 &a,
const float2 &x, float2 &y)
130 __device__ __host__
void _caxpy(
const double2 &a,
const double2 &x, double2 &y)
138 template <
typename Float2,
typename FloatN>
struct caxpy_ :
public BlasFunctor<Float2, FloatN> {
140 caxpy_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
141 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
_caxpy(a, x, y); }
150 __device__ __host__
void _caxpby(
const float2 &a,
const float4 &x,
const float2 &b, float4 &y)
172 __device__ __host__
void _caxpby(
const float2 &a,
const float2 &x,
const float2 &b, float2 &y)
186 __device__ __host__
void _caxpby(
const double2 &a,
const double2 &x,
const double2 &b, double2 &y)
203 caxpby_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
204 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
216 caxpbypczw_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b), c(c) { ; }
217 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
234 axpyBzpcx_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b), c(c) { ; }
235 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
238 x = b.x * z + c.x * x;
250 axpyZpbx_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
251 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
266 caxpyBzpx_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
267 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
283 caxpyBxpz_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
284 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
300 caxpbypzYmbw_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
301 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
318 cabxpyAx_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a), b(b) { ; }
319 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
335 caxpyxmaz_(
const Float2 &a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
336 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
360 inline __device__ __host__
void init()
363 typedef decltype(a.x) real;
364 double3 result = __ldg(Ar3);
365 a.y = a.x * (real)(result.y) * ((real)1.0 / (real)result.z);
366 a.x = a.x * (real)(result.x) * ((real)1.0 / (real)result.z);
370 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
389 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
407 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
426 __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
429 x = b.x * (x + a.x * z) + b.y * y;
caxpbypzYmbw_(const Float2 &a, const Float2 &b, const Float2 &c)
tripleCGUpdate_(const Float2 &a, const Float2 &b, const Float2 &c)
axpyBzpcx_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void _caxpby(const float2 &a, const float4 &x, const float2 &b, float4 &y)
axpbyz_(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
doubleCG3Update_(const Float2 &a, const Float2 &b, const Float2 &c)
doubleCG3Init_(const Float2 &a, const Float2 &b, const Float2 &c)
caxpyxmaz_(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
cudaColorSpinorField * tmp
caxpyBzpx_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
caxpyBxpz_(const Float2 &a, const Float2 &b, const Float2 &c)
caxpbypczw_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
BlasArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Functor f, int length)
caxpy_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ void init()
pre-computation routine before the main loop
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__global__ void blasKernel(Arg arg)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
axpyZpbx_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
ax_(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
cabxpyAx_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void init()
pre-computation routine before the main loop
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
void * getDeviceReduceBuffer()
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
caxpyxmazMR_(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
static int flops()
total number of input and output streams
caxpby_(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams