15 #define checkSpinor(a, b) \
17 if (a.Precision() != b.Precision()) \
18 errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
19 if (a.Length() != b.Length()) \
20 errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
21 if (a.Stride() != b.Stride()) \
22 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
25 #define checkLength(a, b) \
27 if (a.Length() != b.Length()) \
28 errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
29 if (a.Stride() != b.Stride()) \
30 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
42 static cudaStream_t *blasStream;
72 template <
typename Float2,
typename FloatN>
76 axpby(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
77 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w) { y =
a.x*x +
b.x*
y; }
79 static int flops() {
return 3; }
83 blasCuda<axpby,0,1,0,0>(make_double2(a, 0.0), make_double2(b, 0.0), make_double2(0.0, 0.0),
90 template <
typename Float2,
typename FloatN>
92 xpy(
const Float2 &a,
const Float2 &b,
const Float2 &c) { ; }
93 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w) { y +=
x ; }
95 static int flops() {
return 1; }
99 blasCuda<xpy,0,1,0,0>(make_double2(1.0, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
106 template <
typename Float2,
typename FloatN>
109 axpy(
const Float2 &
a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
110 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w) { y =
a.x*x +
y; }
118 mixed::blasCuda<axpy,0,1,0,0>(make_double2(a,0.0), make_double2(1.0,0.0), make_double2(0.0,0.0),
121 blasCuda<axpy,0,1,0,0>(make_double2(a, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
129 template <
typename Float2,
typename FloatN>
132 xpay(
const Float2 &
a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
133 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w) { y = x +
a.x*
y; }
139 blasCuda<xpay,0,1,0,0>(make_double2(a,0.0), make_double2(0.0, 0.0), make_double2(0.0, 0.0),
146 template <
typename Float2,
typename FloatN>
148 mxpy(
const Float2 &a,
const Float2 &b,
const Float2 &c) { ; }
149 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w) { y -=
x; }
155 blasCuda<mxpy,0,1,0,0>(make_double2(1.0, 0.0), make_double2(1.0, 0.0),
156 make_double2(0.0, 0.0),
x,
y,
x,
x);
162 template <
typename Float2,
typename FloatN>
165 ax(
const Float2 &
a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
166 __device__
void operator()(FloatN &
x,
const FloatN &
y,
const FloatN &z,
const FloatN &w) { x *=
a.x; }
172 blasCuda<ax,1,0,0,0>(make_double2(a, 0.0), make_double2(0.0, 0.0),
173 make_double2(0.0, 0.0),
x,
x,
x,
x);
180 __device__
void caxpy_(
const float2 &a,
const float4 &
x, float4 &
y) {
181 y.x += a.x*x.x; y.x -= a.y*x.y;
182 y.y += a.y*x.x; y.y += a.x*x.y;
183 y.z += a.x*x.z; y.z -= a.y*x.w;
184 y.w += a.y*x.z; y.w += a.x*x.w;
187 __device__
void caxpy_(
const float2 &a,
const float2 &
x, float2 &
y) {
188 y.x += a.x*x.x; y.x -= a.y*x.y;
189 y.y += a.y*x.x; y.y += a.x*x.y;
192 __device__
void caxpy_(
const double2 &a,
const double2 &
x, double2 &
y) {
193 y.x += a.x*x.x; y.x -= a.y*x.y;
194 y.y += a.y*x.x; y.y += a.x*x.y;
197 template <
typename Float2,
typename FloatN>
200 caxpy(
const Float2 &
a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
201 __device__
void operator()(
const FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w)
208 blasCuda<caxpy,0,1,0,0>(make_double2(
REAL(a),
IMAG(a)),
209 make_double2(0.0, 0.0),
210 make_double2(0.0, 0.0),
x,
y,
x,
x);
217 __device__
void caxpby_(
const float2 &a,
const float4 &
x,
const float2 &b, float4 &
y)
219 yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
220 yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
221 yy.z = a.x*x.z; yy.z -= a.y*x.w; yy.z += b.x*y.z; yy.z -= b.y*y.w;
222 yy.w = a.y*x.z; yy.w += a.x*x.w; yy.w += b.y*y.z; yy.w += b.x*y.w;
225 __device__
void caxpby_(
const float2 &a,
const float2 &
x,
const float2 &b, float2 &
y)
227 yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
228 yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
231 __device__
void caxpby_(
const double2 &a,
const double2 &
x,
const double2 &b, double2 &
y)
233 yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
234 yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
237 template <
typename Float2,
typename FloatN>
241 caxpby(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
248 blasCuda<caxpby,0,1,0,0>(make_double2(
REAL(a),
IMAG(a)), make_double2(
REAL(b),
IMAG(b)),
249 make_double2(0.0, 0.0),
x,
y,
x,
x);
256 __device__
void cxpaypbz_(
const float4 &
x,
const float2 &a,
const float4 &
y,
const float2 &b, float4 &z) {
258 zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
259 zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
260 zz.z = x.z + a.x*y.z; zz.z -= a.y*y.w; zz.z += b.x*z.z; zz.z -= b.y*z.w;
261 zz.w = x.w + a.y*y.z; zz.w += a.x*y.w; zz.w += b.y*z.z; zz.w += b.x*z.w;
265 __device__
void cxpaypbz_(
const float2 &
x,
const float2 &a,
const float2 &
y,
const float2 &b, float2 &z) {
267 zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
268 zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
272 __device__
void cxpaypbz_(
const double2 &
x,
const double2 &a,
const double2 &
y,
const double2 &b, double2 &z) {
274 zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
275 zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
279 template <
typename Float2,
typename FloatN>
283 cxpaypbz(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
284 __device__
void operator()(
const FloatN &
x,
const FloatN &
y, FloatN &z, FloatN &w)
292 blasCuda<cxpaypbz,0,0,1,0>(make_double2(
REAL(a),
IMAG(a)), make_double2(
REAL(b),
IMAG(b)),
293 make_double2(0.0, 0.0),
x,
y, z, z);
299 template <
typename Float2,
typename FloatN>
304 axpyBzpcx(
const Float2 &
a,
const Float2 &
b,
const Float2 &
c) : a(a), b(b), c(c) { ; }
305 __device__
void operator()(FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w)
306 { y +=
a.x*
x; x =
b.x*z +
c.x*
x; }
315 mixed::blasCuda<axpyBzpcx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0),
316 make_double2(c,0.0),
x,
y, z,
x);
319 blasCuda<axpyBzpcx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0),
320 make_double2(c,0.0),
x,
y, z,
x);
327 template <
typename Float2,
typename FloatN>
331 axpyZpbx(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
332 __device__
void operator()(FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w)
333 { y +=
a.x*
x; x = z +
b.x*
x; }
342 mixed::blasCuda<axpyZpbx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
346 blasCuda<axpyZpbx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
354 template <
typename Float2,
typename FloatN>
358 caxpbypzYmbw(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
359 __device__
void operator()(
const FloatN &
x, FloatN &
y, FloatN &z,
const FloatN &w)
368 blasCuda<caxpbypzYmbw,0,1,1,0>(make_double2(
REAL(a),
IMAG(a)), make_double2(
REAL(b),
IMAG(b)),
369 make_double2(0.0,0.0),
x,
y, z, w);
375 template <
typename Float2,
typename FloatN>
379 cabxpyAx(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
380 __device__
void operator()(FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w)
389 blasCuda<cabxpyAx,1,1,0,0>(make_double2(a,0.0), make_double2(
REAL(b),
IMAG(b)),
390 make_double2(0.0,0.0),
x,
y,
x,
x);
396 template <
typename Float2,
typename FloatN>
400 caxpbypz(
const Float2 &
a,
const Float2 &
b,
const Float2 &c) : a(a), b(b) { ; }
401 __device__
void operator()(
const FloatN &
x,
const FloatN &
y, FloatN &z,
const FloatN &w)
409 blasCuda<caxpbypz,0,0,1,0>(make_double2(
REAL(a),
IMAG(a)), make_double2(
REAL(b),
IMAG(b)),
410 make_double2(0.0,0.0),
x,
y, z, z);
416 template <
typename Float2,
typename FloatN>
421 caxpbypczpw(
const Float2 &
a,
const Float2 &
b,
const Float2 &
c) : a(a), b(b), c(c) { ; }
422 __device__
void operator()(
const FloatN &
x,
const FloatN &
y,
const FloatN &z, FloatN &w)
432 blasCuda<caxpbypczpw,0,0,0,1>(make_double2(
REAL(a),
IMAG(a)), make_double2(
REAL(b),
IMAG(b)),
442 template <
typename Float2,
typename FloatN>
445 caxpyxmaz(
const Float2 &
a,
const Float2 &b,
const Float2 &c) : a(a) { ; }
446 __device__
void operator()(FloatN &
x, FloatN &
y,
const FloatN &z,
const FloatN &w)
454 blasCuda<caxpyxmaz,1,1,0,0>(make_double2(
REAL(a),
IMAG(a)), make_double2(0.0, 0.0),
455 make_double2(0.0, 0.0),
x,
y, z,
x);
469 template <
typename Float2,
typename FloatN>
473 __device__
void operator()(
const FloatN &
x, FloatN &
y, FloatN &z, FloatN &w)
475 { y +=
a.x*w; z -=
a.x*
x; w = z +
b.x*w; }
484 mixed::blasCuda<tripleCGUpdate,0,1,1,1>(make_double2(a,0.0), make_double2(b,0.0),
485 make_double2(0.0,0.0),
x,
y, z, w);
487 blasCuda<tripleCGUpdate,0,1,1,1>(make_double2(a, 0.0), make_double2(b, 0.0),
488 make_double2(0.0, 0.0),
x,
y, z, w);
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
axpby(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, const FloatN &w)
xpy(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
axpyZpbx(const Float2 &a, const Float2 &b, const Float2 &c)
caxpbypczpw(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, FloatN &w)
char aux_tmp[TuneKey::aux_n]
static int flops()
total number of input and output streams
__device__ void operator()(const FloatN &x, const FloatN &y, const FloatN &z, FloatN &w)
unsigned long long blas_bytes
std::complex< double > Complex
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y)
__device__ void caxpy_(const float2 &a, const float4 &x, float4 &y)
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
mxpy(const Float2 &a, const Float2 &b, const Float2 &c)
xpay(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ void cxpaypbz_(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z)
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
__device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
axpyBzpcx(const Float2 &a, const Float2 &b, const Float2 &c)
void cabxpyAxCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
__device__ void operator()(FloatN &x, const FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
void axpyBzpcxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, const double &b, cudaColorSpinorField &z, const double &c)
void caxpyXmazCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
caxpyxmaz(const Float2 &a, const Float2 &b, const Float2 &c)
cxpaypbz(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
__device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, FloatN &w)
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
__device__ void caxpby_(const float2 &a, const float4 &x, const float2 &b, float4 &y)
static int flops()
total number of input and output streams
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
caxpby(const Float2 &a, const Float2 &b, const Float2 &c)
caxpbypzYmbw(const Float2 &a, const Float2 &b, const Float2 &c)
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
void caxpbypczpwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
static int flops()
total number of input and output streams
static int flops()
total number of input and output streams
cabxpyAx(const Float2 &a, const Float2 &b, const Float2 &c)
cudaStream_t * getBlasStream()
unsigned long long blas_flops
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
axpy(const Float2 &a, const Float2 &b, const Float2 &c)
tripleCGUpdate(const Float2 &a, const Float2 &b, const Float2 &c)
QudaPrecision Precision() const
void cxpaypbzCuda(cudaColorSpinorField &, const Complex &b, cudaColorSpinorField &y, const Complex &c, cudaColorSpinorField &z)
ax(const Float2 &a, const Float2 &b, const Float2 &c)
caxpbypz(const Float2 &a, const Float2 &b, const Float2 &c)
caxpy(const Float2 &a, const Float2 &b, const Float2 &c)
static int flops()
total number of input and output streams
void caxpbypzCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
void zeroCuda(cudaColorSpinorField &a)
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
void tripleCGUpdateCuda(const double &alpha, const double &beta, cudaColorSpinorField &q, cudaColorSpinorField &r, cudaColorSpinorField &x, cudaColorSpinorField &p)
void caxpbyCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y)
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams
void axCuda(const double &a, cudaColorSpinorField &x)
void caxpbypzYmbwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &, cudaColorSpinorField &)
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
static int flops()
total number of input and output streams