13 #define checkSpinor(a, b) \ 15 if (a.Precision() != b.Precision()) \ 16 errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \ 17 if (a.Length() != b.Length()) \ 18 errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \ 19 if (a.Stride() != b.Stride()) \ 20 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \ 23 #define checkLength(a, b) \ 25 if (a.Length() != b.Length()) \ 26 errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \ 27 if (a.Stride() != b.Stride()) \ 28 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \ 36 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 48 template <
int writeX,
int writeY,
int writeZ,
int writeW>
50 static constexpr
int X = writeX;
51 static constexpr
int Y = writeY;
52 static constexpr
int Z = writeZ;
53 static constexpr
int W = writeW;
61 template <
int NXZ,
typename Float2,
typename FloatN>
65 virtual __device__ __host__
void init() { ; }
68 virtual __device__ __host__
void operator()(FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w,
const int i,
const int j) = 0;
76 __device__ __host__
inline void _caxpy(
const float2 &
a,
const float4 &
x, float4 &
y) {
77 y.x +=
a.x*
x.x;
y.x -=
a.y*
x.y;
78 y.y +=
a.y*
x.x;
y.y +=
a.x*
x.y;
79 y.z +=
a.x*
x.z;
y.z -=
a.y*
x.w;
80 y.w +=
a.y*
x.z;
y.w +=
a.x*
x.w;
83 __device__ __host__
inline void _caxpy(
const float2 &
a,
const float2 &
x, float2 &
y) {
84 y.x +=
a.x*
x.x;
y.x -=
a.y*
x.y;
85 y.y +=
a.y*
x.x;
y.y +=
a.x*
x.y;
88 __device__ __host__
inline void _caxpy(
const double2 &
a,
const double2 &
x, double2 &
y) {
89 y.x +=
a.x*
x.x;
y.x -=
a.y*
x.y;
90 y.y +=
a.y*
x.x;
y.y +=
a.x*
x.y;
93 template<
int NXZ,
typename Float2,
typename FloatN>
101 __device__ __host__
inline void operator()(FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w,
const int i,
const int j)
104 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_d);
107 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_h);
118 int i_idx ,
int j_idx,
int upper) {
124 Complex* tmpmajor0 = &tmpmajor[0];
125 Complex* tmpmajor1 = &tmpmajor[
x.size()*(
y.size()/2)];
126 std::vector<ColorSpinorField*>
y0(
y.begin(),
y.begin() +
y.size()/2);
127 std::vector<ColorSpinorField*>
y1(
y.begin() +
y.size()/2,
y.end());
129 const unsigned int xlen =
x.size();
130 const unsigned int ylen0 =
y.size()/2;
131 const unsigned int ylen1 =
y.size() -
y.size()/2;
133 int count = 0, count0 = 0, count1 = 0;
134 for (
unsigned int i = 0;
i < xlen;
i++)
136 for (
unsigned int j = 0; j < ylen0; j++)
137 tmpmajor0[count0++] = a_[
count++];
138 for (
unsigned int j = 0; j < ylen1; j++)
139 tmpmajor1[count1++] = a_[
count++];
153 if (upper == 1 && j_idx < i_idx) {
return; }
154 if (upper == -1 && j_idx > i_idx) {
return; }
160 if (
x[0]->Precision() ==
y[0]->Precision())
164 multiblasCuda<1,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
166 #if MAX_MULTI_BLAS_N >= 2 168 multiblasCuda<2,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
170 #if MAX_MULTI_BLAS_N >= 3 172 multiblasCuda<3,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
174 #if MAX_MULTI_BLAS_N >= 4 176 multiblasCuda<4,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
178 #if MAX_MULTI_BLAS_N >= 5 180 multiblasCuda<5,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
182 #if MAX_MULTI_BLAS_N >= 6 184 multiblasCuda<6,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
186 #if MAX_MULTI_BLAS_N >= 7 188 multiblasCuda<7,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
190 #if MAX_MULTI_BLAS_N >= 8 192 multiblasCuda<8,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
194 #if MAX_MULTI_BLAS_N >= 9 196 multiblasCuda<9,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
198 #if MAX_MULTI_BLAS_N >= 10 200 multiblasCuda<10,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
202 #if MAX_MULTI_BLAS_N >= 11 204 multiblasCuda<11,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
206 #if MAX_MULTI_BLAS_N >= 12 208 multiblasCuda<12,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
210 #if MAX_MULTI_BLAS_N >= 13 212 multiblasCuda<13,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
214 #if MAX_MULTI_BLAS_N >= 14 216 multiblasCuda<14,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
218 #if MAX_MULTI_BLAS_N >= 15 220 multiblasCuda<15,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
222 #if MAX_MULTI_BLAS_N >= 16 224 multiblasCuda<16,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
246 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin() +
x.size()/2);
247 std::vector<ColorSpinorField*> x1(
x.begin() +
x.size()/2,
x.end());
258 mixed::multiblasCuda<1,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
260 #if MAX_MULTI_BLAS_N >= 2 262 mixed::multiblasCuda<2,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
264 #if MAX_MULTI_BLAS_N >= 3 266 mixed::multiblasCuda<3,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
268 #if MAX_MULTI_BLAS_N >= 4 270 mixed::multiblasCuda<4,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
272 #if MAX_MULTI_BLAS_N >= 5 274 mixed::multiblasCuda<5,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
276 #if MAX_MULTI_BLAS_N >= 6 278 mixed::multiblasCuda<6,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
280 #if MAX_MULTI_BLAS_N >= 7 282 mixed::multiblasCuda<7,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
284 #if MAX_MULTI_BLAS_N >= 8 286 mixed::multiblasCuda<8,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
288 #if MAX_MULTI_BLAS_N >= 9 290 mixed::multiblasCuda<9,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
292 #if MAX_MULTI_BLAS_N >= 10 294 mixed::multiblasCuda<10,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
296 #if MAX_MULTI_BLAS_N >= 11 298 mixed::multiblasCuda<11,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
300 #if MAX_MULTI_BLAS_N >= 12 302 mixed::multiblasCuda<12,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
304 #if MAX_MULTI_BLAS_N >= 13 306 mixed::multiblasCuda<13,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
308 #if MAX_MULTI_BLAS_N >= 14 310 mixed::multiblasCuda<14,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
312 #if MAX_MULTI_BLAS_N >= 15 314 mixed::multiblasCuda<15,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
316 #if MAX_MULTI_BLAS_N >= 16 318 mixed::multiblasCuda<16,multicaxpy_,write<0,1,0,0> >(
a,
b,
c,
x,
y,
x,
y);
340 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin() +
x.size()/2);
341 std::vector<ColorSpinorField*> x1(
x.begin() +
x.size()/2,
x.end());
351 void caxpy(
const Complex *a_, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y) {
357 void caxpy_U(
const Complex *a_, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y) {
361 if (
x.size() !=
y.size())
363 errorQuda(
"An optimal block caxpy_U with non-square 'a' has not yet been implemented. Use block caxpy instead.\n");
369 void caxpy_L(
const Complex *a_, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y) {
373 if (
x.size() !=
y.size())
375 errorQuda(
"An optimal block caxpy_L with non-square 'a' has not yet been implemented. Use block caxpy instead.\n");
391 template<
int NXZ,
typename Float2,
typename FloatN>
399 __device__ __host__
inline void operator()(FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w,
const int i,
const int j)
402 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_d);
406 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_h);
416 void caxpyz_recurse(
const Complex *a_, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y, std::vector<ColorSpinorField*> &
z,
int i,
int j,
int pass,
int upper) {
422 Complex* tmpmajor0 = &tmpmajor[0];
423 Complex* tmpmajor1 = &tmpmajor[
x.size()*(
y.size()/2)];
424 std::vector<ColorSpinorField*>
y0(
y.begin(),
y.begin() +
y.size()/2);
425 std::vector<ColorSpinorField*>
y1(
y.begin() +
y.size()/2,
y.end());
427 std::vector<ColorSpinorField*> z0(
z.begin(),
z.begin() +
z.size()/2);
428 std::vector<ColorSpinorField*> z1(
z.begin() +
z.size()/2,
z.end());
430 const unsigned int xlen =
x.size();
431 const unsigned int ylen0 =
y.size()/2;
432 const unsigned int ylen1 =
y.size() -
y.size()/2;
434 int count = 0, count0 = 0, count1 = 0;
435 for (
unsigned int i_ = 0; i_ < xlen; i_++)
437 for (
unsigned int j = 0; j < ylen0; j++)
438 tmpmajor0[count0++] = a_[
count++];
439 for (
unsigned int j = 0; j < ylen1; j++)
440 tmpmajor1[count1++] = a_[
count++];
455 if (upper == 1 && j <
i) {
return; }
456 if (upper == -1 &&
i < j) {
return; }
468 if (
x[0]->Precision() ==
y[0]->Precision())
472 multiblasCuda<1,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
474 #if MAX_MULTI_BLAS_N >= 2 476 multiblasCuda<2,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
478 #if MAX_MULTI_BLAS_N >= 3 480 multiblasCuda<3,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
482 #if MAX_MULTI_BLAS_N >= 4 484 multiblasCuda<4,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
486 #if MAX_MULTI_BLAS_N >= 5 488 multiblasCuda<5,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
490 #if MAX_MULTI_BLAS_N >= 6 492 multiblasCuda<6,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
494 #if MAX_MULTI_BLAS_N >= 7 496 multiblasCuda<7,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
498 #if MAX_MULTI_BLAS_N >= 8 500 multiblasCuda<8,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
502 #if MAX_MULTI_BLAS_N >= 9 504 multiblasCuda<9,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
506 #if MAX_MULTI_BLAS_N >= 10 508 multiblasCuda<10,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
510 #if MAX_MULTI_BLAS_N >= 11 512 multiblasCuda<11,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
514 #if MAX_MULTI_BLAS_N >= 12 516 multiblasCuda<12,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
518 #if MAX_MULTI_BLAS_N >= 13 520 multiblasCuda<13,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
522 #if MAX_MULTI_BLAS_N >= 14 524 multiblasCuda<14,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
526 #if MAX_MULTI_BLAS_N >= 15 528 multiblasCuda<15,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
530 #if MAX_MULTI_BLAS_N >= 16 532 multiblasCuda<16,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
554 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin() +
x.size()/2);
555 std::vector<ColorSpinorField*> x1(
x.begin() +
x.size()/2,
x.end());
566 mixed::multiblasCuda<1,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
568 #if MAX_MULTI_BLAS_N >= 2 570 mixed::multiblasCuda<2,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
572 #if MAX_MULTI_BLAS_N >= 3 574 mixed::multiblasCuda<3,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
576 #if MAX_MULTI_BLAS_N >= 4 578 mixed::multiblasCuda<4,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
580 #if MAX_MULTI_BLAS_N >= 5 582 mixed::multiblasCuda<5,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
584 #if MAX_MULTI_BLAS_N >= 6 586 mixed::multiblasCuda<6,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
588 #if MAX_MULTI_BLAS_N >= 7 590 mixed::multiblasCuda<7,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
592 #if MAX_MULTI_BLAS_N >= 8 594 mixed::multiblasCuda<8,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
596 #if MAX_MULTI_BLAS_N >= 9 598 mixed::multiblasCuda<9,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
600 #if MAX_MULTI_BLAS_N >= 10 602 mixed::multiblasCuda<10,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
604 #if MAX_MULTI_BLAS_N >= 11 606 mixed::multiblasCuda<11,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
608 #if MAX_MULTI_BLAS_N >= 12 610 mixed::multiblasCuda<12,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
612 #if MAX_MULTI_BLAS_N >= 13 614 mixed::multiblasCuda<13,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
616 #if MAX_MULTI_BLAS_N >= 14 618 mixed::multiblasCuda<14,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
620 #if MAX_MULTI_BLAS_N >= 15 622 mixed::multiblasCuda<15,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
624 #if MAX_MULTI_BLAS_N >= 16 626 mixed::multiblasCuda<16,multicaxpyz_,write<0,0,0,1> >(
a,
b,
c,
x,
y,
x,
z);
648 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin() +
x.size()/2);
649 std::vector<ColorSpinorField*> x1(
x.begin() +
x.size()/2,
x.end());
659 void caxpyz(
const Complex *
a, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y, std::vector<ColorSpinorField*> &
z) {
666 void caxpyz_U(
const Complex *
a, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y, std::vector<ColorSpinorField*> &
z) {
674 void caxpyz_L(
const Complex *
a, std::vector<ColorSpinorField*> &
x, std::vector<ColorSpinorField*> &
y, std::vector<ColorSpinorField*> &
z) {
684 caxpyz(
a,
x.Components(),
y.Components(),
z.Components());
688 caxpyz_U(
a,
x.Components(),
y.Components(),
z.Components());
692 caxpyz_L(
a,
x.Components(),
y.Components(),
z.Components());
698 template<
int NXZ,
typename Float2,
typename FloatN>
707 for (
int i=0;
i<
NYW;
i++) { this->a[
i] =
a.data[
i]; this->b[
i] =
b.data[
i]; this->c[
i] =
c.data[
i]; }
709 __device__ __host__
inline void operator()(FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w,
const int i,
const int j)
718 void axpyBzpcx(
const double *a_, std::vector<ColorSpinorField*> &x_, std::vector<ColorSpinorField*> &y_,
725 std::vector<ColorSpinorField*> &
y = y_;
726 std::vector<ColorSpinorField*> &
w = x_;
729 std::vector<ColorSpinorField*>
x;
735 if (
x[0]->Precision() !=
y[0]->Precision() ) {
736 mixed::multiblasCuda<1,multi_axpyBzpcx_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
738 multiblasCuda<1,multi_axpyBzpcx_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
742 const double *a0 = &a_[0];
743 const double *b0 = &b_[0];
744 const double *c0 = &c_[0];
746 std::vector<ColorSpinorField*> x0(x_.begin(), x_.begin() + x_.size()/2);
747 std::vector<ColorSpinorField*>
y0(y_.begin(), y_.begin() + y_.size()/2);
751 const double *
a1 = &a_[y_.size()/2];
752 const double *b1 = &b_[y_.size()/2];
753 const double *c1 = &c_[y_.size()/2];
755 std::vector<ColorSpinorField*> x1(x_.begin() + x_.size()/2, x_.end());
756 std::vector<ColorSpinorField*>
y1(y_.begin() + y_.size()/2, y_.end());
765 template<
int NXZ,
typename Float2,
typename FloatN>
775 __device__ __host__
inline void operator()(FloatN &
x, FloatN &
y, FloatN &
z, FloatN &
w,
const int i,
const int j)
778 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_d);
779 Float2 *
b =
reinterpret_cast<Float2*
>(
Bmatrix_d);
782 Float2 *
a =
reinterpret_cast<Float2*
>(
Amatrix_h);
783 Float2 *
b =
reinterpret_cast<Float2*
>(
Bmatrix_h);
795 const int xsize = x_.size();
802 std::vector<ColorSpinorField*>
y;
804 std::vector<ColorSpinorField*>
w;
808 std::vector<ColorSpinorField*> &
x = x_;
813 if (
x[0]->Precision() !=
y[0]->Precision() )
818 mixed::multiblasCuda<1,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
820 #if MAX_MULTI_BLAS_N >= 2 822 mixed::multiblasCuda<2,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
824 #if MAX_MULTI_BLAS_N >= 3 826 mixed::multiblasCuda<3,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
828 #if MAX_MULTI_BLAS_N >= 4 830 mixed::multiblasCuda<4,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
832 #if MAX_MULTI_BLAS_N >= 5 834 mixed::multiblasCuda<5,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
836 #if MAX_MULTI_BLAS_N >= 6 838 mixed::multiblasCuda<6,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
840 #if MAX_MULTI_BLAS_N >= 7 842 mixed::multiblasCuda<7,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
844 #if MAX_MULTI_BLAS_N >= 8 846 mixed::multiblasCuda<8,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
848 #if MAX_MULTI_BLAS_N >= 9 850 mixed::multiblasCuda<9,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
852 #if MAX_MULTI_BLAS_N >= 10 854 mixed::multiblasCuda<10,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
856 #if MAX_MULTI_BLAS_N >= 11 858 mixed::multiblasCuda<11,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
860 #if MAX_MULTI_BLAS_N >= 12 862 mixed::multiblasCuda<12,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
864 #if MAX_MULTI_BLAS_N >= 13 866 mixed::multiblasCuda<13,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
868 #if MAX_MULTI_BLAS_N >= 14 870 mixed::multiblasCuda<14,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
872 #if MAX_MULTI_BLAS_N >= 15 874 mixed::multiblasCuda<15,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
876 #if MAX_MULTI_BLAS_N >= 16 878 mixed::multiblasCuda<16,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
905 multiblasCuda<1,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
907 #if MAX_MULTI_BLAS_N >= 2 909 multiblasCuda<2,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
911 #if MAX_MULTI_BLAS_N >= 3 913 multiblasCuda<3,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
915 #if MAX_MULTI_BLAS_N >= 4 917 multiblasCuda<4,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
919 #if MAX_MULTI_BLAS_N >= 5 921 multiblasCuda<5,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
923 #if MAX_MULTI_BLAS_N >= 6 925 multiblasCuda<6,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
927 #if MAX_MULTI_BLAS_N >= 7 929 multiblasCuda<7,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
931 #if MAX_MULTI_BLAS_N >= 8 933 multiblasCuda<8,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
935 #if MAX_MULTI_BLAS_N >= 9 937 multiblasCuda<9,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
939 #if MAX_MULTI_BLAS_N >= 10 941 multiblasCuda<10,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
943 #if MAX_MULTI_BLAS_N >= 11 945 multiblasCuda<11,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
947 #if MAX_MULTI_BLAS_N >= 12 949 multiblasCuda<12,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
951 #if MAX_MULTI_BLAS_N >= 13 953 multiblasCuda<13,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
955 #if MAX_MULTI_BLAS_N >= 14 957 multiblasCuda<14,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
959 #if MAX_MULTI_BLAS_N >= 15 961 multiblasCuda<15,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
963 #if MAX_MULTI_BLAS_N >= 16 965 multiblasCuda<16,multi_caxpyBxpz_,write<0,1,0,1> >(
a,
b,
c,
x,
y,
x,
w);
994 std::vector<ColorSpinorField*> x0(x_.begin(), x_.begin() + x_.size()/2);
999 const Complex *b1 = &b_[x_.size()/2];
1001 std::vector<ColorSpinorField*> x1(x_.begin() + x_.size()/2, x_.end());
void caxpyz(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
void caxpyz_U(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
virtual __device__ __host__ void init()
pre-computation routine before the main loop
multi_axpyBzpcx_(const coeff_array< double > &a, const coeff_array< double > &b, const coeff_array< double > &c, int NYW)
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
char aux_tmp[TuneKey::aux_n]
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
std::complex< double > Complex
multicaxpyz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
multi_caxpyBxpz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
double y0(double) __attribute__((availability(macosx
void caxpy_U(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y)
Compute the block "caxpy_U" with over the set of ColorSpinorFields. E.g., it computes.
int flops()
total number of input and output streams
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
cudaStream_t * getStream()
double y1(double) __attribute__((availability(macosx
static struct quda::blas::@4 blasStrings
scalar< Float2 >::type real
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
void caxpy_recurse(const Complex *a_, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, int i_idx, int j_idx, int upper)
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
scalar< Float2 >::type real
void caxpy_L(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y)
Compute the block "caxpy_L" with over the set of ColorSpinorFields. E.g., it computes.
static signed char * Amatrix_h
void caxpyz_L(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
int flops()
total number of input and output streams
void axpyBzpcx(const double &a, ColorSpinorField &x, ColorSpinorField &y, const double &b, ColorSpinorField &z, const double &c)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)=0
where the reduction is usually computed and any auxiliary operations
void caxpyz_recurse(const Complex *a_, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, int i, int j, int pass, int upper)
multicaxpy_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
int flops()
total number of input and output streams
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
static signed char * Bmatrix_h