12 #if (__COMPUTE_CAPABILITY__ < 300) 13 #undef MAX_MULTI_BLAS_N 14 #define MAX_MULTI_BLAS_N 2 27 template <
int writeX,
int writeY,
int writeZ,
int writeW>
29 static constexpr
int X = writeX;
30 static constexpr
int Y = writeY;
31 static constexpr
int Z = writeZ;
32 static constexpr
int W = writeW;
35 template <
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
int NXZ,
typename Arg>
42 const int32_t words = tp.
grid.z * NXZ * arg.NYW *
sizeof(ReduceType) /
sizeof(int32_t);
45 #ifdef WARP_MULTI_REDUCE 46 #error "Untested - should be reverified" 50 using namespace jitify::reflection;
51 tunable.
jitifyError() = program->kernel(
"quda::blas::multiReduceKernel")
52 .instantiate((
int)tp.
block.x, Type<ReduceType>(), Type<FloatN>(), M, NXZ, Type<Arg>())
56 #if CUDA_VERSION < 9000 57 cudaMemcpyToSymbolAsync(
arg_buffer, reinterpret_cast<char *>(&arg),
sizeof(arg), 0, cudaMemcpyHostToDevice,
65 #if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__)) 77 cudaMemcpyDeviceToHost);
82 for (
int i = 0; i < NXZ; i++) {
83 for (
int j = 0; j < arg.NYW; j++) {
93 template <
unsigned... digits>
struct to_chars {
94 static const char value[];
97 template <
unsigned... digits>
const char to_chars<digits...>::value[] = {(
'0' + digits)..., 0};
99 template <
unsigned rem,
unsigned... digits>
struct explode : explode<rem / 10, rem % 10, digits...> {
102 template <
unsigned... digits>
struct explode<0, digits...> : to_chars<digits...> {
109 template <
int NXZ,
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
typename SpinorX,
110 typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Reducer>
120 std::vector<ColorSpinorField *> &x, &y, &
z, &w;
131 advanceBlockDim(next);
133 param.
shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
134 sharedBytesPerThread() * nthreads :
135 sharedBytesPerBlock(param);
145 std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
146 std::vector<ColorSpinorField *> &w,
int NYW,
int length) :
148 nParity(x[0]->SiteSubset()),
149 arg(X, Y, Z, W, r, NYW, length / nParity),
160 strcpy(aux,
"policy_kernel,");
161 strcat(aux, x[0]->AuxString());
165 bool is_norm =
false;
168 for (
int i = 0; i < NXZ; i++) {
169 if (x[i]->
V() != y[i]->
V() || x[i]->
V() != z[i]->
V() || x[i]->
V() != w[i]->
V()) {
175 if (is_norm) strcat(aux,
",norm");
178 ::quda::create_jitify_program(
"kernels/multi_reduce_core.cuh");
186 strcat(name, std::to_string(NYW).c_str());
187 strcat(name,
typeid(arg.
r).name());
188 return TuneKey(x[0]->VolString(), name, aux);
194 multiReduceLaunch<doubleN, ReduceType, FloatN, M, NXZ>(result,
arg, tp,
stream, *
this);
198 #ifdef WARP_MULTI_REDUCE 208 if (param.
block.y < NYW) {
223 if (NYW >
deviceProp.maxGridSize[1])
errorQuda(
"N=%d is greater than the maximum support grid size", NYW);
232 param.
grid.z = nParity;
240 param.
grid.z = nParity;
245 for (
int i = 0; i < NYW; ++i) {
246 arg.
Y[i].backup(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
247 arg.
W[i].backup(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
253 for (
int i = 0; i < NYW; ++i) {
254 arg.
Y[i].restore(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
255 arg.
W[i].restore(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
267 return NYW * NXZ * arg.
r.streams() * x[0]->Bytes();
273 template <
typename doubleN,
typename ReduceType,
typename RegType,
typename StoreType,
typename yType,
int M,
int NXZ,
274 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
typename write,
typename T>
276 std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
277 std::vector<ColorSpinorField *> &w,
int length)
280 const int NYW = y.size();
282 memset(result, 0, NXZ * NYW *
sizeof(doubleN));
284 const int N_MAX = NXZ > NYW ? NXZ : NYW;
285 const int N_MIN = NXZ < NYW ? NXZ : NYW;
288 "MAX_MULTI_BLAS_N^2 exceeds maximum number of reductions");
289 static_assert(
MAX_MULTI_BLAS_N <= 16,
"MAX_MULTI_BLAS_N exceeds maximum size 16");
296 for (
int i = 0; i < N_MIN; i++) {
300 if (!x[i]->isNative()) {
301 warningQuda(
"Reductions on non-native fields are not supported\n");
307 typedef typename vector<Float, 2>::type Float2;
308 typedef vector<Float, 2> vec2;
313 errorQuda(
"Constant memory buffer support not enabled with jitify yet");
322 for (
int i = 0; i < NXZ; i++)
323 for (
int j = 0; j < NYW; j++) A[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(
Complex(a.
data[NYW * i + j]));
326 Amatrix_h =
reinterpret_cast<signed char *
>(
const_cast<T *
>(a.
data));
333 for (
int i = 0; i < NXZ; i++)
334 for (
int j = 0; j < NYW; j++) B[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(
Complex(b.
data[NYW * i + j]));
337 Bmatrix_h =
reinterpret_cast<signed char *
>(
const_cast<T *
>(b.
data));
344 for (
int i = 0; i < NXZ; i++)
345 for (
int j = 0; j < NYW; j++) C[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(
Complex(c.
data[NYW * i + j]));
348 Cmatrix_h =
reinterpret_cast<signed char *
>(
const_cast<T *
>(c.
data));
356 for (
int i = 0; i < NXZ; i++) {
357 X[i].
set(*dynamic_cast<cudaColorSpinorField *>(x[i]));
358 Z[i].
set(*dynamic_cast<cudaColorSpinorField *>(z[i]));
360 for (
int i = 0; i < NYW; i++) {
361 Y[i].
set(*dynamic_cast<cudaColorSpinorField *>(y[i]));
362 W[i].
set(*dynamic_cast<cudaColorSpinorField *>(w[i]));
365 Reducer<NXZ, ReduceType, Float2, RegType> r(a, b, c, NYW);
370 reduce(result, X, Y, Z, W, r, x, y, z, w, NYW, length);
382 template <
int NXZ,
typename doubleN,
typename ReduceType,
383 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
typename write,
384 bool siteUnroll,
typename T>
389 const int NYW = y.size();
391 int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
397 #if QUDA_PRECISION & 8 398 if (x[0]->
Nspin() == 4 || x[0]->
Nspin() == 2) {
399 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID) 400 const int M = siteUnroll ? 12 : 1;
401 if (x[0]->
Nspin() == 2 && siteUnroll)
errorQuda(
"siteUnroll not supported for nSpin==2");
402 multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
403 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
405 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
407 }
else if (x[0]->
Nspin() == 1) {
408 #ifdef GPU_STAGGERED_DIRAC 409 const int M = siteUnroll ? 3 : 1;
410 multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
411 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
413 errorQuda(
"blas has not been built for Nspin=%d field", x[0]->
Nspin());
419 errorQuda(
"QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
424 #if QUDA_PRECISION & 4 425 if (x[0]->
Nspin() == 4) {
426 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 427 const int M = siteUnroll ? 6 : 1;
428 multiReduce<doubleN, ReduceType, float4, float4, float4, M, NXZ, Reducer, write>(
429 result, a, b, c, x, y, z, w, reduce_length / (4 * M));
431 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
433 }
else if (x[0]->
Nspin() == 1 || x[0]->
Nspin() == 2) {
434 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID) 435 const int M = siteUnroll ? 3 : 1;
436 if (x[0]->
Nspin() == 2 && siteUnroll)
errorQuda(
"siteUnroll not supported for nSpin==2");
437 multiReduce<doubleN, ReduceType, float2, float2, float2, M, NXZ, Reducer, write>(
438 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
440 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
446 errorQuda(
"QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
451 #if QUDA_PRECISION & 2 452 if (x[0]->
Nspin() == 4) {
453 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 455 multiReduce<doubleN, ReduceType, float4, short4, short4, M, NXZ, Reducer, write>(
456 result, a, b, c, x, y, z, w, x[0]->Volume());
458 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
460 }
else if (x[0]->
Nspin() == 1) {
461 #ifdef GPU_STAGGERED_DIRAC 463 multiReduce<doubleN, ReduceType, float2, short2, short2, M, NXZ, Reducer, write>(
464 result, a, b, c, x, y, z, w, x[0]->Volume());
466 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
472 errorQuda(
"QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
477 #if QUDA_PRECISION & 1 478 if (x[0]->
Nspin() == 4) {
479 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 481 multiReduce<doubleN, ReduceType, float4, char4, char4, M, NXZ, Reducer, write>(
482 result, a, b, c, x, y, z, w, x[0]->Volume());
484 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
486 }
else if (x[0]->
Nspin() == 1) {
487 #ifdef GPU_STAGGERED_DIRAC 489 multiReduce<doubleN, ReduceType, float2, char2, char2, M, NXZ, Reducer, write>(
490 result, a, b, c, x, y, z, w, x[0]->Volume());
492 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
498 errorQuda(
"QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
501 errorQuda(
"Precision %d not supported\n", precision);
508 template <
int NXZ,
typename doubleN,
typename ReduceType,
509 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
typename write,
510 bool siteUnroll,
typename T>
515 const int NYW = y.size();
520 assert(siteUnroll ==
true);
521 int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
525 if (x[0]->
Nspin() == 4) {
526 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 528 multiReduce<doubleN, ReduceType, double2, float4, double2, M, NXZ, Reducer, write>(
529 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
531 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
533 }
else if (x[0]->
Nspin() == 1) {
534 #ifdef GPU_STAGGERED_DIRAC 536 multiReduce<doubleN, ReduceType, double2, float2, double2, M, NXZ, Reducer, write>(
537 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
539 errorQuda(
"blas has not been built for Nspin=%d field", x[0]->
Nspin());
547 if (x[0]->
Nspin() == 4) {
548 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 550 multiReduce<doubleN, ReduceType, double2, short4, double2, M, NXZ, Reducer, write>(
551 result, a, b, c, x, y, z, w, reduce_length / (4 * M));
553 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
555 }
else if (x[0]->
Nspin() == 1 || x[0]->
Nspin() == 2) {
556 #if defined(GPU_STAGGERED_DIRAC) 558 multiReduce<doubleN, ReduceType, double2, short2, double2, M, NXZ, Reducer, write>(
559 result, a, b, c, x, y, z, w, reduce_length / (2 * M));
561 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
569 if (x[0]->
Nspin() == 4) {
570 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 572 multiReduce<doubleN, ReduceType, float4, short4, float4, M, NXZ, Reducer, write>(
573 result, a, b, c, x, y, z, w, x[0]->Volume());
575 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
577 }
else if (x[0]->
Nspin() == 1) {
578 #ifdef GPU_STAGGERED_DIRAC 580 multiReduce<doubleN, ReduceType, float2, short2, float2, M, NXZ, Reducer, write>(
581 result, a, b, c, x, y, z, w, x[0]->Volume());
583 errorQuda(
"blas has not been built for Nspin=%d fields", x[0]->
Nspin());
590 errorQuda(
"Precision combination x=%d y=%d not supported\n", x[0]->Precision(), y[0]->Precision());
594 template <
int NXZ,
typename doubleN,
typename ReduceType,
595 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
typename writeDiagonal,
596 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
597 typename writeOffDiagonal,
bool siteUnroll,
typename T>
603 if (x[0]->Precision() == y[0]->Precision()) {
605 multiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, siteUnroll, T>(
606 result, a, b, c, x, y, z, w);
608 multiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, siteUnroll, T>(
609 result, a, b, c, x, y, z, w);
613 mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, true, T>(
614 result, a, b, c, x, y, z, w);
616 mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, true, T>(
617 result, a, b, c, x, y, z, w);
622 void reDotProduct(
double* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
628 multiReduce<1, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
629 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
632 multiReduce<2, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
633 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
636 multiReduce<3, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
637 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
640 multiReduce<4, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
641 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
644 multiReduce<5, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
645 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
648 multiReduce<6, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
649 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
652 multiReduce<7, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
653 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
656 multiReduce<8, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
657 result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
697 const int Nreduce = x.size()*y.size();
704 template <
template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
typename writeDiagonal,
705 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
typename writeOffDiagonal>
707 std::vector<ColorSpinorField*>&z, std::vector<ColorSpinorField*>&w,
int i_idx,
int j_idx,
bool hermitian,
unsigned int tile_size) {
709 if (y.size() > tile_size)
713 Complex* result1 = &result[x.size()*(y.size()/2)];
714 std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
715 std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
716 std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
717 std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
718 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x, y0, z, w0, i_idx, 2*j_idx+0, hermitian, tile_size);
719 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x, y1, z, w1, i_idx, 2*j_idx+1, hermitian, tile_size);
723 double2* cdot =
new double2[x.size()*y.size()];
726 if (x.size() <= tile_size && hermitian) {
727 if (j_idx < i_idx) {
return; }
732 if (x.size() <= tile_size) {
735 multiReduce<1, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
736 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
738 #if MAX_MULTI_BLAS_N >= 2 740 multiReduce<2, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
741 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
743 #if MAX_MULTI_BLAS_N >= 3 745 multiReduce<3, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
746 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
748 #if MAX_MULTI_BLAS_N >= 4 750 multiReduce<4, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
751 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
753 #if MAX_MULTI_BLAS_N >= 5 755 multiReduce<5, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
756 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
758 #if MAX_MULTI_BLAS_N >= 6 760 multiReduce<6, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
761 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
763 #if MAX_MULTI_BLAS_N >= 7 765 multiReduce<7, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
766 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
768 #if MAX_MULTI_BLAS_N >= 8 770 multiReduce<8, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
771 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
773 #if MAX_MULTI_BLAS_N >= 9 775 multiReduce<9, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
776 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
778 #if MAX_MULTI_BLAS_N >= 10 780 multiReduce<10, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
781 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
783 #if MAX_MULTI_BLAS_N >= 11 785 multiReduce<11, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
786 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
788 #if MAX_MULTI_BLAS_N >= 12 790 multiReduce<12, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
791 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
793 #if MAX_MULTI_BLAS_N >= 13 795 multiReduce<13, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
796 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
798 #if MAX_MULTI_BLAS_N >= 14 800 multiReduce<14, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
801 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
803 #if MAX_MULTI_BLAS_N >= 15 805 multiReduce<15, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
806 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
808 #if MAX_MULTI_BLAS_N >= 16 810 multiReduce<16, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
811 cdot, a, b, c, x, y, z, w, i_idx, j_idx);
835 Complex* result0 = &tmpmajor[0];
836 Complex* result1 = &tmpmajor[(x.size()/2)*y.size()];
837 std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
838 std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
839 std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
840 std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
842 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x0, y, z0, w, 2*i_idx+0, j_idx, hermitian, tile_size);
843 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x1, y, z1, w, 2*i_idx+1, j_idx, hermitian, tile_size);
845 const unsigned int xlen0 = x.size()/2;
846 const unsigned int xlen1 = x.size() - xlen0;
847 const unsigned int ylen = y.size();
850 int count = 0, count0 = 0, count1 = 0;
851 for (
unsigned int i = 0; i < ylen; i++)
853 for (
unsigned int j = 0; j < xlen0; j++)
854 result[count++] = result0[count0++];
855 for (
unsigned int j = 0; j < xlen1; j++)
856 result[count++] = result1[count1++];
863 if (x.size() <= tile_size)
865 const unsigned int xlen = x.size();
866 const unsigned int ylen = y.size();
867 for (
unsigned int j = 0; j < xlen; j++)
868 for (
unsigned int i = 0; i < ylen; i++)
869 result[i*xlen+j] =
Complex(cdot[j*ylen + i].x, cdot[j*ylen+i].y);
876 template <
template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
877 typename writeDiagonal,
878 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
879 typename writeOffDiagonal>
881 typedef std::vector<ColorSpinorField*>
vec;
894 : result(result), x(x), y(y), z(z), w(w), hermitian(hermitian), Anorm(Anorm), max_tile_size(1)
896 strcpy(aux,
"policy,");
897 strcat(aux, x[0]->AuxString());
899 strcat(aux, y[0]->AuxString());
900 if (hermitian) strcat(aux,
",hermitian");
901 if (Anorm) strcat(aux,
",Anorm");
910 strcat(aux,
",multi-blas-n=");
919 if (x.size() == 1 || y.size() == 1) {
921 max_tile_size = std::min(
MAX_MULTI_BLAS_N, (
int)std::max(x.size(), y.size()));
924 for (
unsigned int tile_size=1; tile_size <= max_tile_size; tile_size++) {
925 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
926 (result, x, y, z, w, 0, 0, hermitian, tile_size);
934 unsigned int max_count = 0;
936 while (tile_size_tmp != 1) { tile_size_tmp = tile_size_tmp >> 1; max_count++; }
938 for (
unsigned int i = 0; i < max_count; i++) { tile_size_tmp = tile_size_tmp << 1; }
939 max_tile_size = tile_size_tmp;
942 for (
unsigned int tile_size=1; tile_size <= max_tile_size && tile_size <= x.size() &&
943 (tile_size <= y.size() || y.size()==1) ; tile_size*=2) {
944 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
945 (result, x, y, z, w, 0, 0, hermitian, tile_size);
951 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
970 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
971 (result, x, y, z, w, 0, 0, hermitian, tp.
aux.x);
977 if ( x.size()==1 || y.size()==1 ) {
980 if ( (
unsigned int)param.
aux.x <= max_tile_size ) {
989 if ( (
unsigned int)(2*param.
aux.x) <= max_tile_size &&
990 (
unsigned int)(2*param.
aux.x) <= x.size() &&
991 (
unsigned int)(2*param.
aux.x) <= y.size() ) {
1011 param.
aux.x = 1; param.
aux.y = 0; param.
aux.z = 0; param.
aux.w = 0;
1017 param.
aux.x = max_tile_size; param.
aux.y = 0; param.
aux.z = 0; param.
aux.w = 0;
1021 return TuneKey(x[0]->VolString(),
typeid(*this).name(), aux);
1032 if (x.size() == 0 || y.size() == 0)
errorQuda(
"vector.size() == 0");
1034 for (
unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1039 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp, x, y, x, y,
false);
1043 const int Nreduce = 2*x.size()*y.size();
1047 const unsigned int xlen = x.size();
1048 const unsigned int ylen = y.size();
1049 for (
unsigned int j = 0; j < xlen; j++)
1050 for (
unsigned int i = 0; i < ylen; i++)
1051 result[j*ylen+i] = result_tmp[i*xlen + j];
1053 delete[] result_tmp;
1057 if (x.size() == 0 || y.size() == 0)
errorQuda(
"vector.size() == 0");
1058 if (x.size() != y.size())
errorQuda(
"Cannot call Hermitian block dot product on non-square inputs");
1061 for (
unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1063 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp, x, y, x, y,
true,
false);
1067 const int Nreduce = 2*x.size()*y.size();
1071 const unsigned int xlen = x.size();
1072 const unsigned int ylen = y.size();
1073 for (
unsigned int j = 0; j < xlen; j++)
1074 for (
unsigned int i = j; i < ylen; i++) {
1075 result[j*ylen+i] = result_tmp[i*xlen + j];
1076 result[i*ylen+j] =
conj(result_tmp[i*xlen + j]);
1079 delete[] result_tmp;
1084 if (x.size() == 0 || y.size() == 0)
errorQuda(
"vector.size() == 0");
1085 if (x.size() != y.size())
errorQuda(
"Cannot call Hermitian block A-norm dot product on non-square inputs");
1088 for (
unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1090 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp, x, y, x, y,
true,
true);
1094 const int Nreduce = 2*x.size()*y.size();
1098 const unsigned int xlen = x.size();
1099 const unsigned int ylen = y.size();
1100 for (
unsigned int j = 0; j < xlen; j++)
1101 for (
unsigned int i = j; i < ylen; i++) {
1102 result[j*ylen+i] = result_tmp[i*xlen + j];
1103 result[i*ylen+j] =
conj(result_tmp[i*xlen + j]);
1106 delete[] result_tmp;
1111 std::vector<ColorSpinorField*>&z){
1114 if (x.size() == 0 || y.size() == 0)
errorQuda(
"vector.size() == 0");
1115 if (y.size() != z.size())
errorQuda(
"Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());
1118 for (
unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1121 TileSizeTune<CdotCopy,write<0,0,0,1>,
Cdot,
write<0,0,0,0> > tile(result_tmp, x, y, x, y,
true);
1125 const int Nreduce = 2*x.size()*y.size();
1129 const unsigned int xlen = x.size();
1130 const unsigned int ylen = y.size();
1131 for (
unsigned int j = 0; j < xlen; j++)
1132 for (
unsigned int i = 0; i < ylen; i++)
1133 result[j*ylen+i] = result_tmp[i*xlen + j];
1135 delete[] result_tmp;
1137 errorQuda(
"cDotProductCopy not enabled");
MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int NYW, int length)
#define qudaMemcpy(dst, src, count, kind)
CUresult jitifyError() const
void multiReduceLaunch(doubleN result[], Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
enum QudaPrecision_s QudaPrecision
void * getHostReduceBuffer()
bool commAsyncReduction()
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaDeviceProp deviceProp
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
void disableProfileCount()
Disable the profile kernel counting.
QudaVerbosity getVerbosity()
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
#define checkPrecision(...)
Helper file when using jitify run-time compilation. This file should be included in source code...
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
unsigned int max_tile_size
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void cDotProductCopy(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b, std::vector< ColorSpinorField *> &c)
Computes the matrix of inner products between the vector set a and the vector set b...
#define QUDA_MAX_MULTI_REDUCE
Maximum number of simultaneous reductions that can take place. This number may be increased if needed...
void reduceDoubleArray(double *, const int len)
void apply(const cudaStream_t &stream)
void set(const cudaColorSpinorField &x)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void mixedMultiReduce(doubleN result[], const coeff_array< T > &a, const coeff_array< T > &b, const coeff_array< T > &c, CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z, CompositeColorSpinorField &w)
void multiReduce(doubleN result[], const coeff_array< T > &a, const coeff_array< T > &b, const coeff_array< T > &c, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int length)
void * getMappedHostReduceBuffer()
std::vector< ColorSpinorField * > & z
void multiReduce_recurse(Complex *result, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size)
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
__host__ __device__ void sum(double &a, double &b)
virtual bool advanceGridDim(TuneParam ¶m) const
void initTuneParam(TuneParam ¶m) const
void defaultTuneParam(TuneParam ¶m) const
void enableProfileCount()
Enable the profile kernel counting.
void completeFastReduce(int32_t words)
SpinorW W[MAX_MULTI_BLAS_N]
cudaStream_t * getStream()
void initFastReduce(int words)
std::vector< ColorSpinorField * > CompositeColorSpinorField
void initTuneParam(TuneParam ¶m) const
SpinorY Y[MAX_MULTI_BLAS_N]
unsigned int sharedBytesPerThread() const
bool advanceTuneParam(TuneParam ¶m) const
std::vector< ColorSpinorField * > vec
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static signed char * Bmatrix_h
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
void hDotProduct_Anorm(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
std::complex< double > Complex
cudaEvent_t * getReduceEvent()
void defaultTuneParam(TuneParam ¶m) const
Parameter struct for generic multi-blas kernel.
MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
unsigned int maxBlockSize(const TuneParam ¶m) const
void * memset(void *s, int c, size_t n)
void apply(const cudaStream_t &stream)
void set(const cudaColorSpinorField &x, int nFace=1)
TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm=false)
void hDotProduct(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE]
bool advanceAux(TuneParam ¶m) const
void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
virtual bool advanceSharedBytes(TuneParam ¶m) const
__global__ void multiReduceKernel(Arg arg_)
virtual void initTuneParam(TuneParam ¶m) const
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void u64toa(char *buffer, uint64_t value)
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
static signed char * Amatrix_h
bool advanceGridDim(TuneParam ¶m) const
unsigned int sharedBytesPerThread() const
static signed char * Cmatrix_h
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
virtual void defaultTuneParam(TuneParam ¶m) const