for (int32_t i = 0; i < words; i++) {
  reinterpret_cast<int32_t *>(h_reduce)[i] = std::numeric_limits<int32_t>::min();
}

atomic_thread_fence(std::memory_order_release);
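// completeFastReduce(words): the host spin-waits on the mapped reduction buffer.  Each 32-bit word was
// seeded above with INT_MIN (presumably a bit pattern the kernel's result will never produce), so the
// loop below just counts how many words have changed; the acquire fence keeps the host re-reading fresh
// values written by the device.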
volatile int32_t *check = reinterpret_cast<int32_t *>(h_reduce);
int count = 0;
int complete = 0;

while (complete < words) {
  // ensure visibility of any device writes to the mapped buffer
  atomic_thread_fence(std::memory_order_acquire);

  complete = 0;
  for (int32_t i = 0; i < words; i++) {
    if (check[i] != std::numeric_limits<int32_t>::min()) complete++;
  }

  // periodically check whether the kernel hit an error so we do not spin forever
  if (count++ % 10000 == 0) {
    if (cudaSuccess != cudaPeekAtLastError()) break;
  }
}
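// initReduce(): sizing of the reduction buffers.  The buffer must be large enough for both the regular
// reductions (up to max_reduce_blocks partial results, times two for parity) and the multi-reductions;
// max_multi_reduce is presumably derived from QUDA_MAX_MULTI_REDUCE, as suggested by the count[] array
// used by the multi-reduction kernels.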
const int max_reduce_blocks = 2 * deviceProp.multiProcessorCount;

const int max_reduce = 2 * max_reduce_blocks * reduce_size;

size_t bytes = max_reduce > max_multi_reduce ? max_reduce : max_multi_reduce;
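// On 64-bit hosts the reduction buffer can be allocated as host-mapped (zero-copy) memory so the kernel
// writes its final result directly to the host; reduceEnd is presumably the event used to signal that
// the reduction kernel, and hence that write, has completed.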
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
// ... (host-mapped allocation of h_reduce elided in this extract)
#endif

cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
char *fast_reduce_env = getenv("QUDA_ENABLE_FAST_REDUCE");
if (fast_reduce_env && strcmp(fast_reduce_env, "1") == 0) {
  warningQuda("Experimental fast reductions enabled");
  fast_reduce_enabled = true;
}
template <typename doubleN, typename ReduceType, typename FloatN, int M, typename Arg>
doubleN reduceLaunch(Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
{
  const int32_t words = tp.grid.y * sizeof(ReduceType) / sizeof(int32_t);
  using namespace jitify::reflection;
  tunable.jitifyError() = program->kernel("quda::blas::reduceKernel")
                              .instantiate((int)tp.block.x, Type<ReduceType>(), Type<FloatN>(), M, Type<Arg>())
                              /* ... (configuration and launch of the instantiated kernel elided) */;
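  // When JIT compilation is not in use, the same kernel is presumably launched through the LAUNCH_KERNEL
  // macro instead of through jitify.  Once the kernel has been issued, the host obtains the result either
  // via the zero-copy buffer (64-bit builds where the device can map host memory) or, presumably, via an
  // explicit copy back from the device buffer d_reduce.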
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
  // ... (zero-copy completion path elided in this extract)
#endif

  doubleN cpu_sum = set(((ReduceType *)h_reduce)[0]);
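  // The host converts the first partial result to the return type; when the kernel was launched with
  // grid.y == 2 (one grid slice per parity) the second partial result is presumably accumulated into
  // cpu_sum with sum() before it is returned.

// ReduceCuda: the Tunable wrapper that owns the reduction argument struct and drives autotuned launches
// of reduceKernel via reduceLaunch().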
template <typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
class ReduceCuda : public Tunable
{
  // ...
  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
  // advanceSharedBytes(): request the larger of the per-thread and per-block shared-memory requirements
  advanceBlockDim(next);
  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
      sharedBytesPerThread() * nthreads :
      sharedBytesPerBlock(param);
  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer &r,
             ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w,
             ColorSpinorField &v, int length) :
      nParity((x.IsComposite() ? x.CompositeDim() : 1) * (x.SiteSubset())),
      arg(X, Y, Z, W, V, r, length / nParity),
  ::quda::create_jitify_program("kernels/reduce_core.cuh");
  result = reduceLaunch<doubleN, ReduceType, FloatN, M>(arg, tp, stream, *this);
  param.grid.y = nParity; // initTuneParam(): one grid slice per parity

  param.grid.y = nParity; // defaultTuneParam(): likewise
  // bytes(): memory-traffic estimate reported to the autotuner
  return (arg.r.streams() - 2) * x.Bytes() + 2 * z.Bytes();
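// nativeReduce: wraps the fields in device-order accessors, builds the reducer functor from the double2
// coefficients (converted to the register precision), and hands everything to ReduceCuda for launch.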
template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename zType, int M,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer, int writeX, int writeY,
          int writeZ, int writeW, int writeV>
doubleN nativeReduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
                     ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length)
  typedef typename vector<Float, 2>::type Float2;
  typedef vector<Float, 2> vec2;

  Reducer<ReduceType, Float2, RegType> r((Float2)vec2(a), (Float2)vec2(b));
  ReduceCuda<doubleN, ReduceType, RegType, M, decltype(X), decltype(Y), decltype(Z), decltype(W), decltype(V),
             Reducer<ReduceType, Float2, RegType>>
      reduce(value, X, Y, Z, W, V, r, x, y, z, w, v, length);
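// uni_reduce: uniform-precision front end.  It dispatches on the field precision (guarded by the
// QUDA_PRECISION bit mask), on Nspin, and on which Dirac operators were compiled in, then instantiates
// nativeReduce with the matching register/storage types.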
template <typename doubleN, typename ReduceType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer, int writeX, int writeY,
          int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN uni_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
                   ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
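  // Fields resident on the device must be in a native order (see the warning below); host-resident fields
  // are presumably handled by the genericReduce fallback near the end of this function.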
  warningQuda("Device reductions on non-native fields is not supported\n");
#if QUDA_PRECISION & 8
  if (x.Nspin() == 4 || x.Nspin() == 2) { // wilson-like or coarse fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID) || defined(GPU_COVDEV)
    const int M = siteUnroll ? 12 : 1; // determines how much work per thread
    if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
    value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    const int M = siteUnroll ? 3 : 1;
    value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 4
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
    const int M = siteUnroll ? 6 : 1;
    value = nativeReduce<doubleN, ReduceType, float4, float4, float4, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (4 * M));
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else { // staggered (nSpin==1) or coarse (nSpin==2) fields
#if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
    const int M = siteUnroll ? 3 : 1;
    if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
    value = nativeReduce<doubleN, ReduceType, float2, float2, float2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 2
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
    // (definition of M elided; half precision presumably always uses full site unrolling)
    value = nativeReduce<doubleN, ReduceType, float4, short4, short4, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 2) { // coarse fields
#if defined(GPU_MULTIGRID)
    value
      = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
        a, b, x, y, z, w, v, y.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 1
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
    value = nativeReduce<doubleN, ReduceType, float4, char4, char4, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
      a, b, x, y, z, w, v, y.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, float2, char2, char2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
      a, b, x, y, z, w, v, y.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
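  // Fallback path (presumably for host-resident fields): build the reducer at the field's own precision
  // and run the generic, field-order-agnostic reduction on the CPU.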
  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
    Reducer<doubleN, double2, double2> r(a, b);
    value = genericReduce<doubleN, doubleN, double, double, writeX, writeY, writeZ, writeW, writeV,
                          Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
    Reducer<doubleN, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
    value = genericReduce<doubleN, doubleN, float, float, writeX, writeY, writeZ, writeW, writeV,
                          Reducer<doubleN, float2, float2>>(x, y, z, w, v, r);
  }
  const int Nreduce = sizeof(doubleN) / sizeof(double);
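  // The result is treated as an array of Nreduce doubles, presumably so that reduceDoubleArray() can
  // perform the global sum across processes before the value is returned.

// mixed_reduce: variant of uni_reduce for mixed-precision reductions; the dispatch below selects a
// lower-precision storage type for the x/y fields and a higher-precision type for the z field.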
template <typename doubleN, typename ReduceType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer, int writeX, int writeY,
          int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN mixed_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
                     ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
  warningQuda("Device reductions on non-native fields is not supported\n");
#if QUDA_PRECISION & 8
#if QUDA_PRECISION & 4
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
    value = nativeReduce<doubleN, ReduceType, double2, float4, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    const int M = siteUnroll ? 3 : 1;
    value = nativeReduce<doubleN, ReduceType, double2, float2, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 2
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
    value = nativeReduce<doubleN, ReduceType, double2, short4, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, double2, short2, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 1
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
    value = nativeReduce<doubleN, ReduceType, double2, char4, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, double2, char2, double2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, z.Precision());
#endif
#if QUDA_PRECISION & 4
#if QUDA_PRECISION & 2
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
    value = nativeReduce<doubleN, ReduceType, float4, short4, float4, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, float2, short2, float2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#if QUDA_PRECISION & 1
  if (x.Nspin() == 4) { // wilson-like fields
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
    value = nativeReduce<doubleN, ReduceType, float4, char4, float4, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  } else if (x.Nspin() == 1) { // staggered fields
#ifdef GPU_STAGGERED_DIRAC
    value = nativeReduce<doubleN, ReduceType, float2, char2, float2, M, Reducer, writeX, writeY, writeZ,
                         writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
#else
    errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
  }
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
#else
  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
#endif
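  // Fallback path (presumably for host-resident fields): mixed-precision inputs are reduced through the
  // generic CPU path with a double-precision reducer.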
  Reducer<doubleN, double2, double2> r(a, b);
  value = genericReduce<doubleN, doubleN, float, double, writeX, writeY, writeZ, writeW, writeV,
                        Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
  const int Nreduce = sizeof(doubleN) / sizeof(double);
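// Public blas interface functions.  Each routine below maps a user-level call (norm2, reDotProduct,
// cDotProduct, ...) onto uni_reduce or mixed_reduce with the matching reducer functor, write mask and
// coefficients; the comments name the interface function each call implements.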
// norm1
return uni_reduce<double, QudaSumFloat, Norm1, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);

// norm2
return uni_reduce<double, QudaSumFloat, Norm2, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);

// reDotProduct
return uni_reduce<double, QudaSumFloat, Dot, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);

// axpbyzNorm
return uni_reduce<double, QudaSumFloat, axpbyzNorm2, 0, 0, 1, 0, 0, false>(
    make_double2(a, 0.0), make_double2(b, 0.0), x, y, z, x, x);

// axpyReDot
return uni_reduce<double, QudaSumFloat, AxpyReDot, 0, 1, 0, 0, 0, false>(
    make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);

// caxpyNorm
return uni_reduce<double, QudaSumFloat, caxpyNorm2, 0, 1, 0, 0, 0, false>(
    make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);

// caxpyXmazNormX
return uni_reduce<double, QudaSumFloat, caxpyxmaznormx, 1, 1, 0, 0, 0, false>(
    make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);

// cabxpyzAxNorm
return uni_reduce<double, QudaSumFloat, cabxpyzaxnorm, 1, 0, 1, 0, 0, false>(
    make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, z, x, x);

// cDotProduct
double2 cdot = uni_reduce<double2, QudaSumFloat2, Cdot, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
return Complex(cdot.x, cdot.y);

// caxpyDotzy
double2 cdot = uni_reduce<double2, QudaSumFloat2, caxpydotzy, 0, 1, 0, 0, 0, false>(
    make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
return Complex(cdot.x, cdot.y);

// cDotProductNormA
return uni_reduce<double3, QudaSumFloat3, CdotNormA, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
// caxpbypzYmbwcDotProductUYNormY: mixed- vs uniform-precision dispatch
if (x.Precision() != y.Precision()) {
  return mixed_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
      make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
} else {
  return uni_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
      make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
}

// axpyCGNorm: mixed- vs uniform-precision dispatch
if (x.Precision() != y.Precision()) {
  cg_norm = mixed_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
      make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
} else {
  cg_norm = uni_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
      make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
}
return Complex(cg_norm.x, cg_norm.y);
// HeavyQuarkResidualNorm
if (x.Ncolor() != 3) return make_double3(0.0, 0.0, 0.0);
double3 rtn = uni_reduce<double3, QudaSumFloat3, HeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);

// xpyHeavyQuarkResidualNorm
if (x.Ncolor() != 3) return make_double3(0.0, 0.0, 0.0);
double3 rtn = uni_reduce<double3, QudaSumFloat3, xpyHeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
// tripleCGReduction
return uni_reduce<double3, QudaSumFloat3, tripleCGReduction_, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);

// quadrupleCGReduction
return uni_reduce<double4, QudaSumFloat4, quadrupleCGReduction_, 0, 0, 0, 0, 0, false>(
    make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);

// quadrupleCG3InitNorm
return uni_reduce<double, QudaSumFloat, quadrupleCG3InitNorm_, 1, 1, 1, 1, 0, false>(
    make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, w, v);

// quadrupleCG3UpdateNorm
return uni_reduce<double, QudaSumFloat, quadrupleCG3UpdateNorm_, 1, 1, 1, 1, 0, false>(
    make_double2(a, 0.0), make_double2(b, 1.0 - b), x, y, z, w, v);

// doubleCG3InitNorm
return uni_reduce<double, QudaSumFloat, doubleCG3InitNorm_, 1, 1, 0, 0, 0, false>(
    make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, z, z);

// doubleCG3UpdateNorm
return uni_reduce<double, QudaSumFloat, doubleCG3UpdateNorm_, 1, 1, 0, 0, 0, false>(
    make_double2(a, 0.0), make_double2(b, 1.0 - b), x, y, z, z, z);
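// Hedged usage sketch (not part of the original file): how the interface functions above are typically
// reached from solver code.  Only calls whose signatures appear in this file are used; the function name
// reduction_usage_example and the field arguments are placeholders.
inline void reduction_usage_example(quda::ColorSpinorField &x, quda::ColorSpinorField &y, quda::ColorSpinorField &r)
{
  double x2 = quda::blas::norm2(x);                       // ||x||^2
  double xy = quda::blas::reDotProduct(x, y);             // Re <x, y>
  quda::Complex z = quda::blas::cDotProduct(x, y);        // <x, y>
  double3 hq = quda::blas::HeavyQuarkResidualNorm(x, r);  // heavy-quark residual components
  (void)x2; (void)xy; (void)z; (void)hq;
}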