QUDA  1.0.0
reduce_quda.cu
1 #include <atomic>
2 #include <blas_quda.h>
3 #include <tune_quda.h>
4 #include <float_vector.h>
6 
7 #include <launch_kernel.cuh>
8 #include <jitify_helper.cuh>
10 
11 // These are used for reduction kernels
12 static QudaSumFloat *d_reduce;
13 static QudaSumFloat *h_reduce;
14 static QudaSumFloat *hd_reduce;
15 static cudaEvent_t reduceEnd;
16 static bool fast_reduce_enabled = false;
17 
18 namespace quda {
19 
20  namespace blas {
21 
22 #include <generic_reduce.cuh>
23 
24  cudaStream_t* getStream();
25 
26  void* getDeviceReduceBuffer() { return d_reduce; }
27  void* getMappedHostReduceBuffer() { return hd_reduce; }
28  void* getHostReduceBuffer() { return h_reduce; }
29  cudaEvent_t* getReduceEvent() { return &reduceEnd; }
30  bool getFastReduce() { return fast_reduce_enabled; }
31 
32  void initFastReduce(int32_t words)
33  {
34  // initialize the reduction values in 32-bit increments to INT_MIN
35  for (int32_t i = 0; i < words; i++) {
36  reinterpret_cast<int32_t *>(h_reduce)[i] = std::numeric_limits<int32_t>::min();
37  }
38 
39  // ensure that the host memory write is complete before we launch the kernel
40  atomic_thread_fence(std::memory_order_release);
41  }
42 
43  void completeFastReduce(int32_t words)
44  {
45  volatile int32_t *check = reinterpret_cast<int32_t *>(h_reduce);
46  int count = 0;
47  int complete = 0;
48  while (complete < words) {
49  // ensure visibility of any changes in memory
50  atomic_thread_fence(std::memory_order_acquire);
51 
52  complete = 0;
53  for (int32_t i = 0; i < words; i++) {
54  // spin-wait until all values have been updated
55  if (check[i] != std::numeric_limits<int32_t>::min()) complete++;
56  }
57  if (count++ % 10000 == 0) { // check error every 10000 iterations
58  // if there is an error in the kernel then we need to exit the spin-wait
59  if (cudaSuccess != cudaPeekAtLastError()) break;
60  }
61  }
62  }
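The two helpers above form a host/device handshake: initFastReduce() seeds every 32-bit word of the mapped reduction buffer with the sentinel INT_MIN and issues a release fence, and completeFastReduce() spin-waits until the kernel has overwritten every word, breaking out if a launch error is detected so the poll cannot hang. A minimal self-contained sketch of the same pattern (illustrative only, not QUDA code; the kernel, buffer name and sizes here are invented):

#include <cuda_runtime.h>
#include <atomic>
#include <climits>
#include <cstdint>
#include <cstdio>

__global__ void fill(int32_t *out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = i + 1; // write anything other than the sentinel
}

int main()
{
  const int n = 256;
  int32_t *h_buf = nullptr, *d_buf = nullptr;
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaHostAlloc((void **)&h_buf, n * sizeof(int32_t), cudaHostAllocMapped);
  cudaHostGetDevicePointer((void **)&d_buf, h_buf, 0);

  for (int i = 0; i < n; i++) h_buf[i] = INT_MIN;       // seed the sentinel
  std::atomic_thread_fence(std::memory_order_release);  // publish before launching

  fill<<<(n + 127) / 128, 128>>>(d_buf, n);

  volatile int32_t *check = h_buf;
  int complete = 0, count = 0;
  while (complete < n) {                                 // spin until every word is updated
    std::atomic_thread_fence(std::memory_order_acquire);
    complete = 0;
    for (int i = 0; i < n; i++)
      if (check[i] != INT_MIN) complete++;
    if (count++ % 10000 == 0 && cudaPeekAtLastError() != cudaSuccess) break;
  }
  printf("completed %d of %d words\n", complete, n);
  cudaFreeHost(h_buf);
  return 0;
}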
63 
64  void initReduce()
65  {
66  /* we have these different reductions to cater for:
67 
68  - regular reductions (reduce_quda.cu) where we are reducing to a
69  single vector type (max length 4 presently), with a possible
70  parity dimension, and a grid-stride loop with max number of
71  blocks = 2 x SM count
72 
73  - multi-reductions where we are reducing to a matrix of size
74  MAX_MULTI_BLAS_N^2 of vectors (max length 4), with a
75  possible parity dimension, and a grid-stride loop with
76  maximum number of blocks = 2 x SM count
77  */
78 
79  const int reduce_size = 4 * sizeof(QudaSumFloat);
80  const int max_reduce_blocks = 2*deviceProp.multiProcessorCount;
81 
82  const int max_reduce = 2 * max_reduce_blocks * reduce_size;
83  const int max_multi_reduce = 2 * MAX_MULTI_BLAS_N * MAX_MULTI_BLAS_N * max_reduce_blocks * reduce_size;
84 
85  // reduction buffer size
86  size_t bytes = max_reduce > max_multi_reduce ? max_reduce : max_multi_reduce;
87 
88  if (!d_reduce) d_reduce = (QudaSumFloat *) device_malloc(bytes);
89 
90  // these arrays are actually oversized currently (they only need to be QudaSumFloat3)
91 
92  // if the device supports host-mapped memory then use a host-mapped array for the reduction
93  if (!h_reduce) {
94  // only use zero copy reductions when using 64-bit
95 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
96  if(deviceProp.canMapHostMemory) {
97  h_reduce = (QudaSumFloat *) mapped_malloc(bytes);
98  cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
99  } else
100 #endif
101  {
102  h_reduce = (QudaSumFloat *) pinned_malloc(bytes);
103  hd_reduce = d_reduce;
104  }
105  memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report h_reduce is uninitialised
106  }
107 
108  cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
109 
110  // enable fast reductions with CPU spin waiting as opposed to using CUDA events
111  char *fast_reduce_env = getenv("QUDA_ENABLE_FAST_REDUCE");
112  if (fast_reduce_env && strcmp(fast_reduce_env,"1") == 0) {
113  warningQuda("Experimental fast reductions enabled");
114  fast_reduce_enabled = true;
115  }
116 
117  checkCudaError();
118  }
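To make the sizing above concrete, here is the same arithmetic with assumed numbers (QudaSumFloat taken to be double, MAX_MULTI_BLAS_N taken to be 4, and an 80-SM device; the actual values depend on the build options and hardware):

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t sum_float = sizeof(double); // assuming QudaSumFloat == double
  const int max_multi_blas_n  = 4;              // assumed MAX_MULTI_BLAS_N
  const int sm_count          = 80;             // assumed multiProcessorCount

  const std::size_t reduce_size       = 4 * sum_float; // one length-4 vector per block
  const int         max_reduce_blocks = 2 * sm_count;

  const std::size_t max_reduce       = 2 * max_reduce_blocks * reduce_size;
  const std::size_t max_multi_reduce = 2 * max_multi_blas_n * max_multi_blas_n
                                         * max_reduce_blocks * reduce_size;
  const std::size_t bytes = max_reduce > max_multi_reduce ? max_reduce : max_multi_reduce;

  // prints: reduce 10240 B, multi-reduce 163840 B, allocated 163840 B
  printf("reduce %zu B, multi-reduce %zu B, allocated %zu B\n", max_reduce, max_multi_reduce, bytes);
  return 0;
}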
119 
120  void endReduce(void)
121  {
122  if (d_reduce) {
123  device_free(d_reduce);
124  d_reduce = 0;
125  }
126  if (h_reduce) {
127  host_free(h_reduce);
128  h_reduce = 0;
129  }
130  hd_reduce = 0;
131 
132  cudaEventDestroy(reduceEnd);
133  }
134 
138  template <typename doubleN, typename ReduceType, typename FloatN, int M, typename Arg>
139  doubleN reduceLaunch(Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
140  {
141  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
142  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
143 
144  const int32_t words = tp.grid.y * sizeof(ReduceType) / sizeof(int32_t);
145  if (getFastReduce() && !commAsyncReduction()) initFastReduce(words);
146 
147 #ifdef JITIFY
148  using namespace jitify::reflection;
149  tunable.jitifyError() = program->kernel("quda::blas::reduceKernel")
150  .instantiate((int)tp.block.x, Type<ReduceType>(), Type<FloatN>(), M, Type<Arg>())
151  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
152  .launch(arg);
153 #else
154  LAUNCH_KERNEL(reduceKernel, tp, stream, arg, ReduceType, FloatN, M);
155 #endif
156 
157  if (!commAsyncReduction()) {
158 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
159  if (deviceProp.canMapHostMemory) {
160  if (getFastReduce()) {
161  completeFastReduce(words);
162  } else {
163  qudaEventRecord(reduceEnd, stream);
164  while (cudaSuccess != qudaEventQuery(reduceEnd)) { ; }
165  }
166  } else
167 #endif
168  {
169  qudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost);
170  }
171  }
172  doubleN cpu_sum = set(((ReduceType *)h_reduce)[0]);
173  if (tp.grid.y == 2) sum(cpu_sum, ((ReduceType *)h_reduce)[1]); // add other parity if needed
174  return cpu_sum;
175  }
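For the fast-reduce path above, words is just the per-launch partial-result buffer measured in 32-bit units: e.g. a double3 reduction run over both parities has tp.grid.y = 2, giving 2 * 24 / 4 = 12 sentinel words to poll. A trivial check of that arithmetic (using a stand-in for the CUDA double3 type and an assumed parity count):

#include <cstdint>
#include <cstdio>

struct Double3 { double x, y, z; }; // stand-in for the double3 reduction type

int main()
{
  const int parities = 2; // tp.grid.y
  const int words = int(parities * sizeof(Double3) / sizeof(int32_t));
  printf("words = %d\n", words); // prints 12
  return 0;
}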
176 
177  template <typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY,
178  typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
179  class ReduceCuda : public Tunable
180  {
181 
182  private:
183  const int nParity; // for composite fields this includes the number of composites
184  ReductionArg<ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer> arg;
185  doubleN &result;
186 
187  const ColorSpinorField &x, &y, &z, &w, &v;
188 
189  // host pointers used for backing up fields when tuning
190  // these can't be curried into the Spinors because of Tesla argument length restriction
191  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
192  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
193 
194  unsigned int sharedBytesPerThread() const { return 0; }
195  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
196 
197  virtual bool advanceSharedBytes(TuneParam &param) const
198  {
199  TuneParam next(param);
200  advanceBlockDim(next); // to get next blockDim
201  int nthreads = next.block.x * next.block.y * next.block.z;
202  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
203  sharedBytesPerThread() * nthreads :
204  sharedBytesPerBlock(param);
205  return false;
206  }
207 
208  public:
209  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer &r,
210  ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v,
211  int length) :
212  nParity((x.IsComposite() ? x.CompositeDim() : 1) * (x.SiteSubset())),
213  arg(X, Y, Z, W, V, r, length / nParity),
214  x(x),
215  y(y),
216  z(z),
217  w(w),
218  v(v),
219  result(result),
220  X_h(0),
221  Y_h(0),
222  Z_h(0),
223  W_h(0),
224  V_h(0),
225  Xnorm_h(0),
226  Ynorm_h(0),
227  Znorm_h(0),
228  Wnorm_h(0),
229  Vnorm_h(0)
230  {
231  strcpy(aux, x.AuxString());
232  if (x.Precision() != z.Precision()) {
233  strcat(aux, ",");
234  strcat(aux, z.AuxString());
235  }
236  if (getFastReduce()) strcat(aux, ",fast_reduce");
237 
238 #ifdef JITIFY
239  ::quda::create_jitify_program("kernels/reduce_core.cuh");
240 #endif
241  }
242  virtual ~ReduceCuda() {}
243 
244  inline TuneKey tuneKey() const { return TuneKey(x.VolString(), typeid(arg.r).name(), aux); }
245 
246  void apply(const cudaStream_t &stream)
247  {
248  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
249  result = reduceLaunch<doubleN, ReduceType, FloatN, M>(arg, tp, stream, *this);
250  }
251 
252  void preTune()
253  {
254  arg.X.backup(&X_h, &Xnorm_h, x.Bytes(), x.NormBytes());
255  arg.Y.backup(&Y_h, &Ynorm_h, y.Bytes(), y.NormBytes());
256  arg.Z.backup(&Z_h, &Znorm_h, z.Bytes(), z.NormBytes());
257  arg.W.backup(&W_h, &Wnorm_h, w.Bytes(), w.NormBytes());
258  arg.V.backup(&V_h, &Vnorm_h, v.Bytes(), v.NormBytes());
259  }
260 
261  void postTune()
262  {
263  arg.X.restore(&X_h, &Xnorm_h, x.Bytes(), x.NormBytes());
264  arg.Y.restore(&Y_h, &Ynorm_h, y.Bytes(), y.NormBytes());
265  arg.Z.restore(&Z_h, &Znorm_h, z.Bytes(), z.NormBytes());
266  arg.W.restore(&W_h, &Wnorm_h, w.Bytes(), w.NormBytes());
267  arg.V.restore(&V_h, &Vnorm_h, v.Bytes(), v.NormBytes());
268  }
269 
270  void initTuneParam(TuneParam &param) const
271  {
272  Tunable::initTuneParam(param);
273  param.grid.y = nParity;
274  }
275 
276  void defaultTuneParam(TuneParam &param) const
277  {
278  Tunable::defaultTuneParam(param);
279  param.grid.y = nParity;
280  }
281 
282  long long flops() const { return arg.r.flops() * vec_length<FloatN>::value * arg.length * nParity * M; }
283 
284  long long bytes() const
285  {
286  // the factor two here assumes we are reading and writing to the high precision vector
287  // this will evaluate correctly for non-mixed kernels since the +2/-2 will cancel out
288  return (arg.r.streams() - 2) * x.Bytes() + 2 * z.Bytes();
289  }
290 
291  int tuningIter() const { return 3; }
292  };
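The preTune()/postTune() hooks above exist because autotuning launches the kernel many times, and reductions with non-zero write flags modify their input fields; saving the device data to host buffers before the trial launches and restoring it afterwards keeps tuning side-effect free. A generic sketch of that backup/restore idea (not the QUDA Spinor API; the Field struct here is invented for illustration):

#include <cuda_runtime.h>
#include <cstdlib>

// Minimal stand-in for a device-resident field whose contents must survive tuning trials.
struct Field {
  void *dev = nullptr;   // device allocation
  size_t bytes = 0;
  char *mirror = nullptr;

  void backup()
  {
    mirror = static_cast<char *>(malloc(bytes));
    cudaMemcpy(mirror, dev, bytes, cudaMemcpyDeviceToHost);
  }
  void restore()
  {
    cudaMemcpy(dev, mirror, bytes, cudaMemcpyHostToDevice);
    free(mirror);
    mirror = nullptr;
  }
};

int main()
{
  Field f;
  f.bytes = 1024;
  cudaMalloc(&f.dev, f.bytes);
  f.backup();
  // ... launch trial kernels that may overwrite f.dev ...
  f.restore(); // field contents are as they were before tuning
  cudaFree(f.dev);
  return 0;
}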
293 
294  template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename zType, int M,
295  template <typename ReducerType, typename Float, typename FloatN> class Reducer, int writeX, int writeY,
296  int writeZ, int writeW, int writeV>
297  doubleN nativeReduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
298  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length)
299  {
300 
301  checkLength(x, y);
302  checkLength(x, z);
303  checkLength(x, w);
304  checkLength(x, v);
305 
311 
312  doubleN value;
313  typedef typename scalar<RegType>::type Float;
314  typedef typename vector<Float, 2>::type Float2;
315  typedef vector<Float, 2> vec2;
316 
317  Reducer<ReduceType, Float2, RegType> r((Float2)vec2(a), (Float2)vec2(b));
318  ReduceCuda<doubleN, ReduceType, RegType, M, decltype(X), decltype(Y), decltype(Z), decltype(W), decltype(V),
319  Reducer<ReduceType, Float2, RegType>>
320  reduce(value, X, Y, Z, W, V, r, x, y, z, w, v, length);
321  reduce.apply(*(blas::getStream()));
322 
323  blas::bytes += reduce.bytes();
324  blas::flops += reduce.flops();
325 
326  checkCudaError();
327  return value;
328  }
329 
330  /*
331  Wilson
332  double double2 M = 1/12
333  single float4 M = 1/6
334  half short4 M = 6/6
335 
336  Staggered
337  double double2 M = 1/3
338  single float2 M = 1/3
339  half short2 M = 3/3
340  */
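Reading the table above: a Wilson spinor carries 4 spins x 3 colors x 2 reals = 24 reals per site, so in double precision the data streams as double2 and siteUnroll sets M = 12 two-component loads per site, which makes reduce_length / (2 * M) in uni_reduce below equal to the local site count. A quick arithmetic check (the volume here is an arbitrary assumed value):

#include <cassert>
#include <cstdio>

int main()
{
  const int spins = 4, colors = 3, reals_per_complex = 2;
  const int reals_per_site = spins * colors * reals_per_complex; // 24 for a Wilson spinor
  const int M = 12;                                              // double2 loads per site when site-unrolled
  const int volume = 1000;                                       // assumed local volume
  const int reduce_length = volume * reals_per_site;

  assert(reduce_length / (2 * M) == volume); // the grid-stride loop covers one site per element
  printf("grid-stride elements: %d\n", reduce_length / (2 * M));
  return 0;
}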
341 
347  template <typename doubleN, typename ReduceType, template <typename ReducerType, typename Float, typename FloatN> class Reducer,
348  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
349  doubleN uni_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
350  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
351  {
352 
353  checkPrecision(x, y, z, w, v);
354 
355  doubleN value;
356  if (checkLocation(x, y, z, w, v) == QUDA_CUDA_FIELD_LOCATION) {
357 
358  if (!x.isNative()
359  && !((x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_SINGLE_PRECISION)
360  || x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_HALF_PRECISION)) {
361  warningQuda("Device reductions on non-native fields are not supported\n");
362  doubleN value;
363  ::quda::zero(value);
364  return value;
365  }
366 
367  // cannot do site unrolling for arbitrary color (needs JIT)
368  if (siteUnroll && x.Ncolor() != 3) errorQuda("Not supported");
369 
370  int reduce_length = siteUnroll ? x.RealLength() : x.Length();
371 
372  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
373 
374 #if QUDA_PRECISION & 8
375  if (x.Nspin() == 4 || x.Nspin() == 2) { // wilson
376 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID) || defined(GPU_COVDEV)
377  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
378  if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
379  value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
380  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
381 #else
382  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
383 #endif
384  } else if (x.Nspin() == 1) { // staggered
385 #ifdef GPU_STAGGERED_DIRAC
386  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
387  value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
388  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
389 #else
390  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
391 #endif
392  } else {
393  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
394  }
395 #else
396  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
397 #endif
398 
399  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
400 
401 #if QUDA_PRECISION & 4
402  if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) { // wilson
403 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
404  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
405  value = nativeReduce<doubleN, ReduceType, float4, float4, float4, M, Reducer, writeX, writeY, writeZ,
406  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (4 * M));
407 #else
408  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
409 #endif
410  } else if (x.Nspin() == 1 || x.Nspin() == 2 || (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER)) {
411 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
412  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
413  if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
414  value = nativeReduce<doubleN, ReduceType, float2, float2, float2, M, Reducer, writeX, writeY, writeZ,
415  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
416 #else
417  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
418 #endif
419  } else {
420  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
421  }
422 #else
423  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
424 #endif
425 
426  } else if (x.Precision() == QUDA_HALF_PRECISION) { // half precision
427 
428 #if QUDA_PRECISION & 2
429  if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) { // wilson
430 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
431  const int M = 6; // determines how much work per thread to do
432  value = nativeReduce<doubleN, ReduceType, float4, short4, short4, M, Reducer, writeX, writeY, writeZ,
433  writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
434 #else
435  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
436 #endif
437  } else if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) { // wilson
438 #if defined(GPU_MULTIGRID)
439  const int M = 12; // determines how much work per thread to do
440  value
441  = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
442  a, b, x, y, z, w, v, y.Volume());
443 #else
444  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
445 #endif
446  } else if (x.Nspin() == 1) { // staggered
447 #ifdef GPU_STAGGERED_DIRAC
448  const int M = 3; // determines how much work per thread to do
449  value = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ,
450  writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
451 #else
452  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
453 #endif
454  } else {
455  errorQuda("nSpin=%d is not supported\n", x.Nspin());
456  }
457 #else
458  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
459 #endif
460 
461  } else if (x.Precision() == QUDA_QUARTER_PRECISION) { // quarter precision
462 
463 #if QUDA_PRECISION & 1
464  if (x.Nspin() == 4) { // wilson
465 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
466  const int M = 6; // determines how much work per thread to do
467  value
468  = nativeReduce<doubleN, ReduceType, float4, char4, char4, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
469  a, b, x, y, z, w, v, y.Volume());
470 #else
471  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
472 #endif
473  } else if (x.Nspin() == 1) { // staggered
474 #ifdef GPU_STAGGERED_DIRAC
475  const int M = 3; // determines how much work per thread to do
476  value
477  = nativeReduce<doubleN, ReduceType, float2, char2, char2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
478  a, b, x, y, z, w, v, y.Volume());
479 #else
480  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
481 #endif
482  } else {
483  errorQuda("nSpin=%d is not supported\n", x.Nspin());
484  }
485 #else
486  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
487 #endif
488 
489  } else {
490  errorQuda("precision=%d is not supported\n", x.Precision());
491  }
492  } else { // fields are on the CPU
493  // we don't have quad precision support on the GPU so use doubleN instead of ReduceType
494  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
495  Reducer<doubleN, double2, double2> r(a, b);
496  value = genericReduce<doubleN, doubleN, double, double, writeX, writeY, writeZ, writeW, writeV,
497  Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
498  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
499  Reducer<doubleN, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
500  value = genericReduce<doubleN, doubleN, float, float, writeX, writeY, writeZ, writeW, writeV,
501  Reducer<doubleN, float2, float2>>(x, y, z, w, v, r);
502  } else {
503  errorQuda("Precision %d not implemented", x.Precision());
504  }
505  }
506 
507  const int Nreduce = sizeof(doubleN) / sizeof(double);
508  reduceDoubleArray((double *)&value, Nreduce);
509 
510  return value;
511  }
512 
518  template <typename doubleN, typename ReduceType, template <typename ReducerType, typename Float, typename FloatN> class Reducer,
519  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
520  doubleN mixed_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
521  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
522  {
523 
524  checkPrecision(x, y, w, v);
525 
526  doubleN value;
527  if (checkLocation(x, y, z, w, v) == QUDA_CUDA_FIELD_LOCATION) {
528 
529  if (!x.isNative()
530  && !(x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_SINGLE_PRECISION)) {
531  warningQuda("Device reductions on non-native fields are not supported\n");
532  doubleN value;
533  ::quda::zero(value);
534  return value;
535  }
536 
537  // cannot do site unrolling for arbitrary color (needs JIT)
538  if (x.Ncolor() != 3) errorQuda("Not supported");
539 
540  if (z.Precision() == QUDA_DOUBLE_PRECISION) {
541 
542 #if QUDA_PRECISION & 8
543  if (x.Precision() == QUDA_SINGLE_PRECISION) {
544 
545 #if QUDA_PRECISION & 4
546  if (x.Nspin() == 4) { // wilson
547 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
548  const int M = 12; // determines how much work per thread to do
549  value = nativeReduce<doubleN, ReduceType, double2, float4, double2, M, Reducer, writeX, writeY, writeZ,
550  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
551 #else
552  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
553 #endif
554  } else if (x.Nspin() == 1) { // staggered
555 #ifdef GPU_STAGGERED_DIRAC
556  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
557  const int reduce_length = siteUnroll ? x.RealLength() : x.Length();
558  value = nativeReduce<doubleN, ReduceType, double2, float2, double2, M, Reducer, writeX, writeY, writeZ,
559  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
560 #else
561  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
562 #endif
563  } else {
564  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
565  }
566 #else
567  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
568 #endif
569 
570  } else if (x.Precision() == QUDA_HALF_PRECISION) {
571 
572 #if QUDA_PRECISION & 2
573  if (x.Nspin() == 4) { // wilson
574 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
575  const int M = 12; // determines how much work per thread to do
576  value = nativeReduce<doubleN, ReduceType, double2, short4, double2, M, Reducer, writeX, writeY, writeZ,
577  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
578 #else
579  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
580 #endif
581  } else if (x.Nspin() == 1) { // staggered
582 #ifdef GPU_STAGGERED_DIRAC
583  const int M = 3; // determines how much work per thread to do
584  value = nativeReduce<doubleN, ReduceType, double2, short2, double2, M, Reducer, writeX, writeY, writeZ,
585  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
586 #else
587  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
588 #endif
589  } else {
590  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
591  }
592 #else
593  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
594 #endif
595 
596  } else if (x.Precision() == QUDA_QUARTER_PRECISION) {
597 
598 #if QUDA_PRECISION & 1
599  if (x.Nspin() == 4) { // wilson
600 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
601  const int M = 12; // determines how much work per thread to do
602  value = nativeReduce<doubleN, ReduceType, double2, char4, double2, M, Reducer, writeX, writeY, writeZ,
603  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
604 #else
605  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
606 #endif
607  } else if (x.Nspin() == 1) { // staggered
608 #ifdef GPU_STAGGERED_DIRAC
609  const int M = 3; // determines how much work per thread to do
610  value = nativeReduce<doubleN, ReduceType, double2, char2, double2, M, Reducer, writeX, writeY, writeZ,
611  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
612 #else
613  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
614 #endif
615  } else {
616  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
617  }
618 #else
619  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
620 #endif
621 
622  } else {
623  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
624  }
625 #else
626  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, z.Precision());
627 #endif
628 
629  } else if (z.Precision() == QUDA_SINGLE_PRECISION) {
630 
631 #if QUDA_PRECISION & 4
632  if (x.Precision() == QUDA_HALF_PRECISION) {
633 
634 #if QUDA_PRECISION & 2
635  if (x.Nspin() == 4) { // wilson
636 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
637  const int M = 6;
638  value = nativeReduce<doubleN, ReduceType, float4, short4, float4, M, Reducer, writeX, writeY, writeZ,
639  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
640 #else
641  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
642 #endif
643  } else if (x.Nspin() == 1) { // staggered
644 #ifdef GPU_STAGGERED_DIRAC
645  const int M = 3;
646  value = nativeReduce<doubleN, ReduceType, float2, short2, float2, M, Reducer, writeX, writeY, writeZ,
647  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
648 #else
649  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
650 #endif
651  } else {
652  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
653  }
654  blas::bytes
655  += Reducer<ReduceType, double2, double2>::streams() * (unsigned long long)x.Volume() * sizeof(float);
656 #else
657  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
658 #endif
659 
660  } else if (x.Precision() == QUDA_QUARTER_PRECISION) {
661 #if QUDA_PRECISION & 1
662  if (x.Nspin() == 4) { // wilson
663 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
664  const int M = 6;
665  value = nativeReduce<doubleN, ReduceType, float4, char4, float4, M, Reducer, writeX, writeY, writeZ,
666  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
667 #else
668  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
669 #endif
670  } else if (x.Nspin() == 1) { // staggered
671 #ifdef GPU_STAGGERED_DIRAC
672  const int M = 3;
673  value = nativeReduce<doubleN, ReduceType, float2, char2, float2, M, Reducer, writeX, writeY, writeZ,
674  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
675 #else
676  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
677 #endif
678  } else {
679  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
680  }
681  blas::bytes
682  += Reducer<ReduceType, double2, double2>::streams() * (unsigned long long)x.Volume() * sizeof(float);
683 #else
684  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
685 #endif
686  } else {
687  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
688  }
689 #else
690  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
691 #endif
692 
693  } else {
694  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
695  }
696 
697  } else {
698  // we don't have quad precision support on the GPU so use doubleN instead of ReduceType
699  if (x.Precision() == QUDA_SINGLE_PRECISION && z.Precision() == QUDA_DOUBLE_PRECISION) {
700  Reducer<doubleN, double2, double2> r(a, b);
701  value = genericReduce<doubleN, doubleN, float, double, writeX, writeY, writeZ, writeW, writeV,
702  Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
703  } else {
704  errorQuda("Precision %d not implemented", x.Precision());
705  }
706  }
707 
708  const int Nreduce = sizeof(doubleN) / sizeof(double);
709  reduceDoubleArray((double *)&value, Nreduce);
710 
711  return value;
712  }
713 
714  double norm1(const ColorSpinorField &x)
715  {
716  ColorSpinorField &y = const_cast<ColorSpinorField &>(x); // FIXME
717  return uni_reduce<double, QudaSumFloat, Norm1, 0, 0, 0, 0, 0, false>(
718  make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
719  }
720 
721  double norm2(const ColorSpinorField &x)
722  {
723  ColorSpinorField &y = const_cast<ColorSpinorField &>(x);
724  return uni_reduce<double, QudaSumFloat, Norm2, 0, 0, 0, 0, 0, false>(
725  make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
726  }
727 
728  double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
729  {
730  return uni_reduce<double, QudaSumFloat, Dot, 0, 0, 0, 0, 0, false>(
731  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
732  }
733 
734  double axpbyzNorm(double a, ColorSpinorField &x, double b, ColorSpinorField &y, ColorSpinorField &z)
735  {
736  return uni_reduce<double, QudaSumFloat, axpbyzNorm2, 0, 0, 1, 0, 0, false>(
737  make_double2(a, 0.0), make_double2(b, 0.0), x, y, z, x, x);
738  }
739 
740  double axpyReDot(double a, ColorSpinorField &x, ColorSpinorField &y)
741  {
742  return uni_reduce<double, QudaSumFloat, AxpyReDot, 0, 1, 0, 0, 0, false>(
743  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
744  }
745 
746  double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
747  {
748  return uni_reduce<double, QudaSumFloat, caxpyNorm2, 0, 1, 0, 0, 0, false>(
749  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);
750  }
751 
752  double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
753  {
754  return uni_reduce<double, QudaSumFloat, caxpyxmaznormx, 1, 1, 0, 0, 0, false>(
755  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
756  }
757 
758  double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
759  {
760  return uni_reduce<double, QudaSumFloat, cabxpyzaxnorm, 1, 0, 1, 0, 0, false>(
761  make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, z, x, x);
762  }
763 
764  Complex cDotProduct(ColorSpinorField &x, ColorSpinorField &y)
765  {
766  double2 cdot = uni_reduce<double2, QudaSumFloat2, Cdot, 0, 0, 0, 0, 0, false>(
767  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
768  return Complex(cdot.x, cdot.y);
769  }
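For reference, the wrappers above compute the usual globally-summed inner products over the field elements. A plain host-side analogue over std::complex data shows the quantities involved (purely illustrative; these functions only mirror the names above and are not how QUDA stores or traverses fields):

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

using cplx = std::complex<double>;

double norm2(const std::vector<cplx> &x) // sum_i |x_i|^2
{
  double s = 0.0;
  for (const auto &e : x) s += std::norm(e);
  return s;
}

double reDotProduct(const std::vector<cplx> &x, const std::vector<cplx> &y) // Re <x,y>
{
  double s = 0.0;
  for (std::size_t i = 0; i < x.size(); i++) s += std::real(std::conj(x[i]) * y[i]);
  return s;
}

cplx cDotProduct(const std::vector<cplx> &x, const std::vector<cplx> &y) // <x,y>
{
  cplx s = 0.0;
  for (std::size_t i = 0; i < x.size(); i++) s += std::conj(x[i]) * y[i];
  return s;
}

int main()
{
  std::vector<cplx> x = {{1, 2}, {0, -1}}, y = {{3, 0}, {1, 1}};
  const cplx d = cDotProduct(x, y);
  printf("norm2(x) = %g\n", norm2(x));                         // 6
  printf("reDotProduct(x,y) = %g\n", reDotProduct(x, y));      // 2
  printf("cDotProduct(x,y) = (%g, %g)\n", d.real(), d.imag()); // (2, -5)
  return 0;
}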
770 
771  Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
772  {
773  double2 cdot = uni_reduce<double2, QudaSumFloat2, caxpydotzy, 0, 1, 0, 0, 0, false>(
774  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
775  return Complex(cdot.x, cdot.y);
776  }
777 
778  double3 cDotProductNormA(ColorSpinorField &x, ColorSpinorField &y) {
779  return uni_reduce<double3, QudaSumFloat3, CdotNormA, 0, 0, 0, 0, 0, false>(
780  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
781  }
782 
783  double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x,
784  const Complex &b, ColorSpinorField &y,
785  ColorSpinorField &z, ColorSpinorField &w,
786  ColorSpinorField &u) {
787  if (x.Precision() != z.Precision()) {
788  return mixed_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
789  make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
790  } else {
791  return uni_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
792  make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
793  }
794  }
795 
796  Complex axpyCGNorm(double a, ColorSpinorField &x, ColorSpinorField &y) {
797  // swizzle since mixed is on z
798  double2 cg_norm;
799  if (x.Precision() != y.Precision()) {
800  cg_norm = mixed_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
801  make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
802  } else {
803  cg_norm = uni_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
804  make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
805  }
806  return Complex(cg_norm.x, cg_norm.y);
807  }
808 
809  double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r) {
810  // in case of x.Ncolor()!=3 (MG mainly) reduce_core does not support this function.
811  if (x.Ncolor()!=3) return make_double3(0.0, 0.0, 0.0);
812  double3 rtn = uni_reduce<double3, QudaSumFloat3, HeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
813  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
814  rtn.z /= (x.Volume()*comm_size());
815  return rtn;
816  }
817 
818  double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y,
819  ColorSpinorField &r) {
820  // in case of x.Ncolor()!=3 (MG mainly) reduce_core does not support this function.
821  if (x.Ncolor()!=3) return make_double3(0.0, 0.0, 0.0);
822  double3 rtn = uni_reduce<double3, QudaSumFloat3, xpyHeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
823  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
824  rtn.z /= (x.Volume()*comm_size());
825  return rtn;
826  }
827 
828  double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
829  return uni_reduce<double3, QudaSumFloat3, tripleCGReduction_, 0, 0, 0, 0, 0, false>(
830  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
831  }
832 
833  double4 quadrupleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
834  return uni_reduce<double4, QudaSumFloat4, quadrupleCGReduction_, 0, 0, 0, 0, 0, false>(
835  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
836  }
837 
838  double quadrupleCG3InitNorm(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v) {
839  return uni_reduce<double, QudaSumFloat, quadrupleCG3InitNorm_, 1, 1, 1, 1, 0, false>(
840  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, w, v);
841  }
842 
843  double quadrupleCG3UpdateNorm(double a, double b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v) {
844  return uni_reduce<double, QudaSumFloat, quadrupleCG3UpdateNorm_, 1, 1, 1, 1, 0, false>(
845  make_double2(a, 0.0), make_double2(b, 1. - b), x, y, z, w, v);
846  }
847 
848  double doubleCG3InitNorm(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
849  return uni_reduce<double, QudaSumFloat, doubleCG3InitNorm_, 1, 1, 0, 0, 0, false>(
850  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, z, z);
851  }
852 
853  double doubleCG3UpdateNorm(double a, double b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
854  return uni_reduce<double, QudaSumFloat, doubleCG3UpdateNorm_, 1, 1, 0, 0, 0, false>(
855  make_double2(a, 0.0), make_double2(b, 1.0 - b), x, y, z, z, z);
856  }
857 
858  } // namespace blas
859 
860 } // namespace quda