QUDA  1.0.0
multi_reduce_quda.cu
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
5 #include <uint_to_char.h>
6 
7 #include <launch_kernel.cuh>
8 #include <jitify_helper.cuh>
9 #include <kernels/multi_reduce_core.cuh>
10 
11 // workaround for Fermi
12 #if (__COMPUTE_CAPABILITY__ < 300)
13 #undef MAX_MULTI_BLAS_N
14 #define MAX_MULTI_BLAS_N 2
15 #endif
16 
17 namespace quda {
18 
19  namespace blas {
20 
21  cudaStream_t* getStream();
22  cudaEvent_t* getReduceEvent();
23  bool getFastReduce();
24  void initFastReduce(int words);
25  void completeFastReduce(int32_t words);
26 
27  template <int writeX, int writeY, int writeZ, int writeW>
28  struct write {
29  static constexpr int X = writeX;
30  static constexpr int Y = writeY;
31  static constexpr int Z = writeZ;
32  static constexpr int W = writeW;
33  };
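 // The write<> policy packs four compile-time flags saying which of the X/Y/Z/W spinor
 // arguments a kernel may write back.  Illustrative instantiations, matching the uses
 // further down in this file:
 //   write<0, 0, 0, 0>  -> pure reduction, nothing is written (block dot products)
 //   write<0, 0, 0, 1>  -> only W is written (the dot-product-with-copy variant)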
34 
35  template <typename doubleN, typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
36  void multiReduceLaunch(doubleN result[], Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
37  {
38 
39  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
40  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
41 
42  const int32_t words = tp.grid.z * NXZ * arg.NYW * sizeof(ReduceType) / sizeof(int32_t);
43  if (getFastReduce() && !commAsyncReduction()) initFastReduce(words);
44 
45 #ifdef WARP_MULTI_REDUCE
46 #error "Untested - should be reverified"
47  // multiReduceKernel<ReduceType,FloatN,M,NXZ><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
48 #else
49 #ifdef JITIFY
50  using namespace jitify::reflection;
51  tunable.jitifyError() = program->kernel("quda::blas::multiReduceKernel")
52  .instantiate((int)tp.block.x, Type<ReduceType>(), Type<FloatN>(), M, NXZ, Type<Arg>())
53  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
54  .launch(arg);
55 #else
56 #if CUDA_VERSION < 9000
57  cudaMemcpyToSymbolAsync(arg_buffer, reinterpret_cast<char *>(&arg), sizeof(arg), 0, cudaMemcpyHostToDevice,
58  *getStream());
59 #endif
60  LAUNCH_KERNEL_LOCAL_PARITY(multiReduceKernel, tp, stream, arg, ReduceType, FloatN, M, NXZ);
61 #endif
62 #endif
63 
64  if (!commAsyncReduction()) {
65 #if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__))
66  if (deviceProp.canMapHostMemory) {
67  if (getFastReduce()) {
68  completeFastReduce(words);
69  } else {
70  qudaEventRecord(*getReduceEvent(), stream);
71  while (cudaSuccess != qudaEventQuery(*getReduceEvent())) {}
72  }
73  } else
74 #endif
75  {
76  qudaMemcpy(getHostReduceBuffer(), getMappedHostReduceBuffer(), tp.grid.z * sizeof(ReduceType) * NXZ * arg.NYW,
77  cudaMemcpyDeviceToHost);
78  }
79  }
80 
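 // The host reduce buffer holds the partial results laid out as
 // [parity (tp.grid.z)][j < NYW][i < NXZ]; the loop below transposes this into
 // result[i * NYW + j] and, when the two parities were reduced separately
 // (tp.grid.z == 2), accumulates the second parity onto the first.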
81  // need to transpose for same order with vector thread reduction
82  for (int i = 0; i < NXZ; i++) {
83  for (int j = 0; j < arg.NYW; j++) {
84  result[i * arg.NYW + j] = set(((ReduceType *)getHostReduceBuffer())[j * NXZ + i]);
85  if (tp.grid.z == 2)
86  sum(result[i * arg.NYW + j], ((ReduceType *)getHostReduceBuffer())[NXZ * arg.NYW + j * NXZ + i]);
87  }
88  }
89  }
90 
91  namespace detail
92  {
93  template <unsigned... digits> struct to_chars {
94  static const char value[];
95  };
96 
97  template <unsigned... digits> const char to_chars<digits...>::value[] = {('0' + digits)..., 0};
98 
99  template <unsigned rem, unsigned... digits> struct explode : explode<rem / 10, rem % 10, digits...> {
100  };
101 
102  template <unsigned... digits> struct explode<0, digits...> : to_chars<digits...> {
103  };
104  } // namespace detail
105 
106  template <unsigned num> struct num_to_string : detail::explode<num / 10, num % 10> {
107  };
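 // num_to_string<N>::value is the decimal string for N built at compile time,
 // e.g. num_to_string<12>::value is the character array "12".  It is used below to
 // embed NXZ into the tune key without a run-time conversion.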
108 
109  template <int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX,
110  typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
111  class MultiReduceCuda : public Tunable
112  {
113 
114  private:
115  const int NYW;
116  int nParity;
117  MultiReduceArg<NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer> arg;
118  doubleN *result;
119 
120  std::vector<ColorSpinorField *> &x, &y, &z, &w;
121 
122  // don't curry into the Spinors to minimize parameter size
123  char *Y_h[MAX_MULTI_BLAS_N], *W_h[MAX_MULTI_BLAS_N], *Ynorm_h[MAX_MULTI_BLAS_N], *Wnorm_h[MAX_MULTI_BLAS_N];
124 
125  unsigned int sharedBytesPerThread() const { return 0; }
126  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
127 
128  virtual bool advanceSharedBytes(TuneParam &param) const
129  {
130  TuneParam next(param);
131  advanceBlockDim(next); // to get next blockDim
132  int nthreads = next.block.x * next.block.y * next.block.z;
133  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
134  sharedBytesPerThread() * nthreads :
135  sharedBytesPerBlock(param);
136  return false;
137  }
138 
139  // we only launch thread blocks up to size 512 since the autotuner
140  // favours smaller blocks and this helps with compile time
141  unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / 2; }
142 
143  public:
144  MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r,
145  std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
146  std::vector<ColorSpinorField *> &w, int NYW, int length) :
147  NYW(NYW),
148  nParity(x[0]->SiteSubset()),
149  arg(X, Y, Z, W, r, NYW, length / nParity),
150  x(x),
151  y(y),
152  z(z),
153  w(w),
154  result(result),
155  Y_h(),
156  W_h(),
157  Ynorm_h(),
158  Wnorm_h()
159  {
160  strcpy(aux, "policy_kernel,");
161  strcat(aux, x[0]->AuxString());
162  if (getFastReduce()) strcat(aux, ",fast_reduce");
163 
164  // since block dot product and block norm use the same functors, we need to distinguish them
165  bool is_norm = false;
166  if (NXZ == NYW) {
167  is_norm = true;
168  for (int i = 0; i < NXZ; i++) {
169  if (x[i]->V() != y[i]->V() || x[i]->V() != z[i]->V() || x[i]->V() != w[i]->V()) {
170  is_norm = false;
171  break;
172  }
173  }
174  }
175  if (is_norm) strcat(aux, ",norm");
176 
177 #ifdef JITIFY
178  ::quda::create_jitify_program("kernels/multi_reduce_core.cuh");
179 #endif
180  }
181 
182  inline TuneKey tuneKey() const
183  {
184  char name[TuneKey::name_n];
185  strcpy(name, num_to_string<NXZ>::value);
186  strcat(name, std::to_string(NYW).c_str());
187  strcat(name, typeid(arg.r).name());
188  return TuneKey(x[0]->VolString(), name, aux);
189  }
190 
191  void apply(const cudaStream_t &stream)
192  {
193  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
194  multiReduceLaunch<doubleN, ReduceType, FloatN, M, NXZ>(result, arg, tp, stream, *this);
195  }
196 
197  // Should these be NYW?
198 #ifdef WARP_MULTI_REDUCE
199 
206  bool advanceBlockDim(TuneParam &param) const
207  {
208  if (param.block.y < NYW) {
209  param.block.y++;
210  param.grid.y = (NYW + param.block.y - 1) / param.block.y;
211  return true;
212  } else {
213  param.block.y = 1;
214  param.grid.y = NYW;
215  return false;
216  }
217  }
218 #endif
219 
220  bool advanceGridDim(TuneParam &param) const
221  {
222  bool rtn = Tunable::advanceGridDim(param);
223  if (NYW > deviceProp.maxGridSize[1]) errorQuda("N=%d is greater than the maximum supported grid size", NYW);
224  return rtn;
225  }
226 
227  void initTuneParam(TuneParam &param) const
228  {
229  Tunable::initTuneParam(param);
230  param.block.y = 1;
231  param.grid.y = NYW;
232  param.grid.z = nParity;
233  }
234 
235  void defaultTuneParam(TuneParam &param) const
236  {
237  Tunable::defaultTuneParam(param);
238  param.block.y = 1;
239  param.grid.y = NYW;
240  param.grid.z = nParity;
241  }
242 
243  void preTune()
244  {
245  for (int i = 0; i < NYW; ++i) {
246  arg.Y[i].backup(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
247  arg.W[i].backup(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
248  }
249  }
250 
251  void postTune()
252  {
253  for (int i = 0; i < NYW; ++i) {
254  arg.Y[i].restore(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
255  arg.W[i].restore(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
256  }
257  }
258 
259  long long flops() const
260  {
261  return NYW * NXZ * arg.r.flops() * vec_length<FloatN>::value * (long long)arg.length * nParity * M;
262  }
263 
264  long long bytes() const
265  {
266  // this will be wrong when mixed precision is added
267  return NYW * NXZ * arg.r.streams() * x[0]->Bytes();
268  }
269 
270  int tuningIter() const { return 3; }
271  };
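 // MultiReduceCuda wraps one batched reduction launch as a Tunable: the autotuner sweeps
 // block sizes up to maxBlockSize(), preTune()/postTune() back up and restore the
 // writable Y and W fields, and tuningIter() == 3 means each candidate launch
 // configuration is timed over three iterations.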
272 
273  template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename yType, int M, int NXZ,
274  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write, typename T>
275  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
276  std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
277  std::vector<ColorSpinorField *> &w, int length)
278  {
279 
280  const int NYW = y.size();
281 
282  memset(result, 0, NXZ * NYW * sizeof(doubleN));
283 
284  const int N_MAX = NXZ > NYW ? NXZ : NYW;
285  const int N_MIN = NXZ < NYW ? NXZ : NYW;
286 
287  static_assert(MAX_MULTI_BLAS_N * MAX_MULTI_BLAS_N <= QUDA_MAX_MULTI_REDUCE,
288  "MAX_MULTI_BLAS_N^2 exceeds maximum number of reductions");
289  static_assert(MAX_MULTI_BLAS_N <= 16, "MAX_MULTI_BLAS_N exceeds maximum size 16");
290  if (N_MAX > MAX_MULTI_BLAS_N)
291  errorQuda("Spinor vector length exceeds max size (%d > %d)", N_MAX, MAX_MULTI_BLAS_N);
292 
293  if (NXZ * NYW * sizeof(Complex) > MAX_MATRIX_SIZE)
294  errorQuda("A matrix exceeds max size (%lu > %d)", NXZ * NYW * sizeof(Complex), MAX_MATRIX_SIZE);
295 
296  for (int i = 0; i < N_MIN; i++) {
297  checkSpinor(*x[i], *y[i]);
298  checkSpinor(*x[i], *z[i]);
299  checkSpinor(*x[i], *w[i]);
300  if (!x[i]->isNative()) {
301  warningQuda("Reductions on non-native fields are not supported\n");
302  return;
303  }
304  }
305 
306  typedef typename scalar<RegType>::type Float;
307  typedef typename vector<Float, 2>::type Float2;
308  typedef vector<Float, 2> vec2;
309 
310 #ifdef JITIFY
311  // need to get constants pointer from jitify instance
312  if (a.use_const || b.use_const || c.use_const)
313  errorQuda("Constant memory buffer support not enabled with jitify yet");
314 #endif
315 
316  // FIXME - if NXZ=1 no need to copy entire array
317  // FIXME - do we really need strided access here?
318  if (a.data && a.use_const) {
319  Float2 A[MAX_MATRIX_SIZE / sizeof(Float2)];
320  // since the kernel doesn't know the width of the matrix at compile
321  // time we stride it and copy the padded matrix to the GPU
322  for (int i = 0; i < NXZ; i++)
323  for (int j = 0; j < NYW; j++) A[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(a.data[NYW * i + j]));
324 
325  cudaMemcpyToSymbolAsync(Amatrix_d, A, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
326  Amatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(a.data));
327  }
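 // Note on the padding above: the host coefficient matrix is NXZ x NYW in row-major
 // order (a.data[NYW * i + j]), while the constant-memory buffer is always strided by
 // MAX_MULTI_BLAS_N (A[MAX_MULTI_BLAS_N * i + j]) so the kernel can index it without
 // knowing NYW at compile time.  The B and C matrices below are treated the same way.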
328 
329  if (b.data && b.use_const) {
330  Float2 B[MAX_MATRIX_SIZE / sizeof(Float2)];
331  // since the kernel doesn't know the width of the matrix at compile
332  // time we stride it and copy the padded matrix to the GPU
333  for (int i = 0; i < NXZ; i++)
334  for (int j = 0; j < NYW; j++) B[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(b.data[NYW * i + j]));
335 
336  cudaMemcpyToSymbolAsync(Bmatrix_d, B, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
337  Bmatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(b.data));
338  }
339 
340  if (c.data && c.use_const) {
341  Float2 C[MAX_MATRIX_SIZE / sizeof(Float2)];
342  // since the kernel doesn't know the width of the matrix at compile
343  // time we stride it and copy the padded matrix to the GPU
344  for (int i = 0; i < NXZ; i++)
345  for (int j = 0; j < NYW; j++) C[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(c.data[NYW * i + j]));
346 
347  cudaMemcpyToSymbolAsync(Cmatrix_d, C, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
348  Cmatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(c.data));
349  }
350 
355 
356  for (int i = 0; i < NXZ; i++) {
357  X[i].set(*dynamic_cast<cudaColorSpinorField *>(x[i]));
358  Z[i].set(*dynamic_cast<cudaColorSpinorField *>(z[i]));
359  }
360  for (int i = 0; i < NYW; i++) {
361  Y[i].set(*dynamic_cast<cudaColorSpinorField *>(y[i]));
362  W[i].set(*dynamic_cast<cudaColorSpinorField *>(w[i]));
363  }
364 
365  Reducer<NXZ, ReduceType, Float2, RegType> r(a, b, c, NYW);
366 
370  reduce(result, X, Y, Z, W, r, x, y, z, w, NYW, length);
371  reduce.apply(*blas::getStream());
372 
373  blas::bytes += reduce.bytes();
374  blas::flops += reduce.flops();
375 
376  checkCudaError();
377  }
378 
382  template <int NXZ, typename doubleN, typename ReduceType,
383  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write,
384  bool siteUnroll, typename T>
385  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
386  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
387  CompositeColorSpinorField &w)
388  {
389  const int NYW = y.size();
390 
391  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
392 
393  QudaPrecision precision = checkPrecision(*x[0], *y[0], *z[0], *w[0]);
394 
395  if (precision == QUDA_DOUBLE_PRECISION) {
396 
397 #if QUDA_PRECISION & 8
398  if (x[0]->Nspin() == 4 || x[0]->Nspin() == 2) { // wilson
399 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID)
400  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
401  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
402  multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
403  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
404 #else
405  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
406 #endif
407  } else if (x[0]->Nspin() == 1) {
408 #ifdef GPU_STAGGERED_DIRAC
409  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
410  multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
411  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
412 #else
413  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
414 #endif
415  } else {
416  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
417  }
418 #else
419  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
420 #endif
421 
422  } else if (precision == QUDA_SINGLE_PRECISION) {
423 
424 #if QUDA_PRECISION & 4
425  if (x[0]->Nspin() == 4) { // wilson
426 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
427  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
428  multiReduce<doubleN, ReduceType, float4, float4, float4, M, NXZ, Reducer, write>(
429  result, a, b, c, x, y, z, w, reduce_length / (4 * M));
430 #else
431  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
432 #endif
433  } else if (x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
434 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
435  const int M = siteUnroll ? 3 : 1;
436  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
437  multiReduce<doubleN, ReduceType, float2, float2, float2, M, NXZ, Reducer, write>(
438  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
439 #else
440  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
441 #endif
442  } else {
443  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
444  }
445 #else
446  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
447 #endif
448 
449  } else if (precision == QUDA_HALF_PRECISION) { // half precision
450 
451 #if QUDA_PRECISION & 2
452  if (x[0]->Nspin() == 4) { // wilson
453 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
454  const int M = 6;
455  multiReduce<doubleN, ReduceType, float4, short4, short4, M, NXZ, Reducer, write>(
456  result, a, b, c, x, y, z, w, x[0]->Volume());
457 #else
458  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
459 #endif
460  } else if (x[0]->Nspin() == 1) { // staggered
461 #ifdef GPU_STAGGERED_DIRAC
462  const int M = 3;
463  multiReduce<doubleN, ReduceType, float2, short2, short2, M, NXZ, Reducer, write>(
464  result, a, b, c, x, y, z, w, x[0]->Volume());
465 #else
466  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
467 #endif
468  } else {
469  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
470  }
471 #else
472  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
473 #endif
474 
475  } else if (precision == QUDA_QUARTER_PRECISION) { // quarter precision
476 
477 #if QUDA_PRECISION & 1
478  if (x[0]->Nspin() == 4) { // wilson
479 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
480  const int M = 6;
481  multiReduce<doubleN, ReduceType, float4, char4, char4, M, NXZ, Reducer, write>(
482  result, a, b, c, x, y, z, w, x[0]->Volume());
483 #else
484  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
485 #endif
486  } else if (x[0]->Nspin() == 1) { // staggered
487 #ifdef GPU_STAGGERED_DIRAC
488  const int M = 3;
489  multiReduce<doubleN, ReduceType, float2, char2, char2, M, NXZ, Reducer, write>(
490  result, a, b, c, x, y, z, w, x[0]->Volume());
491 #else
492  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
493 #endif
494  } else {
495  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
496  }
497 #else
498  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
499 #endif
500  } else {
501  errorQuda("Precision %d not supported\n", precision);
502  }
503  }
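 // In the dispatch above, M sets how many FloatN-sized chunks each thread processes per
 // site (e.g. 12 double2 chunks for an unrolled Wilson spinor, 3 for staggered), so the
 // length handed down is reduce_length divided by the vector width times M; the
 // fixed-point paths pass the site volume instead.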
504 
508  template <int NXZ, typename doubleN, typename ReduceType,
509  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write,
510  bool siteUnroll, typename T>
511  void mixedMultiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
512  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
513  CompositeColorSpinorField &w)
514  {
515  const int NYW = y.size();
516 
517  checkPrecision(*x[0], *z[0]);
518  checkPrecision(*y[0], *w[0]);
519 
520  assert(siteUnroll == true);
521  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
522 
523  if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_SINGLE_PRECISION) {
524 
525  if (x[0]->Nspin() == 4) { // wilson
526 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
527  const int M = 12; // determines how much work per thread to do
528  multiReduce<doubleN, ReduceType, double2, float4, double2, M, NXZ, Reducer, write>(
529  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
530 #else
531  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
532 #endif
533  } else if (x[0]->Nspin() == 1) {
534 #ifdef GPU_STAGGERED_DIRAC
535  const int M = 3; // determines how much work per thread to do
536  multiReduce<doubleN, ReduceType, double2, float2, double2, M, NXZ, Reducer, write>(
537  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
538 #else
539  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
540 #endif
541  } else {
542  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
543  }
544 
545  } else if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
546 
547  if (x[0]->Nspin() == 4) { // wilson
548 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
549  const int M = 6; // determines how much work per thread to do
550  multiReduce<doubleN, ReduceType, double2, short4, double2, M, NXZ, Reducer, write>(
551  result, a, b, c, x, y, z, w, reduce_length / (4 * M));
552 #else
553  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
554 #endif
555  } else if (x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
556 #if defined(GPU_STAGGERED_DIRAC)
557  const int M = 3;
558  multiReduce<doubleN, ReduceType, double2, short2, double2, M, NXZ, Reducer, write>(
559  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
560 #else
561  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
562 #endif
563  } else {
564  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
565  }
566 
567  } else if (y[0]->Precision() == QUDA_SINGLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
568 
569  if (x[0]->Nspin() == 4) { // wilson
570 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
571  const int M = 6;
572  multiReduce<doubleN, ReduceType, float4, short4, float4, M, NXZ, Reducer, write>(
573  result, a, b, c, x, y, z, w, x[0]->Volume());
574 #else
575  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
576 #endif
577  } else if (x[0]->Nspin() == 1) { // staggered
578 #ifdef GPU_STAGGERED_DIRAC
579  const int M = 3;
580  multiReduce<doubleN, ReduceType, float2, short2, float2, M, NXZ, Reducer, write>(
581  result, a, b, c, x, y, z, w, x[0]->Volume());
582 #else
583  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
584 #endif
585  } else {
586  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
587  }
588 
589  } else {
590  errorQuda("Precision combination x=%d y=%d not supported\n", x[0]->Precision(), y[0]->Precision());
591  }
592  }
593 
594  template <int NXZ, typename doubleN, typename ReduceType,
595  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
596  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal,
597  typename writeOffDiagonal, bool siteUnroll, typename T>
598  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
599  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
600  CompositeColorSpinorField &w, int i, int j)
601  {
602 
603  if (x[0]->Precision() == y[0]->Precision()) {
604  if (i == j) { // we are on the diagonal so invoke the diagonal reducer
605  multiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, siteUnroll, T>(
606  result, a, b, c, x, y, z, w);
607  } else { // we are off the diagonal so invoke the off-diagonal reducer
608  multiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, siteUnroll, T>(
609  result, a, b, c, x, y, z, w);
610  }
611  } else {
612  if (i == j) { // we are on the diagonal so invoke the diagonal reducer
613  mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, true, T>(
614  result, a, b, c, x, y, z, w);
615  } else { // we are off the diagonal so invoke the off-diagonal reducer
616  mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, true, T>(
617  result, a, b, c, x, y, z, w);
618  }
619  }
620  }
621 
622  void reDotProduct(double* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
623 #ifndef SSTEP
624  errorQuda("S-step code not built\n");
625 #else
626  switch(x.size()){
627  case 1:
628  multiReduce<1, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
629  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
630  break;
631  case 2:
632  multiReduce<2, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
633  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
634  break;
635  case 3:
636  multiReduce<3, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
637  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
638  break;
639  case 4:
640  multiReduce<4, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
641  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
642  break;
643  case 5:
644  multiReduce<5, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
645  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
646  break;
647  case 6:
648  multiReduce<6, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
649  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
650  break;
651  case 7:
652  multiReduce<7, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
653  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
654  break;
655  case 8:
656  multiReduce<8, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
657  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
658  break;
659  /*case 9:
660  multiReduce<9,double,QudaSumFloat,Dot,0,0,0,0,false>
661  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
662  break;
663  case 10:
664  multiReduce<10,double,QudaSumFloat,Dot,0,0,0,0,false>
665  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
666  break;
667  case 11:
668  multiReduce<11,double,QudaSumFloat,Dot,0,0,0,0,false>
669  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
670  break;
671  case 12:
672  multiReduce<12,double,QudaSumFloat,Dot,0,0,0,0,false>
673  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
674  break;
675  case 13:
676  multiReduce<13,double,QudaSumFloat,Dot,0,0,0,0,false>
677  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
678  break;
679  case 14:
680  multiReduce<14,double,QudaSumFloat,Dot,0,0,0,0,false>
681  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
682  break;
683  case 15:
684  multiReduce<15,double,QudaSumFloat,Dot,0,0,0,0,false>
685  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
686  break;
687  case 16:
688  multiReduce<16,double,QudaSumFloat,Dot,0,0,0,0,false>
689  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
690  break;*/
691  default:
692  errorQuda("Unsupported vector size");
693  break;
694  }
695 #endif // SSTEP
696  // do a single multi-node reduction only once we have computed all local dot products
697  const int Nreduce = x.size()*y.size();
698  reduceDoubleArray((double*)result, Nreduce);
699  }
700 
701 
702  // This function does the outer product of dot products... in column major.
703  // There's a function below called 'cDotProduct' that flips it to row major.
704  template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
705  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal, typename writeOffDiagonal>
706  void multiReduce_recurse(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
707  std::vector<ColorSpinorField*>&z, std::vector<ColorSpinorField*>&w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size) {
708 
709  if (y.size() > tile_size) // if greater than max single-kernel size, split and recurse
710  {
711  // Do the recurse first.
712  Complex* result0 = &result[0];
713  Complex* result1 = &result[x.size()*(y.size()/2)];
714  std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
715  std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
716  std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
717  std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
718  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x, y0, z, w0, i_idx, 2*j_idx+0, hermitian, tile_size);
719  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x, y1, z, w1, i_idx, 2*j_idx+1, hermitian, tile_size);
720  }
721  else
722  {
723  double2* cdot = new double2[x.size()*y.size()];
724 
725  // if at bottom of recursion, return if on lower left
726  if (x.size() <= tile_size && hermitian) {
727  if (j_idx < i_idx) { delete[] cdot; return; } // free the tile buffer before the early return
728  }
729 
730  coeff_array<Complex> a, b, c;
731 
732  if (x.size() <= tile_size) {
733  switch(x.size()){ // COMMENT HERE FOR COMPILE TIME
734  case 1:
735  multiReduce<1, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
736  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
737  break;
738 #if MAX_MULTI_BLAS_N >= 2
739  case 2:
740  multiReduce<2, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
741  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
742  break;
743 #if MAX_MULTI_BLAS_N >= 3
744  case 3:
745  multiReduce<3, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
746  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
747  break;
748 #if MAX_MULTI_BLAS_N >= 4
749  case 4:
750  multiReduce<4, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
751  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
752  break;
753 #if MAX_MULTI_BLAS_N >= 5
754  case 5:
755  multiReduce<5, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
756  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
757  break;
758 #if MAX_MULTI_BLAS_N >= 6
759  case 6:
760  multiReduce<6, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
761  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
762  break;
763 #if MAX_MULTI_BLAS_N >= 7
764  case 7:
765  multiReduce<7, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
766  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
767  break;
768 #if MAX_MULTI_BLAS_N >= 8
769  case 8:
770  multiReduce<8, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
771  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
772  break;
773 #if MAX_MULTI_BLAS_N >= 9
774  case 9:
775  multiReduce<9, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
776  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
777  break;
778 #if MAX_MULTI_BLAS_N >= 10
779  case 10:
780  multiReduce<10, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
781  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
782  break;
783 #if MAX_MULTI_BLAS_N >= 11
784  case 11:
785  multiReduce<11, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
786  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
787  break;
788 #if MAX_MULTI_BLAS_N >= 12
789  case 12:
790  multiReduce<12, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
791  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
792  break;
793 #if MAX_MULTI_BLAS_N >= 13
794  case 13:
795  multiReduce<13, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
796  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
797  break;
798 #if MAX_MULTI_BLAS_N >= 14
799  case 14:
800  multiReduce<14, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
801  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
802  break;
803 #if MAX_MULTI_BLAS_N >= 15
804  case 15:
805  multiReduce<15, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
806  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
807  break;
808 #if MAX_MULTI_BLAS_N >= 16
809  case 16:
810  multiReduce<16, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
811  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
812  break;
813 #endif //16
814 #endif //15
815 #endif //14
816 #endif //13
817 #endif //12
818 #endif //11
819 #endif //10
820 #endif // 9
821 #endif // 8
822 #endif // 7
823 #endif // 6
824 #endif // 5
825 #endif // 4
826 #endif // 3
827 #endif // 2
828  }
829  } else {
830  // split the problem and recurse. Splitting in x requires
831  // memory reshuffling (unless y = 1).
832  // Use a few temporary variables.
833 
834  Complex* tmpmajor = new Complex[x.size()*y.size()];
835  Complex* result0 = &tmpmajor[0];
836  Complex* result1 = &tmpmajor[(x.size()/2)*y.size()];
837  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
838  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
839  std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
840  std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
841 
842  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x0, y, z0, w, 2*i_idx+0, j_idx, hermitian, tile_size);
843  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x1, y, z1, w, 2*i_idx+1, j_idx, hermitian, tile_size);
844 
845  const unsigned int xlen0 = x.size()/2;
846  const unsigned int xlen1 = x.size() - xlen0;
847  const unsigned int ylen = y.size();
848 
849  // Copy back into result.
850  int count = 0, count0 = 0, count1 = 0;
851  for (unsigned int i = 0; i < ylen; i++)
852  {
853  for (unsigned int j = 0; j < xlen0; j++)
854  result[count++] = result0[count0++];
855  for (unsigned int j = 0; j < xlen1; j++)
856  result[count++] = result1[count1++];
857  }
858 
859  delete[] tmpmajor;
860  }
861 
862  // we are at the leaf of the binary tree (i.e., we ran the kernel): perform the row-to-column-major transpose here.
863  if (x.size() <= tile_size)
864  {
865  const unsigned int xlen = x.size();
866  const unsigned int ylen = y.size();
867  for (unsigned int j = 0; j < xlen; j++)
868  for (unsigned int i = 0; i < ylen; i++)
869  result[i*xlen+j] = Complex(cdot[j*ylen + i].x, cdot[j*ylen+i].y);
870  }
871  delete[] cdot;
872  }
873  }
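 // Illustrative (not compiled) trace of the recursion above for a 4 x 4 problem with
 // tile_size = 2: the y set is split first, then x, so four 2 x 2 tiles are launched
 // and their column-major partial results are stitched back into the caller's buffer.
 //   multiReduce_recurse(result, x[0..3], y[0..3], ..., tile_size = 2)
 //     -> split y into y[0..1], y[2..3]
 //        -> split x into x[0..1], x[2..3]   (each leaf runs one 2 x 2 kernel)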
874 
875 
876  template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal,
877  typename writeDiagonal,
878  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal,
879  typename writeOffDiagonal>
880  class TileSizeTune : public Tunable {
881  typedef std::vector<ColorSpinorField*> vec;
882  Complex *result;
883  vec &x, &y, &z, &w;
884  bool hermitian;
885  bool Anorm;
886 
887  unsigned int sharedBytesPerThread() const { return 0; }
888  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
889 
890  unsigned int max_tile_size;
891 
892  public:
893  TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm = false)
894  : result(result), x(x), y(y), z(z), w(w), hermitian(hermitian), Anorm(Anorm), max_tile_size(1)
895  {
896  strcpy(aux, "policy,");
897  strcat(aux, x[0]->AuxString());
898  strcat(aux, ",");
899  strcat(aux, y[0]->AuxString());
900  if (hermitian) strcat(aux, ",hermitian");
901  if (Anorm) strcat(aux, ",Anorm");
902  strcat(aux,",n=");
903  char size[8];
904  u64toa(size, x.size());
905  strcat(aux,size);
906  strcat(aux,",m=");
907  u64toa(size, y.size());
908  strcat(aux,size);
909  u64toa(size, MAX_MULTI_BLAS_N);
910  strcat(aux, ",multi-blas-n=");
911  strcat(aux, size);
912 
913  // before we do policy tuning we must ensure the kernel
914  // constituents have been tuned since we can't do nested tuning
915  // FIXME this will break if the kernels are destructive - which they aren't here
916  if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
917  disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
918 
919  if (x.size() == 1 || y.size() == 1) { // 1-d reduction
920 
921  max_tile_size = std::min(MAX_MULTI_BLAS_N, (int)std::max(x.size(), y.size()));
922 
923  // Make sure constituents are tuned.
924  for ( unsigned int tile_size=1; tile_size <= max_tile_size; tile_size++) {
925  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
926  (result, x, y, z, w, 0, 0, hermitian, tile_size);
927  }
928 
929  } else { // 2-d reduction
930 
931  // max_tile_size should be set to the largest power of 2 not exceeding
932  // MAX_MULTI_BLAS_N, since we have a requirement that the
933  // tile size is a power of 2.
934  unsigned int max_count = 0;
935  unsigned int tile_size_tmp = MAX_MULTI_BLAS_N;
936  while (tile_size_tmp != 1) { tile_size_tmp = tile_size_tmp >> 1; max_count++; }
937  tile_size_tmp = 1;
938  for (unsigned int i = 0; i < max_count; i++) { tile_size_tmp = tile_size_tmp << 1; }
939  max_tile_size = tile_size_tmp;
940 
941  // Make sure constituents are tuned.
942  for ( unsigned int tile_size=1; tile_size <= max_tile_size && tile_size <= x.size() &&
943  (tile_size <= y.size() || y.size()==1) ; tile_size*=2) {
944  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
945  (result, x, y, z, w, 0, 0, hermitian, tile_size);
946  }
947 
948  // also test case using a single kernel if both dimensions
949  // are less than MAX_MULTI_BLAS_N
950  if (x.size() <= MAX_MULTI_BLAS_N && y.size() <= MAX_MULTI_BLAS_N) {
951  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
952  (result, x, y, z, w, 0, 0, hermitian, MAX_MULTI_BLAS_N);
953  }
954  }
955 
956  enableProfileCount();
957  setPolicyTuning(true);
958  }
959  }
960 
961  virtual ~TileSizeTune() { setPolicyTuning(false); }
962 
963  void apply(const cudaStream_t &stream) {
964  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
965 
966  // tp.aux.x is where the tile size is stored. "tp" is the tuning struct.
967  // it contains blocksize, grid size, etc. Since we're only tuning
968  // a policy, we don't care about those sizes. That's why we only
969  // tune "aux.x", which is the tile size.
970  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
971  (result, x, y, z, w, 0, 0, hermitian, tp.aux.x);
972  }
973 
974  // aux.x is the tile size
975  bool advanceAux(TuneParam &param) const
976  {
977  if ( x.size()==1 || y.size()==1 ) { // 1-d reduction
978 
979  param.aux.x++;
980  if ( (unsigned int)param.aux.x <= max_tile_size ) {
981  return true;
982  } else {
983  param.aux.x = 1;
984  return false;
985  }
986 
987  } else { // 2-d reduction
988 
989  if ( (unsigned int)(2*param.aux.x) <= max_tile_size &&
990  (unsigned int)(2*param.aux.x) <= x.size() &&
991  (unsigned int)(2*param.aux.x) <= y.size() ) {
992  param.aux.x *= 2; // only tune powers of two
993  return true;
994  } else if (x.size() <= MAX_MULTI_BLAS_N && y.size() <= MAX_MULTI_BLAS_N && param.aux.x < MAX_MULTI_BLAS_N) {
995  // we've run out of power of two tiles to try, but before
996  // we finish, try a single kernel if it fits
997  param.aux.x = MAX_MULTI_BLAS_N;
998  return true;
999  } else {
1000  param.aux.x = 1; // reset to the beginning (which we'd need for multi-dimensional tuning)
1001  return false;
1002  }
1003 
1004  }
1005  }
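 // Example of the tile sizes advanceAux steps through, assuming MAX_MULTI_BLAS_N = 8 and
 // a 16 x 16 problem: 1, 2, 4, 8 (powers of two up to max_tile_size).  The 1-d case
 // instead counts 1, 2, ..., max_tile_size.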
1006 
1007  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
1008 
1009  void initTuneParam(TuneParam &param) const {
1010  Tunable::initTuneParam(param);
1011  param.aux.x = 1; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
1012  }
1013 
1014  void defaultTuneParam(TuneParam &param) const {
1015  Tunable::defaultTuneParam(param); // default is max tile size
1016  // max_tile_size is MAX_MULTI_BLAS_N rounded down to the nearest power of 2.
1017  param.aux.x = max_tile_size; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
1018  }
1019 
1020  TuneKey tuneKey() const {
1021  return TuneKey(x[0]->VolString(), typeid(*this).name(), aux);
1022  }
1023 
1024  long long flops() const { return 0; } // FIXME
1025  long long bytes() const { return 0; } // FIXME
1026 
1027  void preTune() { } // FIXME - use write to determine what needs to be saved
1028  void postTune() { } // FIXME - use write to determine what needs to be saved
1029  };
1030 
1031  void cDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1032  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1033  Complex* result_tmp = new Complex[x.size()*y.size()];
1034  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1035 
1036  // cDotProduct_recurse returns a column-major matrix.
1037  // To be consistent with the multi-blas functions, we should
1038  // switch this to row-major.
1039  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, false);
1040  tile.apply(0);
1041 
1042  // do a single multi-node reduction only once we have computed all local dot products
1043  const int Nreduce = 2*x.size()*y.size();
1044  reduceDoubleArray((double*)result_tmp, Nreduce);
1045 
1046  // Switch from col-major to row-major
1047  const unsigned int xlen = x.size();
1048  const unsigned int ylen = y.size();
1049  for (unsigned int j = 0; j < xlen; j++)
1050  for (unsigned int i = 0; i < ylen; i++)
1051  result[j*ylen+i] = result_tmp[i*xlen + j];
1052 
1053  delete[] result_tmp;
1054  }
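 // After the transpose above, result is row-major with x indexing the rows, i.e.
 // result[j * y.size() + i] is the inner product between x[j] and y[i].  A hypothetical
 // usage sketch (the field vectors are placeholders):
 //   std::vector<ColorSpinorField *> x = ..., y = ...;
 //   std::vector<Complex> dots(x.size() * y.size());
 //   blas::cDotProduct(dots.data(), x, y);  // dots[j * y.size() + i] = (x_j, y_i)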
1055 
1056  void hDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1057  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1058  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs");
1059 
1060  Complex* result_tmp = new Complex[x.size()*y.size()];
1061  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1062 
1063  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true, false); // last false is b/c L2 norm
1064  tile.apply(0);
1065 
1066  // do a single multi-node reduction only once we have computed all local dot products
1067  const int Nreduce = 2*x.size()*y.size();
1068  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
1069 
1070  // Switch from col-major to row-major
1071  const unsigned int xlen = x.size();
1072  const unsigned int ylen = y.size();
1073  for (unsigned int j = 0; j < xlen; j++)
1074  for (unsigned int i = j; i < ylen; i++) {
1075  result[j*ylen+i] = result_tmp[i*xlen + j];
1076  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
1077  }
1078 
1079  delete[] result_tmp;
1080  }
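 // hDotProduct computes only the upper triangle of the tile grid (hermitian = true makes
 // multiReduce_recurse skip tiles with j_idx < i_idx) and the copy-out loop above
 // reconstructs the lower triangle by complex conjugation.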
1081 
1082  // for (p, Ap) norms in CG which are Hermitian.
1083  void hDotProduct_Anorm(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1084  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1085  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs");
1086 
1087  Complex* result_tmp = new Complex[x.size()*y.size()];
1088  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1089 
1090  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true, true); // last true is b/c A norm
1091  tile.apply(0);
1092 
1093  // do a single multi-node reduction only once we have computed all local dot products
1094  const int Nreduce = 2*x.size()*y.size();
1095  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
1096 
1097  // Switch from col-major to row-major
1098  const unsigned int xlen = x.size();
1099  const unsigned int ylen = y.size();
1100  for (unsigned int j = 0; j < xlen; j++)
1101  for (unsigned int i = j; i < ylen; i++) {
1102  result[j*ylen+i] = result_tmp[i*xlen + j];
1103  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
1104  }
1105 
1106  delete[] result_tmp;
1107  }
1108 
1109  // takes the outer product of inner products between x and y and copies y into z
1110  void cDotProductCopy(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
1111  std::vector<ColorSpinorField*>&z){
1112 
1113 #if 0
1114  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1115  if (y.size() != z.size()) errorQuda("Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());
1116 
1117  Complex* result_tmp = new Complex[x.size()*y.size()];
1118  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1119 
1120  // When recursing, only the diagonal tiles will do the copy, the rest just do the outer product
1121  TileSizeTune<CdotCopy,write<0,0,0,1>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true);
1122  tile.apply(0);
1123 
1124  // do a single multi-node reduction only once we have computed all local dot products
1125  const int Nreduce = 2*x.size()*y.size();
1126  reduceDoubleArray((double*)result_tmp, Nreduce);
1127 
1128  // Switch from col-major to row-major.
1129  const unsigned int xlen = x.size();
1130  const unsigned int ylen = y.size();
1131  for (unsigned int j = 0; j < xlen; j++)
1132  for (unsigned int i = 0; i < ylen; i++)
1133  result[j*ylen+i] = result_tmp[i*xlen + j];
1134 
1135  delete[] result_tmp;
1136 #else
1137  errorQuda("cDotProductCopy not enabled");
1138 #endif
1139  }
1140 
1141  } // namespace blas
1142 
1143 } // namespace quda