v0.9.0/doc/blas__quda_8cu_source.html

 #include <stdlib.h>
 #include <stdio.h>
 #include <cstring> // needed for memset


 #include <tune_quda.h>
 #include <typeinfo>

 #include <quda_internal.h>
 #include <float_vector.h>
 #include <blas_quda.h>
 #include <color_spinor_field.h>
 #include <color_spinor_field_order.h>

 // Reports through errorQuda when fields a and b differ in precision, length
 // or stride. The arguments are parenthesized so that arbitrary expressions
 // (e.g. *ptr) can be passed; note that each argument is still evaluated
 // several times, so it must be free of side effects.
 #define checkSpinor(a, b)           \
   {                 \
     if ((a).Precision() != (b).Precision())         \
       errorQuda("precisions do not match: %d %d", (a).Precision(), (b).Precision()); \
     if ((a).Length() != (b).Length())         \
       errorQuda("lengths do not match: %lu %lu", (a).Length(), (b).Length()); \
     if ((a).Stride() != (b).Stride())         \
       errorQuda("strides do not match: %d %d", (a).Stride(), (b).Stride()); \
   }

 // Reports through errorQuda when fields a and b differ in length or stride
 // (precision is deliberately not compared here, unlike checkSpinor).
 // The arguments are parenthesized so that arbitrary expressions (e.g. *ptr)
 // can be passed; each argument is still evaluated several times.
 #define checkLength(a, b)           \
   {                 \
     if ((a).Length() != (b).Length())         \
       errorQuda("lengths do not match: %lu %lu", (a).Length(), (b).Length()); \
     if ((a).Stride() != (b).Stride())         \
       errorQuda("strides do not match: %d %d", (a).Stride(), (b).Stride()); \
   }

 namespace quda {

   namespace blas {

 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
 #include <texture.h>

     // Namespace-scope counters (zero-initialized as objects with static storage).
     // NOTE(review): nothing in this file updates them; presumably the kernels
     // pulled in from the blas_core headers accumulate into them -- confirm.
     unsigned long long flops;
     unsigned long long bytes;

     /**
        Invoke the zero() member of the field's concrete class.
        A field whose dynamic type is exactly cudaColorSpinorField is handled
        as such; every other field is treated as a cpuColorSpinorField.
     */
     void zero(ColorSpinorField &a) {
       const bool isCudaField = (typeid(a) == typeid(cudaColorSpinorField));

       if (!isCudaField) {
         static_cast<cpuColorSpinorField&>(a).zero();
         return;
       }

       static_cast<cudaColorSpinorField&>(a).zero();
     }

     // Stream handle assigned in init() (last entry of the streams array) and
     // handed out by getStream().
     static cudaStream_t *blasStream;

     // File-local string storage; aux_tmp is sized by TuneKey::aux_n.
     // NOTE(review): not referenced in this file -- presumably used by the code
     // included from blas_core.cuh when building tuning keys; confirm.
     static struct {
       const char *vol_str;
       const char *aux_str;
       char aux_tmp[TuneKey::aux_n];
     } blasStrings;

     // Forward declarations of routines that are not defined in this file;
     // they are called from init() and end() respectively.
     void initReduce();
     void endReduce();

     /**
        Point blasStream at the last entry of the streams array
        (index Nstream-1) and then call initReduce().
     */
     void init()
     {
       const int lastStream = Nstream - 1;
       blasStream = streams + lastStream;

       initReduce();
     }

     // Counterpart of init(): forwards to endReduce(). blasStream is left as is.
     void end(void)
     {
       endReduce();
     }

     // Returns the stream pointer stored by init(); no check is made that init() has run.
     cudaStream_t* getStream() { return blasStream; }

 #include <blas_core.cuh>

 #include <blas_core.h>
 #include <blas_mixed_core.h>

     /**
        Common base of the element-wise functors defined in this file.
        Float2 is the type of the scalar coefficients stored by the derived
        functors, FloatN the vector type passed to operator().

        NOTE(review): the members are virtual although the functors are used in
        device code; this is only safe if they are always invoked through their
        concrete type -- confirm against the kernels in blas_core.cuh.
     */
     template <typename Float2, typename FloatN>
     struct BlasFunctor {

       // pre-computation hook; the default does nothing
       virtual __device__ __host__ void init() { ; }

       // the per-element update, operating in place on x, y, z and w
       virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w) = 0;
     };

     /**
        Functor for the real linear combination y = a*x + b*y.
        Only the .x parts of a and b are used; c, z and w are ignored.
     */
     template <typename Float2, typename FloatN>
     struct axpby_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       axpby_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y = a.x*x + b.x*y;
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 3; }   // two multiplies and one add
     };

     /**
        y = a*x + b*y with real a and b. When x and y differ in precision the
        mixed:: variant of blasCuda is used, otherwise the regular one.
     */
     void axpby(const double &a, ColorSpinorField &x, const double &b, ColorSpinorField &y) {
       const double2 coeffA = make_double2(a, 0.0);
       const double2 coeffB = make_double2(b, 0.0);
       const double2 none = make_double2(0.0, 0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<axpby_,0,1,0,0>(coeffA, coeffB, none, x, y, x, x);
       } else {
         mixed::blasCuda<axpby_,0,1,0,0>(coeffA, coeffB, none, x, y, x, x);
       }
     }

     /**
        Functor for y += x. The coefficient arguments are accepted only to
        match the common constructor signature and are not stored.
     */
     template <typename Float2, typename FloatN>
     struct xpy_ : public BlasFunctor<Float2,FloatN> {
       xpy_(const Float2 &a, const Float2 &b, const Float2 &c) { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y += x ;
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 1; }   // a single add
     };

     /**
        y += x. Fields of differing precision go through mixed::blasCuda,
        fields of equal precision through blasCuda.
     */
     void xpy(ColorSpinorField &x, ColorSpinorField &y) {
       const double2 one = make_double2(1.0, 0.0);
       const double2 none = make_double2(0.0, 0.0);
       const bool samePrecision = (x.Precision() == y.Precision());

       if (samePrecision) {
         blasCuda<xpy_,0,1,0,0>(one, one, none, x, y, x, x);
       } else {
         mixed::blasCuda<xpy_,0,1,0,0>(one, one, none, x, y, x, x);
       }
     }

     /**
        Functor for y = a*x + y with a real scale factor (only a.x is used).
     */
     template <typename Float2, typename FloatN>
     struct axpy_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;

       axpy_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y = a.x*x + y;
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 2; }   // one multiply and one add
     };

     /**
        y = a*x + y with real a. Fields of differing precision are routed to
        mixed::blasCuda, all others to blasCuda.
     */
     void axpy(const double &a, ColorSpinorField &x, ColorSpinorField &y) {
       const double2 scale = make_double2(a, 0.0);
       const double2 one = make_double2(1.0, 0.0);
       const double2 none = make_double2(0.0, 0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<axpy_,0,1,0,0>(scale, one, none, x, y, x, x);
       } else {
         mixed::blasCuda<axpy_,0,1,0,0>(scale, one, none, x, y, x, x);
       }
     }

     /**
        Functor for z = x + a*y with a real scale factor (only a.x is used).
        Shared by xpay() and xpayz().
     */
     template <typename Float2, typename FloatN>
     struct xpayz_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;

       xpayz_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         z = x + a.x*y;
       }

       static int streams() { return 3; } // x and y read, z written
       static int flops() { return 2; }   // one multiply and one add
     };

     /**
        y = x + a*y. Implemented with the xpayz_ functor by passing y in both
        the y and the z slot, so the result lands back in y.
     */
     void xpay(ColorSpinorField &x, const double &a, ColorSpinorField &y) {
       const double2 scale = make_double2(a,0.0);
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<xpayz_,0,0,1,0>(scale, none, none, x, y, y, x);
     }

     /**
        z = x + a*y with real a; x and y are inputs, z receives the result.
     */
     void xpayz(ColorSpinorField &x, const double &a, ColorSpinorField &y, ColorSpinorField &z) {
       const double2 scale = make_double2(a,0.0);
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<xpayz_,0,0,1,0>(scale, none, none, x, y, z, x);
     }

     /**
        Functor for y -= x. None of the coefficient arguments is stored.
     */
     template <typename Float2, typename FloatN>
     struct mxpy_ : public BlasFunctor<Float2,FloatN> {
       mxpy_(const Float2 &a, const Float2 &b, const Float2 &c) { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y -= x;
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 1; }   // a single subtract
     };

     /**
        y -= x. There is no mixed-precision branch here; blasCuda is always used.
     */
     void mxpy(ColorSpinorField &x, ColorSpinorField &y) {
       const double2 one = make_double2(1.0, 0.0);
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<mxpy_,0,1,0,0>(one, one, none, x, y, x, x);
     }

     /**
        Functor for the in-place real scaling x *= a (only a.x is used).
     */
     template <typename Float2, typename FloatN>
     struct ax_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;

       ax_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         x *= a.x;
       }

       static int streams() { return 2; } // x read and written
       static int flops() { return 1; }   // a single multiply
     };

     /**
        x *= a with real a. The field x is passed in all four operand slots
        because the functor touches only x.
     */
     void ax(const double &a, ColorSpinorField &x) {
       const double2 scale = make_double2(a, 0.0);
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<ax_,1,0,0,0>(scale, none, none, x, x, x, x);
     }


     // Complex axpy helpers: y += a*x, where a = (a.x, a.y) is one complex number
     // stored as (real, imaginary). Each update is written as two separate
     // accumulations per component; keep that statement order.

     // float4 overload: x and y each hold two complex numbers, (x,y) and (z,w).
     __device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y) {
       y.x += a.x*x.x; y.x -= a.y*x.y;
       y.y += a.y*x.x; y.y += a.x*x.y;
       y.z += a.x*x.z; y.z -= a.y*x.w;
       y.w += a.y*x.z; y.w += a.x*x.w;
     }

     // float2 overload: x and y each hold a single complex number.
     __device__ __host__ void _caxpy(const float2 &a, const float2 &x, float2 &y) {
       y.x += a.x*x.x; y.x -= a.y*x.y;
       y.y += a.y*x.x; y.y += a.x*x.y;
     }

     // double2 overload: same as the float2 version in double precision.
     __device__ __host__ void _caxpy(const double2 &a, const double2 &x, double2 &y) {
       y.x += a.x*x.x; y.x -= a.y*x.y;
       y.y += a.y*x.x; y.y += a.x*x.y;
     }

     /**
        Functor for y += a*x with a complex coefficient a, implemented with
        the _caxpy helper.
     */
     template <typename Float2, typename FloatN>
     struct caxpy_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;

       caxpy_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, y);
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 4; }
     };

     /**
        y += a*x with complex a. Fields of differing precision are routed to
        mixed::blasCuda, all others to blasCuda.
     */
     void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y) {
       const double2 coeff = make_double2(real(a),imag(a));
       const double2 none = make_double2(0.0, 0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<caxpy_,0,1,0,0>(coeff, none, none, x, y, x, x);
       } else {
         mixed::blasCuda<caxpy_,0,1,0,0>(coeff, none, none, x, y, x, x);
       }
     }


     // Complex axpby helpers: y = a*x + b*y with complex a and b. The result is
     // assembled in the temporary yy and copied to y at the end, because every
     // component of the result needs both original components of y.

     // float4 overload: x and y each hold two complex numbers, (x,y) and (z,w).
     __device__ __host__ void _caxpby(const float2 &a, const float4 &x, const float2 &b, float4 &y)
     { float4 yy;
       yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
       yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
       yy.z = a.x*x.z; yy.z -= a.y*x.w; yy.z += b.x*y.z; yy.z -= b.y*y.w;
       yy.w = a.y*x.z; yy.w += a.x*x.w; yy.w += b.y*y.z; yy.w += b.x*y.w;
       y = yy; }

     // float2 overload: a single complex number per operand.
     __device__ __host__ void _caxpby(const float2 &a, const float2 &x, const float2 &b, float2 &y)
     { float2 yy;
       yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
       yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
       y = yy; }

     // double2 overload: same as the float2 version in double precision.
     __device__ __host__ void _caxpby(const double2 &a, const double2 &x, const double2 &b, double2 &y)
     { double2 yy;
       yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
       yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
       y = yy; }

     /**
        Functor for y = a*x + b*y with complex coefficients a and b,
        implemented with the _caxpby helper.
     */
     template <typename Float2, typename FloatN>
     struct caxpby_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       caxpby_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpby(a, x, b, y);
       }

       static int streams() { return 3; } // x and y read, y written
       static int flops() { return 7; }
     };

     /**
        y = a*x + b*y with complex a and b. Only the same-precision blasCuda
        path is used here.
     */
     void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b), IMAG(b));
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<caxpby_,0,1,0,0>(coeffA, coeffB, none, x, y, x, x);
     }

     // Helpers for z = x + a*y + b*z with complex a and b. The result is built in
     // the temporary zz and stored at the end, because every component of the
     // result needs both original components of z.

     // float4 overload: each operand holds two complex numbers, (x,y) and (z,w).
     __device__ __host__ void _cxpaypbz(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z) {
       float4 zz;
       zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
       zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
       zz.z = x.z + a.x*y.z; zz.z -= a.y*y.w; zz.z += b.x*z.z; zz.z -= b.y*z.w;
       zz.w = x.w + a.y*y.z; zz.w += a.x*y.w; zz.w += b.y*z.z; zz.w += b.x*z.w;
       z = zz;
     }

     // float2 overload: a single complex number per operand.
     __device__ __host__ void _cxpaypbz(const float2 &x, const float2 &a, const float2 &y, const float2 &b, float2 &z) {
       float2 zz;
       zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
       zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
       z = zz;
     }

     // double2 overload: same as the float2 version in double precision.
     __device__ __host__ void _cxpaypbz(const double2 &x, const double2 &a, const double2 &y, const double2 &b, double2 &z) {
       double2 zz;
       zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
       zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
       z = zz;
     }

     /**
        Functor for z = x + a*y + b*z with complex coefficients a and b,
        implemented with the _cxpaypbz helper.
     */
     template <typename Float2, typename FloatN>
     struct cxpaypbz_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       cxpaypbz_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _cxpaypbz(x, a, y, b, z);
       }

       static int streams() { return 4; } // x, y and z read, z written
       static int flops() { return 8; }
     };

     /**
        z = x + a*y + b*z with complex a and b. z is also passed in the unused
        fourth operand slot.
     */
     void cxpaypbz(ColorSpinorField &x, const Complex &a, ColorSpinorField &y,
                   const Complex &b, ColorSpinorField &z) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b), IMAG(b));
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<cxpaypbz_,0,0,1,0>(coeffA, coeffB, none, x, y, z, z);
     }

     /**
        Functor for the fused real update
          y += a*x;  x = b*z + c*x;
        The y update runs first and therefore sees the old x.
     */
     template <typename Float2, typename FloatN>
     struct axpyBzpcx_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;
       const Float2 c;

       axpyBzpcx_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b), c(c)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y += a.x*x;
         x = b.x*z + c.x*x;
       }

       static int streams() { return 5; } // x, y and z read, x and y written
       static int flops() { return 5; }
     };

     /**
        Fused update y += a*x, x = b*z + c*x with real a, b and c.
        The mixed:: variant of blasCuda is chosen when x and y differ in
        precision; both variants receive the same arguments.
     */
     void axpyBzpcx(const double &a, ColorSpinorField& x, ColorSpinorField& y, const double &b,
                    ColorSpinorField& z, const double &c) {
       const double2 coeffA = make_double2(a,0.0);
       const double2 coeffB = make_double2(b,0.0);
       const double2 coeffC = make_double2(c,0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<axpyBzpcx_,1,1,0,0>(coeffA, coeffB, coeffC, x, y, z, x);
       } else {
         mixed::blasCuda<axpyBzpcx_,1,1,0,0>(coeffA, coeffB, coeffC, x, y, z, x);
       }
     }


     /**
        Functor for the fused real update
          y += a*x;  x = z + b*x;
        The y update runs first and therefore sees the old x.
     */
     template <typename Float2, typename FloatN>
     struct axpyZpbx_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       axpyZpbx_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y += a.x*x;
         x = z + b.x*x;
       }

       static int streams() { return 5; } // x, y and z read, x and y written
       static int flops() { return 4; }
     };

     /**
        Fused update y += a*x, x = z + b*x with real a and b.
        The mixed:: variant of blasCuda is chosen when x and y differ in
        precision; both variants receive the same arguments.
     */
     void axpyZpbx(const double &a, ColorSpinorField& x, ColorSpinorField& y,
                   ColorSpinorField& z, const double &b) {
       const double2 coeffA = make_double2(a,0.0);
       const double2 coeffB = make_double2(b,0.0);
       const double2 none = make_double2(0.0,0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<axpyZpbx_,1,1,0,0>(coeffA, coeffB, none, x, y, z, x);
       } else {
         mixed::blasCuda<axpyZpbx_,1,1,0,0>(coeffA, coeffB, none, x, y, z, x);
       }
     }

     /**
        Functor for the fused complex update
          y += a*x;  x += b*z;
        The y update runs first and therefore sees the old x.
     */
     template <typename Float2, typename FloatN>
     struct caxpyBzpx_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       caxpyBzpx_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, y);
         _caxpy(b, z, x);
       }

       static int streams() { return 5; } // x, y and z read, x and y written
       static int flops() { return 8; }   // two complex axpys
     };

     /**
        Fused update y += a*x, x += b*z with complex a and b.
        The mixed:: variant of blasCuda is chosen when x and y differ in precision.
     */
     void caxpyBzpx(const Complex &a, ColorSpinorField &x,
                    ColorSpinorField &y, const Complex &b, ColorSpinorField &z) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b), IMAG(b));
       const double2 none = make_double2(0.0,0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<caxpyBzpx_,1,1,0,0>(coeffA, coeffB, none, x, y, z, x);
       } else {
         mixed::blasCuda<caxpyBzpx_,1,1,0,0>(coeffA, coeffB, none, x, y, z, x);
       }
     }

     /**
        Functor for the fused complex update
          y += a*x;  z += b*x;
        x itself is only read.
     */
     template <typename Float2, typename FloatN>
     struct caxpyBxpz_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       caxpyBxpz_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, y);
         _caxpy(b, x, z);
       }

       static int streams() { return 5; } // x, y and z read, y and z written
       static int flops() { return 8; }   // two complex axpys
     };

     /**
        Fused update y += a*x, z += b*x with complex a and b.
        The mixed:: variant of blasCuda is chosen when x and y differ in precision.
     */
     void caxpyBxpz(const Complex &a, ColorSpinorField &x,
                    ColorSpinorField &y, const Complex &b, ColorSpinorField &z) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b), IMAG(b));
       const double2 none = make_double2(0.0,0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<caxpyBxpz_,0,1,1,0>(coeffA, coeffB, none, x, y, z, x);
       } else {
         mixed::blasCuda<caxpyBxpz_,0,1,1,0>(coeffA, coeffB, none, x, y, z, x);
       }
     }

     /**
        Functor for the fused complex update
          z += a*x + b*y;  y -= b*w;
        z is accumulated from the old y before y itself is modified.
     */
     template <typename Float2, typename FloatN>
     struct caxpbypzYmbw_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       caxpbypzYmbw_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, z);
         _caxpy(b, y, z);
         _caxpy(-b, w, y);
       }

       static int streams() { return 6; } // x, y, z and w read, y and z written
       static int flops() { return 12; }  // three complex axpys
     };

     /**
        Fused update z += a*x + b*y, y -= b*w with complex a and b.
        Only the same-precision blasCuda path is used here.
     */
     void caxpbypzYmbw(const Complex &a, ColorSpinorField &x, const Complex &b,
                       ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b), IMAG(b));
       const double2 none = make_double2(0.0,0.0);
       blasCuda<caxpbypzYmbw_,0,1,1,0>(coeffA, coeffB, none, x, y, z, w);
     }

     /**
        Functor for the fused update
          x *= a;  y += b*x;
        with real a (only a.x is used) and complex b. The y update uses the
        already rescaled x.
     */
     template <typename Float2, typename FloatN>
     struct cabxpyAx_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       cabxpyAx_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         x *= a.x;
         _caxpy(b, x, y);
       }

       static int streams() { return 4; } // x and y read, x and y written
       static int flops() { return 5; }   // one real scaling plus one complex axpy
     };

     /**
        Fused update x *= a, y += b*x with real a and complex b.
        x fills the two unused operand slots.
     */
     void cabxpyAx(const double &a, const Complex &b,
                   ColorSpinorField &x, ColorSpinorField &y) {
       const double2 scale = make_double2(a,0.0);
       const double2 coeffB = make_double2(REAL(b),IMAG(b));
       const double2 none = make_double2(0.0,0.0);
       blasCuda<cabxpyAx_,1,1,0,0>(scale, coeffB, none, x, y, x, x);
     }

     /**
        Functor for z += a*x + b*y with complex coefficients a and b,
        performed as two successive complex axpys into z.
     */
     template <typename Float2, typename FloatN>
     struct caxpbypz_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;

       caxpbypz_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, z);
         _caxpy(b, y, z);
       }

       static int streams() { return 4; } // x, y and z read, z written
       static int flops() { return 8; }   // two complex axpys
     };

     /**
        z += a*x + b*y with complex a and b. z is also passed in the unused
        fourth operand slot.
     */
     void caxpbypz(const Complex &a, ColorSpinorField &x, const Complex &b,
                   ColorSpinorField &y, ColorSpinorField &z) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b),IMAG(b));
       const double2 none = make_double2(0.0,0.0);
       blasCuda<caxpbypz_,0,0,1,0>(coeffA, coeffB, none, x, y, z, z);
     }

     /**
        Functor for w += a*x + b*y + c*z with complex coefficients a, b and c,
        performed as three successive complex axpys into w. x, y and z are
        only read.
     */
     template <typename Float2, typename FloatN>
     struct caxpbypczpw_ : public BlasFunctor<Float2,FloatN> {
       const Float2 a;
       const Float2 b;
       const Float2 c;
       caxpbypczpw_(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       { _caxpy(a, x, w); _caxpy(b, y, w); _caxpy(c, z, w); }

       // x, y, z and w are all read and w is written back, i.e. five streams.
       // The previous value of 4 left out the load of w and so under-reported
       // the memory traffic (compare caxpbypz_: three loads + one store = 4).
       static int streams() { return 5; }
       static int flops() { return 12; } // three complex axpys
     };

     /**
        w += a*x + b*y + c*z with complex a, b and c.
        Only the same-precision blasCuda path is used here.
     */
     void caxpbypczpw(const Complex &a, ColorSpinorField &x, const Complex &b,
                      ColorSpinorField &y, const Complex &c, ColorSpinorField &z,
                      ColorSpinorField &w) {
       const double2 coeffA = make_double2(REAL(a),IMAG(a));
       const double2 coeffB = make_double2(REAL(b),IMAG(b));
       const double2 coeffC = make_double2(REAL(c),IMAG(c));
       blasCuda<caxpbypczpw_,0,0,0,1>(coeffA, coeffB, coeffC, x, y, z, w);
     }

     /**
        Functor for the fused complex update
          y += a*x;  x -= a*z;
        The y update runs first and therefore sees the old x.
     */
     template <typename Float2, typename FloatN>
     struct caxpyxmaz_ : public BlasFunctor<Float2,FloatN> {
       Float2 a;

       caxpyxmaz_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         _caxpy(a, x, y);
         _caxpy(-a, z, x);
       }

       static int streams() { return 5; } // x, y and z read, x and y written
       static int flops() { return 8; }   // two complex axpys
     };

     /**
        Fused update y += a*x, x -= a*z with complex a.
        Only the same-precision blasCuda path is used here.
     */
     void caxpyXmaz(const Complex &a, ColorSpinorField &x,
                    ColorSpinorField &y, ColorSpinorField &z) {
       const double2 coeff = make_double2(REAL(a), IMAG(a));
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<caxpyxmaz_,1,1,0,0>(coeff, none, none, x, y, z, x);
     }

     /**
        Variant of caxpyxmaz_ (y += a*x; x -= a*z) whose coefficient is finished
        in device code: init() combines the a passed to the constructor with a
        double3 read from the buffer returned by blas::getDeviceReduceBuffer().
        NOTE(review): presumably that double3 is the result of a preceding
        asynchronous reduction (two numerator components and a norm) -- confirm.
     */
     template <typename Float2, typename FloatN>
     struct caxpyxmazMR_ : public BlasFunctor<Float2,FloatN> {
       Float2 a;      // non-const: rewritten by init() in device code
       double3 *Ar3;  // pointer obtained from blas::getDeviceReduceBuffer()
       caxpyxmazMR_(const Float2 &a, const Float2 &b, const Float2 &c)
   : a(a), Ar3(static_cast<double3*>(blas::getDeviceReduceBuffer())) { ; }

       // Device code only (host compilation leaves a untouched): sets
       //   a.y = a.x * result.y / result.z,  a.x = a.x * result.x / result.z.
       // a.y must be assigned first because both lines need the original a.x.
       inline __device__ __host__ void init() {
 #ifdef __CUDA_ARCH__
   typedef decltype(a.x) real;
   double3 result = __ldg(Ar3);
   a.y = a.x * (real)(result.y) * ((real)1.0 / (real)result.z);
   a.x = a.x * (real)(result.x) * ((real)1.0 / (real)result.z);
 #endif
       }

       // y += a*x, then x -= a*z (the y update sees the old x)
       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       { _caxpy(a, x, y); _caxpy(-a, z, x); }

       static int streams() { return 5; } // x, y and z read, x and y written
       static int flops() { return 8; } // two complex axpys
     };

     /**
        Launch the caxpyxmazMR_ functor (y += a*x, x -= a*z with a coefficient
        that the functor completes in device code).
        Two preconditions are checked first, in this order: asynchronous
        reductions must be enabled and x must not be a CPU field; a violation
        is reported through errorQuda.
     */
     void caxpyXmazMR(const Complex &a, ColorSpinorField &x,
                      ColorSpinorField &y, ColorSpinorField &z) {
       if (!commAsyncReduction()) {
         errorQuda("This kernel requires asynchronous reductions to be set");
       }
       if (x.Location() == QUDA_CPU_FIELD_LOCATION) {
         errorQuda("This kernel cannot be run on CPU fields");
       }

       const double2 coeff = make_double2(REAL(a), IMAG(a));
       const double2 none = make_double2(0.0, 0.0);
       blasCuda<caxpyxmazMR_,1,1,0,0>(coeff, none, none, x, y, z, x);
     }

     /**
        Functor for the fused real update
          y += a*w;  z -= a*x;  w = z + b*w;
        Only a.x and b.x are used. The statements are order dependent: y uses
        the old w, and the new w is built from the already updated z.
     */
     template <typename Float2, typename FloatN>
     struct tripleCGUpdate_ : public BlasFunctor<Float2,FloatN> {
       Float2 a, b;

       tripleCGUpdate_(const Float2 &a, const Float2 &b, const Float2 &c)
         : a(a), b(b)
       { }

       __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
       {
         y += a.x*w;
         z -= a.x*x;
         w = z + b.x*w;
       }

       static int streams() { return 7; } // x, y, z and w read, y, z and w written
       static int flops() { return 6; }   // three multiply-add pairs
     };

     /**
        Fused update y += a*w, z -= a*x, w = z + b*w with real a and b.
        The mixed:: variant of blasCuda is chosen when x and y differ in precision.
     */
     void tripleCGUpdate(const double &a, const double &b, ColorSpinorField &x,
                         ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w) {
       const double2 coeffA = make_double2(a,0.0);
       const double2 coeffB = make_double2(b,0.0);
       const double2 none = make_double2(0.0,0.0);

       if (x.Precision() == y.Precision()) {
         blasCuda<tripleCGUpdate_,0,1,1,1>(coeffA, coeffB, none, x, y, z, w);
       } else {
         mixed::blasCuda<tripleCGUpdate_,0,1,1,1>(coeffA, coeffB, none, x, y, z, w);
       }
     }

   } // namespace blas

 } // namespace quda
quda::blas::caxpbypzYmbw_::caxpbypzYmbw_
caxpbypzYmbw_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:456

quda::blas::caxpbypczpw_
Definition: blas_quda.cu:515

quda::blas::tripleCGUpdate_::tripleCGUpdate_
tripleCGUpdate_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:603

quda::blas::xpy_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:125

quda::blas::axpyZpbx_::streams
static int streams()
Definition: blas_quda.cu:380

quda::blas::aux_str
const char * aux_str
Definition: blas_quda.cu:57

quda::blas::axpyBzpcx_::axpyBzpcx_
axpyBzpcx_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:349

quda::blas::caxpbypzYmbw_::b
const Float2 b
Definition: blas_quda.cu:455

quda::blas::caxpyxmazMR_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:576

quda::blas::axpby_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:100

quda::blas::xpay
void xpay(ColorSpinorField &x, const double &a, ColorSpinorField &y)
Definition: blas_quda.cu:173

quda::blas::_caxpby
__device__ __host__ void _caxpby(const float2 &a, const float4 &x, const float2 &b, float4 &y)
Definition: blas_quda.cu:261

quda::blas::caxpyXmazMR
void caxpyXmazMR(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: blas_quda.cu:583

quda::blas::axpyZpbx_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:378

float_vector.h

quda::blas::axpyZpbx_::a
const Float2 a
Definition: blas_quda.cu:375

quda::blas::axpby_::a
const Float2 a
Definition: blas_quda.cu:97

commAsyncReduction
bool commAsyncReduction()
Definition: comm_common.cpp:684

quda::blas::axpyBzpcx_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:350

quda::blas::caxpbypczpw_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:524

quda::ColorSpinorField
Definition: color_spinor_field.h:271

quda::blas::aux_tmp
char aux_tmp[TuneKey::aux_n]
Definition: blas_quda.cu:58

quda::blas::ax_
Definition: blas_quda.cu:201

quda::blas::axpyBzpcx_::c
const Float2 c
Definition: blas_quda.cu:348

quda::blas::caxpyBxpz_::a
const Float2 a
Definition: blas_quda.cu:428

quda::blas::caxpbypz_::streams
static int streams()
Definition: blas_quda.cu:501

quda::blas::caxpbypczpw_::b
const Float2 b
Definition: blas_quda.cu:517

quda::blas::caxpyBzpx_::a
const Float2 a
Definition: blas_quda.cu:402

quda::blas::axpy_::a
const Float2 a
Definition: blas_quda.cu:143

quda::blas::end
void end(void)
Definition: blas_quda.cu:70

quda::blas::caxpby_::b
const Float2 b
Definition: blas_quda.cu:284

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

color_spinor_field.h

quda::blas::init
void init()
Definition: blas_quda.cu:64

quda::blas::caxpy_
Definition: blas_quda.cu:237

quda::blas::xpy_::streams
static int streams()
Definition: blas_quda.cu:124

quda::blas::caxpyxmaz_::caxpyxmaz_
caxpyxmaz_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:542

quda::blas::caxpyBzpx_::b
const Float2 b
Definition: blas_quda.cu:403

quda::Complex
std::complex< double > Complex
Definition: eig_variables.h:13

streams
cudaStream_t * streams
Definition: interface_quda.cpp:153

quda::blas::caxpbypz_
Definition: blas_quda.cu:495

quda::blas::caxpyBzpx_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:409

quda::blas::xpayz
void xpayz(ColorSpinorField &x, const double &a, ColorSpinorField &y, ColorSpinorField &z)
Definition: blas_quda.cu:177

quda::blas::tripleCGUpdate_::a
Float2 a
Definition: blas_quda.cu:602

quda::blas::cxpaypbz_::a
const Float2 a
Definition: blas_quda.cu:326

quda::blas::caxpyBzpx_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:405

quda::Nstream
const int Nstream
Definition: quda_internal.h:330

quda::blas::cxpaypbz_
Definition: blas_quda.cu:325

quda::blas::axpyBzpcx_::b
const Float2 b
Definition: blas_quda.cu:347

quda::blas::caxpyBzpx_::caxpyBzpx_
caxpyBzpx_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:404

z
int int z
Definition: CMakeCUDACompilerId.cpp1.ii:2637

quda::blas::mxpy_::mxpy_
mxpy_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:186

quda::blas::caxpbypz_::b
const Float2 b
Definition: blas_quda.cu:497

blas_core.cuh

quda::blas::ax
void ax(const double &a, ColorSpinorField &x)
Definition: blas_quda.cu:209

quda::cudaColorSpinorField
Definition: color_spinor_field.h:504

quda::blas::axpby_::streams
static int streams()
Definition: blas_quda.cu:102

quda::blas::axpyBzpcx_::a
const Float2 a
Definition: blas_quda.cu:346

quda::blas::caxpby_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:289

quda::blas::caxpyBxpz_::caxpyBxpz_
caxpyBxpz_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:430

quda::blas::caxpbypzYmbw_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:457

quda::blas::caxpy_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:240

quda::blas::caxpbypczpw_::caxpbypczpw_
caxpbypczpw_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:519

quda::blas::BlasFunctor::operator()
virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)=0
where the reduction is usually computed and any auxiliary operations

quda::blas::caxpyxmazMR_::Ar3
double3 * Ar3
Definition: blas_quda.cu:563

quda::blas::xpayz_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:170

quda::blas::xpy_
Definition: blas_quda.cu:121

quda::blas::caxpyBzpx
void caxpyBzpx(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:412

texture.h

quda
Definition: blas_cublas.h:6

quda::blas::xpy_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:123

quda::blas::caxpbypzYmbw_::streams
static int streams()
Definition: blas_quda.cu:460

quda::blas::axpy_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:147

quda::blas::caxpyBxpz
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:438

quda::blas::axpyBzpcx_::streams
static int streams()
Definition: blas_quda.cu:352

quda::blas::xpayz_::xpayz_
xpayz_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:167

quda::blas::_caxpy
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
Definition: blas_quda.cu:219

quda::blas::caxpby_::streams
static int streams()
Definition: blas_quda.cu:288

quda::blas::caxpyxmaz_
Definition: blas_quda.cu:540

quda::blas::axpyZpbx_
Definition: blas_quda.cu:374

quda::blas::cabxpyAx_::b
const Float2 b
Definition: blas_quda.cu:476

b
#define b
Definition: dw_dslash4_core.h:83

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::blas::getStream
cudaStream_t * getStream()
Definition: blas_quda.cu:75

quda::blas::caxpyxmaz_::a
Float2 a
Definition: blas_quda.cu:541

quda::blas::caxpy_::caxpy_
caxpy_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:239

quda::blas::cabxpyAx
void cabxpyAx(const double &a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:484

quda::blas::cabxpyAx_
Definition: blas_quda.cu:474

quda::blas::tripleCGUpdate_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:604

blas_core.h

quda::blas::axpyZpbx_::b
const Float2 b
Definition: blas_quda.cu:376

IMAG
#define IMAG(a)
Definition: blas_quda.h:14

quda::blas::BlasFunctor::init
virtual __device__ __host__ void init()
pre-computation routine before the main loop
Definition: blas_quda.cu:86

quda::blas::blasStream
static cudaStream_t * blasStream
Definition: blas_quda.cu:53

quda::blas::caxpbypczpw_::streams
static int streams()
Definition: blas_quda.cu:523

quda::blas::axpby_
Definition: blas_quda.cu:96

blas_mixed_core.h

quda::blas::axpyZpbx
void axpyZpbx(const double &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, const double &b)
Definition: blas_quda.cu:384

quda::blas::caxpyBxpz_::streams
static int streams()
Definition: blas_quda.cu:434

quda::blas::cabxpyAx_::flops
static int flops()
total number of input and output streams
Definition: blas_quda.cu:481

quda::blas::caxpbypz_::caxpbypz_
caxpbypz_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:498

quda::blas::blasStrings
static struct quda::blas::@4 blasStrings

quda::blas::caxpyBzpx_::streams
static int streams()
Definition: blas_quda.cu:408

quda::blas::mxpy_::streams
static int streams()
Definition: blas_quda.cu:188

w
int int int w
Definition: CMakeCUDACompilerId.cpp1.ii:2637

quda::blas::caxpbypzYmbw
void caxpbypzYmbw(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &, ColorSpinorField &)
Definition: blas_quda.cu:464

quda::blas::caxpyxmazMR_
Definition: blas_quda.cu:561

quda::blas::axpyBzpcx_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:353

quda::blas::mxpy_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:187

quda::blas::caxpbypz_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:502

quda::blas::initReduce
void initReduce()
Definition: reduce_quda.cu:78

quda::blas::cxpaypbz_::streams
static int streams()
Definition: blas_quda.cu:331

quda::blas::tripleCGUpdate
void tripleCGUpdate(const double &alpha, const double &beta, ColorSpinorField &q, ColorSpinorField &r, ColorSpinorField &x, ColorSpinorField &p)
Definition: blas_quda.cu:610

quda::blas::cxpaypbz_::cxpaypbz_
cxpaypbz_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:328

quda::blas::caxpy_::a
const Float2 a
Definition: blas_quda.cu:238

quda::blas::caxpyBxpz_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:435

quda::blas::caxpyBxpz_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:431

quda::blas::axpyZpbx_::axpyZpbx_
axpyZpbx_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:377

quda::blas::cabxpyAx_::streams
static int streams()
Definition: blas_quda.cu:480

quda::blas::axpyZpbx_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:381

quda::blas::caxpby_::a
const Float2 a
Definition: blas_quda.cu:283

quda::blas::caxpbypzYmbw_::a
const Float2 a
Definition: blas_quda.cu:454

quda::blas::caxpbypzYmbw_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:461

quda::blas::caxpy
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:246

quda::blas::cabxpyAx_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:478

quda::blas::caxpyBxpz_
Definition: blas_quda.cu:427

quda::blas::zero
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:45

quda::blas::vol_str
const char * vol_str
Definition: blas_quda.cu:56

quda::blas::ax_::ax_
ax_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:203

tune_quda.h

quda::blas::caxpbypczpw
void caxpbypczpw(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &)
Definition: blas_quda.cu:527

quda::blas::axpy
void axpy(const double &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:150

quda::blas::xpayz_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:168

quda::blas::caxpy_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:243

REAL
#define REAL(a)
Definition: blas_quda.h:13

blas_quda.h

quda::blas::cabxpyAx_::cabxpyAx_
cabxpyAx_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:477

quda::blas::caxpbypczpw_::a
const Float2 a
Definition: blas_quda.cu:516

quda::blas::axpby_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:103

quda::blas::caxpyxmazMR_::init
__device__ __host__ void init()
pre-computation routine before the main loop
Definition: blas_quda.cu:567

quda::blas::xpayz_
Definition: blas_quda.cu:165

quda::blas::axpby
void axpby(const double &a, ColorSpinorField &x, const double &b, ColorSpinorField &y)
Definition: blas_quda.cu:106

quda::blas::axpby_::axpby_
axpby_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:99

quda::blas::tripleCGUpdate_::streams
static int streams()
Definition: blas_quda.cu:606

quda::blas::caxpyxmazMR_::a
Float2 a
Definition: blas_quda.cu:562

quda::blas::caxpbypz
void caxpbypz(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &)
Definition: blas_quda.cu:505

quda::blas::mxpy_
Definition: blas_quda.cu:185

quda::blas::caxpyxmazMR_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:580

quda::blas::caxpyBxpz_::b
const Float2 b
Definition: blas_quda.cu:429

quda::blas::xpy_::xpy_
xpy_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:122

quda::blas::caxpby_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:286

quda::blas::xpayz_::a
const Float2 a
Definition: blas_quda.cu:166

quda::blas::caxpyxmaz_::streams
static int streams()
Definition: blas_quda.cu:545

quda::blas::caxpy_::streams
static int streams()
Definition: blas_quda.cu:242

quda::blas::axpyBzpcx
void axpyBzpcx(const double &a, ColorSpinorField &x, ColorSpinorField &y, const double &b, ColorSpinorField &z, const double &c)
Definition: blas_quda.cu:356

quda::blas::caxpbypz_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:499

color_spinor_field_order.h

quda::blas::caxpyXmaz
void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: blas_quda.cu:549

quda::blas::cxpaypbz_::b
const Float2 b
Definition: blas_quda.cu:327

quda::TuneKey::aux_n
static const int aux_n
Definition: tune_key.h:12

quda::blas::getDeviceReduceBuffer
void * getDeviceReduceBuffer()
Definition: reduce_quda.cu:73

quda::blas::_cxpaypbz
__device__ __host__ void _cxpaypbz(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z)
Definition: blas_quda.cu:301

quda::blas::mxpy_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:189

quda::blas::axpy_::axpy_
axpy_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:144

quda::blas::axpy_
Definition: blas_quda.cu:142

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

quda::blas::xpy
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:128

quda::blas::tripleCGUpdate_
Definition: blas_quda.cu:601

quda::blas::tripleCGUpdate_::b
Float2 b
Definition: blas_quda.cu:602

quda::blas::ax_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:204

quda::blas::caxpyxmaz_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:546

quda::blas::caxpby
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
Definition: blas_quda.cu:292

quda::blas::caxpyxmazMR_::caxpyxmazMR_
caxpyxmazMR_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:564

c
const void * c
Definition: CMakeCUDACompilerId.cpp1.ii:2234

quda::blas::mxpy
void mxpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:192

quda::blas::BlasFunctor
Definition: blas_quda.cu:83

quda::blas::axpyBzpcx_
Definition: blas_quda.cu:345

quda::blas::caxpbypz_::a
const Float2 a
Definition: blas_quda.cu:496

quda::blas::cxpaypbz
void cxpaypbz(ColorSpinorField &, const Complex &b, ColorSpinorField &y, const Complex &c, ColorSpinorField &z)
Definition: blas_quda.cu:335

quda::blas::caxpbypczpw_::c
const Float2 c
Definition: blas_quda.cu:518

quda::blas::ax_::streams
static int streams()
Definition: blas_quda.cu:205

quda::blas::axpy_::streams
static int streams()
Definition: blas_quda.cu:146

quda::blas::cxpaypbz_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:332

quda::blas::axpy_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:145

quda::blas::axpby_::b
const Float2 b
Definition: blas_quda.cu:98

quda::blas::xpayz_::streams
static int streams()
Definition: blas_quda.cu:169

quda::blas::caxpbypzYmbw_
Definition: blas_quda.cu:453

quda::blas::caxpbypczpw_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:520

quda::blas::caxpyxmazMR_::streams
static int streams()
Definition: blas_quda.cu:579

quda::blas::cxpaypbz_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:329

quda::blas::cabxpyAx_::a
const Float2 a
Definition: blas_quda.cu:475

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:296

quda::blas::ax_::a
const Float2 a
Definition: blas_quda.cu:202

a
#define a
Definition: dw_dslash4_core.h:82

quda::blas::ax_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:206

quda::blas::endReduce
void endReduce()
Definition: reduce_quda.cu:134

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda::cpuColorSpinorField
Definition: color_spinor_field.h:789

quda::blas::caxpby_::caxpby_
caxpby_(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:285

quda::blas::caxpyBzpx_
Definition: blas_quda.cu:401

y
int y
Definition: CMakeCUDACompilerId.cpp1.ii:2637

quda::blas::tripleCGUpdate_::flops
static int flops()
number of floating-point operations (flops) performed by this functor
Definition: blas_quda.cu:607

quda_internal.h

quda::blas::caxpby_
Definition: blas_quda.cu:282

quda::blas::caxpyxmaz_::operator()
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w)
where the reduction is usually computed and any auxiliary operations
Definition: blas_quda.cu:543