QUDA  0.9.0
multi_blas_core.cuh
Go to the documentation of this file.
1 
/**
   Parameter struct for the generic multi-blas kernel.

   NXZ is a compile-time count of X/Z vectors; NYW is a run-time count
   of Y/W vectors bounded by MAX_MULTI_BLAS_N (the Y/W arrays are sized
   to the maximum so a single kernel instantiation covers all NYW).
*/
template <int NXZ, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename Functor>
struct MultiBlasArg {
  const int NYW;               // run-time number of Y/W vectors (<= MAX_MULTI_BLAS_N)
  SpinorX X[NXZ];              // input vectors (compile-time count)
  SpinorY Y[MAX_MULTI_BLAS_N]; // output/accumulate vectors
  SpinorZ Z[NXZ];              // input vectors (compile-time count)
  SpinorW W[MAX_MULTI_BLAS_N]; // output/accumulate vectors
  Functor f;                   // the element-wise functor applied by the kernel
  const int length;            // checkerboard length of each vector

  MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f, int NYW, int length)
    : NYW(NYW), f(f), length(length)
  {
    // copy the spinor accessors by value into the arg struct so the
    // kernel receives them by parameter rather than by pointer
    for (int i = 0; i < NXZ; ++i) {
      this->X[i] = X[i];
      this->Z[i] = Z[i];
    }
    for (int i = 0; i < NYW; ++i) {
      this->Y[i] = Y[i];
      this->W[i] = W[i];
    }
  }
};
35 
36 
// storage for matrix coefficients: raw constant-memory buffers into which
// the (padded) coefficient matrices are copied before a kernel launch.
// Declared as signed char so one buffer serves any coefficient type.
#define MAX_MATRIX_SIZE 4096
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE];
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE];
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE];

// host-side shadows of the coefficient pointers, used so CPU reference
// code / tuning paths can see the same coefficients that were uploaded
static signed char *Amatrix_h;
static signed char *Bmatrix_h;
static signed char *Cmatrix_h;
46 
/**
   Device-side worker for one Y/W pair (compile-time index k).

   Grid-stride loop over the vector elements: loads y[k]/w[k] once per
   site, then streams all NXZ x/z vectors through the functor, and
   stores y[k]/w[k] back.  idx is the starting global element index.
*/
template <int k, int NXZ, typename FloatN, int M, typename Arg>
__device__ inline void compute(Arg &arg, int idx, int parity)
{
  while (idx < arg.length) {

    FloatN x[M], y[M], z[M], w[M];
    arg.Y[k].load(y, idx, parity);
    arg.W[k].load(w, idx, parity);

#pragma unroll
    for (int l = 0; l < NXZ; l++) {
      arg.X[l].load(x, idx, parity);
      arg.Z[l].load(z, idx, parity);

#pragma unroll
      for (int j = 0; j < M; j++) arg.f(x[j], y[j], z[j], w[j], k, l);
    }
    arg.Y[k].save(y, idx, parity);
    arg.W[k].save(w, idx, parity);

    // grid-stride advance (x-dimension of the launch covers the vector length)
    idx += gridDim.x*blockDim.x;
  }
}
70 
/**
   Generic multi-blas kernel with four loads and up to four stores.

   Launch layout: x-dimension strides the vector elements, y-dimension
   indexes the NYW output vectors, z-dimension indexes parity.  The
   switch converts the run-time y-index k into a compile-time template
   parameter so compute<k,...> can index the spinor arrays statically.

   NOTE(review): the __global__ signature line was lost in extraction and
   is reconstructed from the documentation index — confirm against the
   original header.
*/
template <typename FloatN, int M, int NXZ, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename Functor>
__global__ void multiblasKernel(MultiBlasArg<NXZ,SpinorX,SpinorY,SpinorZ,SpinorW,Functor> arg)
{
  // use i to loop over elements in kernel
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int k = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int parity = blockIdx.z;

  arg.f.init();
  if (k >= arg.NYW) return;

  switch (k) {
  case  0: compute< 0,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 2
  case  1: compute< 1,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 3
  case  2: compute< 2,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 4
  case  3: compute< 3,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 5
  case  4: compute< 4,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 6
  case  5: compute< 5,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 7
  case  6: compute< 6,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 8
  case  7: compute< 7,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 9
  case  8: compute< 8,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 10
  case  9: compute< 9,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 11
  case 10: compute<10,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 12
  case 11: compute<11,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 13
  case 12: compute<12,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 14
  case 13: compute<13,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 15
  case 14: compute<14,NXZ,FloatN,M>(arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 16
  case 15: compute<15,NXZ,FloatN,M>(arg,i,parity); break;
#endif //16
#endif //15
#endif //14
#endif //13
#endif //12
#endif //11
#endif //10
#endif //9
#endif //8
#endif //7
#endif //6
#endif //5
#endif //4
#endif //3
#endif //2
  }
}
138 
namespace detail
{
  // compile-time unsigned -> NUL-terminated character array
  template <unsigned... digits>
  struct to_chars { static const char value[]; };

  // pack expansion turns each digit into its ASCII character, then
  // appends the terminating NUL
  template <unsigned... digits>
  const char to_chars<digits...>::value[] = {('0' + digits)..., 0};

  // peel digits off the number, least-significant first, accumulating
  // them most-significant first in the pack
  template <unsigned rem, unsigned... digits>
  struct explode : explode<rem / 10, rem % 10, digits...> {};

  template <unsigned... digits>
  struct explode<0, digits...> : to_chars<digits...> {};
}

/** Compile-time decimal string for an unsigned constant, e.g.
    num_to_string<42>::value == "42".  The seeding with num/10, num%10
    also makes num_to_string<0>::value yield "0". */
template <unsigned num>
struct num_to_string : detail::explode<num / 10, num % 10> {};
156 
157 
/**
   Tunable launcher for the generic multi-blas kernel.

   NOTE(review): several member declarations (arg, Y_h/W_h/Ynorm_h/Wnorm_h)
   and the strcpy in tuneKey() were lost in extraction and are
   reconstructed from the documentation index and the constructor
   initializer list — confirm against the original header.
*/
template <int NXZ, typename FloatN, int M, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename Functor>
class MultiBlasCuda : public TunableVectorY {

private:
  const int NYW; // number of Y/W output vectors
  // kernel argument struct; declared before nParity to match the
  // constructor's initializer-list order
  MultiBlasArg<NXZ,SpinorX,SpinorY,SpinorZ,SpinorW,Functor> arg;
  const int nParity;

  // host pointers used for backing up fields when tuning
  // don't curry into the Spinors to minimize parameter size
  char *Y_h[MAX_MULTI_BLAS_N];
  char *W_h[MAX_MULTI_BLAS_N];
  char *Ynorm_h[MAX_MULTI_BLAS_N];
  char *Wnorm_h[MAX_MULTI_BLAS_N];
  std::vector<ColorSpinorField*> &y, &w;

  // shared memory is not a tuning dimension for this kernel
  bool tuneSharedBytes() const { return false; }

public:
  MultiBlasCuda(SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Functor &f,
                int NYW, int length, int nParity,
                std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &w)
    : TunableVectorY(NYW), NYW(NYW), arg(X, Y, Z, W, f, NYW, length/nParity),
      nParity(nParity), Y_h(), W_h(), Ynorm_h(), Wnorm_h(), y(y), w(w) { }

  virtual ~MultiBlasCuda() { }

  inline TuneKey tuneKey() const {
    char name[TuneKey::name_n];
    // initialize the buffer before the strcat calls below; NXZ and NYW
    // are part of the tuning key since they change the kernel shape
    strcpy(name, num_to_string<NXZ>::value);
    strcat(name, std::to_string(NYW).c_str());
    strcat(name, typeid(arg.f).name());
    return TuneKey(blasStrings.vol_str, name, blasStrings.aux_tmp);
  }

  inline void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    multiblasKernel<FloatN,M,NXZ> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
  }

  // back up the output vectors so failed tuning launches can be undone
  void preTune() {
    for (int i = 0; i < NYW; ++i) {
      arg.Y[i].backup(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
      arg.W[i].backup(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
    }
  }

  void postTune() {
    for (int i = 0; i < NYW; ++i) {
      arg.Y[i].restore(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
      arg.W[i].restore(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
    }
  }

  void initTuneParam(TuneParam &param) const {
    TunableVectorY::initTuneParam(param);
    param.grid.z = nParity; // z-dimension of the grid indexes parity
  }

  void defaultTuneParam(TuneParam &param) const {
    TunableVectorY::defaultTuneParam(param);
    param.grid.z = nParity;
  }

  long long flops() const { return arg.f.flops()*vec_length<FloatN>::value*(long)arg.length*nParity*M; }

  long long bytes() const
  {
    // bytes for low-precision vector
    size_t base_bytes = arg.X[0].Precision()*vec_length<FloatN>::value*M;
    if (arg.X[0].Precision() == QUDA_HALF_PRECISION) base_bytes += sizeof(float);

    // bytes for high precision vector
    size_t extra_bytes = arg.Y[0].Precision()*vec_length<FloatN>::value*M;
    if (arg.Y[0].Precision() == QUDA_HALF_PRECISION) extra_bytes += sizeof(float);

    // the factor two here assumes we are reading and writing to the high precision vector
    return ((arg.f.streams()-2)*base_bytes + 2*extra_bytes)*arg.length*nParity;
  }

  int tuningIter() const { return 3; }
};
238 
/**
   Lightweight holder for a host-side coefficient matrix.

   NOTE(review): the two-argument constructor was lost in extraction and
   is reconstructed from the documentation index — confirm against the
   original header.
*/
template <typename T>
struct coeff_array {
  const T *data;        // host pointer to the coefficients (may be null)
  const bool use_const; // whether the coefficients are staged through constant memory
  coeff_array() : data(nullptr), use_const(false) { }
  coeff_array(const T *data, bool use_const) : data(data), use_const(use_const) { }
};
246 
/**
   Host driver: stages the coefficient matrices into constant memory,
   binds the spinor accessors, and launches the tuned multi-blas kernel.

   NOTE(review): the function signature line was lost in extraction and
   is reconstructed from the documentation index — confirm against the
   original header.
*/
template <int NXZ, typename RegType, typename StoreType, typename yType, int M,
          template <int,typename,typename> class Functor,
          typename write, typename T>
void multiblasCuda(const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
                   std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y,
                   std::vector<ColorSpinorField*> &z, std::vector<ColorSpinorField*> &w,
                   int length)
{
  const int NYW = y.size();

  const int N = NXZ > NYW ? NXZ : NYW;
  if (N > MAX_MULTI_BLAS_N) errorQuda("Spinor vector length exceeds max size (%d > %d)", N, MAX_MULTI_BLAS_N);

  if (NXZ*NYW*sizeof(Complex) > MAX_MATRIX_SIZE)
    errorQuda("A matrix exceeds max size (%lu > %d)", NXZ*NYW*sizeof(Complex), MAX_MATRIX_SIZE);

  typedef typename scalar<RegType>::type Float;
  typedef typename vector<Float,2>::type Float2;
  typedef vector<Float,2> vec2;

  // FIXME - if NXZ=1 no need to copy entire array
  // FIXME - do we really need strided access here?
  if (a.data && a.use_const) {
    Float2 A[MAX_MATRIX_SIZE/sizeof(Float2)];
    // since the kernel doesn't know the width of the matrix at compile
    // time we stride it (row pitch MAX_MULTI_BLAS_N) and copy the
    // padded matrix to the GPU
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      A[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(a.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Amatrix_d, A, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Amatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(a.data));
  }

  if (b.data && b.use_const) {
    Float2 B[MAX_MATRIX_SIZE/sizeof(Float2)];
    // same staging as the A matrix above
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      B[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(b.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Bmatrix_d, B, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Bmatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(b.data));
  }

  if (c.data && c.use_const) {
    Float2 C[MAX_MATRIX_SIZE/sizeof(Float2)];
    // same staging as the A matrix above
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      C[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(c.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Cmatrix_d, C, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Cmatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(c.data));
  }

  // for (int i=0; i<N; i++) {
  //   checkLength(*x[i],*y[i]); checkLength(*x[i],*z[i]); checkLength(*x[i],*w[i]);
  // }

  // build the tuning key strings: volume from x, aux from x (and y when
  // the store and y precisions differ)
  blasStrings.vol_str = x[0]->VolString();
  strcpy(blasStrings.aux_tmp, x[0]->AuxString());
  if (typeid(StoreType) != typeid(yType)) {
    strcat(blasStrings.aux_tmp, ",");
    strcat(blasStrings.aux_tmp, y[0]->AuxString());
  }

  multi::SpinorTexture<RegType,StoreType,M,0> X[NXZ];
  multi::Spinor<RegType, yType,M,write::Y,1> Y[MAX_MULTI_BLAS_N];
  multi::SpinorTexture<RegType,StoreType,M,2> Z[NXZ];
  multi::Spinor<RegType,StoreType,M,write::W,3> W[MAX_MULTI_BLAS_N];

  //MWFIXME
  for (int i=0; i<NXZ; i++) { X[i].set(*dynamic_cast<cudaColorSpinorField *>(x[i])); Z[i].set(*dynamic_cast<cudaColorSpinorField *>(z[i]));}
  for (int i=0; i<NYW; i++) { Y[i].set(*dynamic_cast<cudaColorSpinorField *>(y[i])); W[i].set(*dynamic_cast<cudaColorSpinorField *>(w[i]));}

  // if block caxpy is an 'outer product of caxpy' where 'x'

  Functor<NXZ,Float2, RegType> f(a, b, c, NYW);

  MultiBlasCuda<NXZ,RegType,M,
                multi::SpinorTexture<RegType,StoreType,M,0>,
                multi::Spinor<RegType, yType,M,write::Y,1>,
                multi::SpinorTexture<RegType,StoreType,M,2>,
                multi::Spinor<RegType,StoreType,M,write::W,3>,
                decltype(f) >
    blas(X, Y, Z, W, f, NYW, length, x[0]->SiteSubset(), y, w);
  blas.apply(*getStream());

  blas::bytes += blas.bytes();
  blas::flops += blas.flops();

  checkCudaError();
}
340 
341 
/**
   CPU reference implementation of the multi-blas operation: applies the
   functor element-by-element over parity, site, spin and color, writing
   back only the Y and W fields (X/Z writes are not supported here).
*/
template <typename Float2, typename write,
          typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW,
          typename Functor>
void genericMultiBlas(SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, Functor f)
{
  for (int parity=0; parity<X.Nparity(); parity++) {
    for (int x=0; x<X.VolumeCB(); x++) {
      for (int s=0; s<X.Nspin(); s++) {
        for (int c=0; c<X.Ncolor(); c++) {
          Float2 X2 = make_Float2<Float2>( X(parity, x, s, c) );
          Float2 Y2 = make_Float2<Float2>( Y(parity, x, s, c) );
          Float2 Z2 = make_Float2<Float2>( Z(parity, x, s, c) );
          Float2 W2 = make_Float2<Float2>( W(parity, x, s, c) );
          // NOTE(review): the functor is invoked with fixed indices
          // (1, 1) here rather than real i/j indices — presumably this
          // reference path only supports index-independent functors;
          // confirm against callers
          f(X2, Y2, Z2, W2, 1 , 1);
          // if (writeX) X(parity, x, s, c) = make_Complex(X2);
          if (write::X) errorQuda("writeX not supported in multiblas.");
          if (write::Y) Y(parity, x, s, c) = make_Complex(Y2);
          if (write::Z) errorQuda("writeZ not supported in multiblas.");
          if (write::W) W(parity, x, s, c) = make_Complex(W2);
        }
      }
    }
  }
}
372 
// Instantiate the field-order accessors for a known nSpin/nColor and
// forward to the element-wise reference implementation above.
template <typename Float, typename yFloat, int nSpin, int nColor, QudaFieldOrder order,
          typename write, typename Functor>
void genericMultiBlas(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z,
                      ColorSpinorField &w, Functor f)
{
  colorspinor::FieldOrderCB<Float,nSpin,nColor,1,order> X(x), Z(z), W(w);
  colorspinor::FieldOrderCB<yFloat,nSpin,nColor,1,order> Y(y);
  typedef typename vector<yFloat,2>::type Float2;
  genericMultiBlas<Float2,write>(X, Y, Z, W, f);
}
382 
// Dispatch on the run-time color count to the compile-time nColor
// instantiation.  Fix: corrected "implemeneted" typo in the error message.
template <typename Float, typename yFloat, int nSpin, QudaFieldOrder order,
          typename write, typename Functor>
void genericMultiBlas(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, Functor f)
{
  if (x.Ncolor() == 2) {
    genericMultiBlas<Float,yFloat,nSpin,2,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 3) {
    genericMultiBlas<Float,yFloat,nSpin,3,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 4) {
    genericMultiBlas<Float,yFloat,nSpin,4,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 8) {
    genericMultiBlas<Float,yFloat,nSpin,8,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 12) {
    genericMultiBlas<Float,yFloat,nSpin,12,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 16) {
    genericMultiBlas<Float,yFloat,nSpin,16,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 20) {
    genericMultiBlas<Float,yFloat,nSpin,20,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 24) {
    genericMultiBlas<Float,yFloat,nSpin,24,order,write,Functor>(x, y, z, w, f);
  } else if (x.Ncolor() == 32) {
    genericMultiBlas<Float,yFloat,nSpin,32,order,write,Functor>(x, y, z, w, f);
  } else {
    errorQuda("nColor = %d not implemented",x.Ncolor());
  }
}
408 
// Dispatch on the run-time spin count to the compile-time nSpin
// instantiation (nSpin=1 only when staggered fermions are compiled in).
// Fix: corrected "implemeneted" typo in the error message.
template <typename Float, typename yFloat, QudaFieldOrder order, typename write, typename Functor>
void genericMultiBlas(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, Functor f)
{
  if (x.Nspin() == 4) {
    genericMultiBlas<Float,yFloat,4,order,write,Functor>(x, y, z, w, f);
  } else if (x.Nspin() == 2) {
    genericMultiBlas<Float,yFloat,2,order,write,Functor>(x, y, z, w, f);
#ifdef GPU_STAGGERED_DIRAC
  } else if (x.Nspin() == 1) {
    genericMultiBlas<Float,yFloat,1,order,write,Functor>(x, y, z, w, f);
#endif
  } else {
    errorQuda("nSpin = %d not implemented",x.Nspin());
  }
}
423 
// Dispatch on the run-time field order; only the space-spin-color order
// is supported by the reference path.
// Fix: corrected "implemeneted" typo in the error message.
template <typename Float, typename yFloat, typename write, typename Functor>
void genericMultiBlas(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, Functor f)
{
  if (x.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
    genericMultiBlas<Float,yFloat,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,write,Functor>
      (x, y, z, w, f);
  } else {
    errorQuda("Not implemented");
  }
}
dim3 dim3 blockDim
cudaStream_t stream
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
MultiBlasArg< NXZ, SpinorX, SpinorY, SpinorZ, SpinorW, Functor > arg
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
char * Wnorm_h[MAX_MULTI_BLAS_N]
#define errorQuda(...)
Definition: util_quda.h:90
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
enum QudaFieldOrder_s QudaFieldOrder
std::complex< double > Complex
Definition: eig_variables.h:13
long long flops() const
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
SpinorY Y[MAX_MULTI_BLAS_N]
coeff_array(const T *data, bool use_const)
virtual ~MultiBlasCuda()
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
complex< double > make_Complex(const double2 &a)
Definition: float_vector.h:278
cudaStream_t * getStream()
Definition: blas_quda.cu:75
__device__ void compute(Arg &arg, int idx, int parity)
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
void apply(const cudaStream_t &stream)
void initTuneParam(TuneParam &param) const
MultiBlasCuda(SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Functor &f, int NYW, int length, int nParity, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &w)
const int nColor
Definition: covdev_test.cpp:77
static struct quda::blas::@4 blasStrings
int int int w
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void multiblasCuda(const coeff_array< T > &a, const coeff_array< T > &b, const coeff_array< T > &c, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int length)
int int int enum cudaChannelFormatKind f
int Z[4]
Definition: test_util.cpp:27
__global__ void multiblasKernel(MultiBlasArg< NXZ, SpinorX, SpinorY, SpinorZ, SpinorW, Functor > arg)
Generic multi-blas kernel with four loads and up to four stores.
static signed char * Cmatrix_h
std::vector< ColorSpinorField * > & w
long long bytes() const
SpinorZ Z[NXZ]
static signed char * Amatrix_h
#define MAX_MULTI_BLAS_N
Definition: quda_internal.h:49
void genericMultiBlas(SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, Functor f)
unsigned long long flops
Definition: blas_quda.cu:42
std::vector< ColorSpinorField * > & y
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
Parameter struct for generic multi-blas kernel.
SpinorW W[MAX_MULTI_BLAS_N]
char * Y_h[MAX_MULTI_BLAS_N]
static const char value[]
const void * c
char * W_h[MAX_MULTI_BLAS_N]
#define MAX_MATRIX_SIZE
#define checkCudaError()
Definition: util_quda.h:129
int tuningIter() const
bool tuneSharedBytes() const
TuneKey tuneKey() const
void defaultTuneParam(TuneParam &param) const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
char * Ynorm_h[MAX_MULTI_BLAS_N]
QudaParity parity
Definition: covdev_test.cpp:53
const bool use_const
MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f, int NYW, int length)
const int length
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
SpinorX X[NXZ]
static signed char * Bmatrix_h