QUDA
1.0.0
|
Public Member Functions | |
BlasCuda (SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Functor &f, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length) | |
virtual | ~BlasCuda () |
TuneKey | tuneKey () const |
void | apply (const cudaStream_t &stream) |
void | preTune () |
void | postTune () |
void | initTuneParam (TuneParam &param) const |
void | defaultTuneParam (TuneParam &param) const |
long long | flops () const |
long long | bytes () const |
int | tuningIter () const |
![]() | |
Tunable () | |
virtual | ~Tunable () |
virtual std::string | paramString (const TuneParam &param) const |
virtual std::string | perfString (float time) const |
virtual bool | advanceTuneParam (TuneParam &param) const |
void | checkLaunchParam (TuneParam &param) |
CUresult | jitifyError () const |
CUresult & | jitifyError () |
Private Member Functions | |
unsigned int | sharedBytesPerThread () const |
unsigned int | sharedBytesPerBlock (const TuneParam &param) const |
virtual bool | advanceSharedBytes (TuneParam &param) const |
Private Attributes | |
const int | nParity |
BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor > | arg |
const ColorSpinorField & | x |
const ColorSpinorField & | y |
const ColorSpinorField & | z |
const ColorSpinorField & | w |
const ColorSpinorField & | v |
char * | X_h |
char * | Y_h |
char * | Z_h |
char * | W_h |
char * | V_h |
char * | Xnorm_h |
char * | Ynorm_h |
char * | Znorm_h |
char * | Wnorm_h |
char * | Vnorm_h |
Additional Inherited Members | |
![]() | |
virtual unsigned int | minThreads () const |
virtual bool | tuneGridDim () const |
virtual bool | tuneAuxDim () const |
virtual bool | tuneSharedBytes () const |
virtual bool | advanceGridDim (TuneParam &param) const |
virtual unsigned int | maxBlockSize (const TuneParam &param) const |
virtual unsigned int | maxGridSize () const |
virtual unsigned int | minGridSize () const |
virtual int | gridStep () const |
gridStep sets the step size when iterating the grid size in advanceGridDim. More... | |
virtual int | blockStep () const |
virtual int | blockMin () const |
virtual void | resetBlockDim (TuneParam &param) const |
virtual bool | advanceBlockDim (TuneParam &param) const |
unsigned int | maxBlocksPerSM () const |
For some reason this can't be queried from the device properties, so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More... | |
template<typename F > | |
void | setMaxDynamicSharedBytesPerBlock (F *func) const |
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesPerBlock()). More... | |
unsigned int | maxDynamicSharedBytesPerBlock () const |
This can't be correctly queried in CUDA for all architectures, so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More... | |
virtual unsigned int | maxSharedBytesPerBlock () const |
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock(), since that may need an explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock() for the kernel in question). If the CUDA kernel in question does this opt-in, then this function can be overridden to return maxDynamicSharedBytesPerBlock(). More... | |
virtual bool | advanceAux (TuneParam &param) const |
int | writeAuxString (const char *format,...) |
![]() | |
char | aux [TuneKey::aux_n] |
CUresult | jitify_error |
Definition at line 29 of file blas_quda.cu.
|
inline |
Definition at line 58 of file blas_quda.cu.
References quda::Tunable::aux, quda::LatticeField::AuxString(), and quda::LatticeField::Precision().
|
inline virtual |
Definition at line 89 of file blas_quda.cu.
|
inline private virtual |
The goal here is to throttle the number of thread blocks per SM by over-allocating shared memory (in order to improve L2 utilization, etc.). We thus request the smallest amount of dynamic shared memory that guarantees throttling to a given number of blocks, in order to allow some extra leeway.
Reimplemented from quda::Tunable.
Definition at line 46 of file blas_quda.cu.
References quda::Tunable::advanceBlockDim(), quda::TuneParam::block, quda::TuneParam::shared_bytes, quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::sharedBytesPerBlock(), and quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::sharedBytesPerThread().
|
inline virtual |
Implements quda::Tunable.
Definition at line 93 of file blas_quda.cu.
References quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::arg, quda::TuneParam::block, getTuning(), getVerbosity(), quda::TuneParam::grid, quda::Tunable::jitify_error, quda::TuneParam::shared_bytes, and quda::tuneLaunch().
Referenced by quda::blas::nativeBlas().
|
inline virtual |
Reimplemented from quda::Tunable.
Definition at line 138 of file blas_quda.cu.
References quda::ColorSpinorField::Bytes(), and quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::f.
Referenced by quda::blas::nativeBlas().
|
inline virtual |
sets default values for when tuning is disabled
Reimplemented from quda::Tunable.
Definition at line 131 of file blas_quda.cu.
References quda::TuneParam::grid, quda::Tunable::initTuneParam(), and quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::nParity.
|
inline virtual |
Implements quda::Tunable.
Definition at line 137 of file blas_quda.cu.
References quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::f, and quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::length.
Referenced by quda::blas::nativeBlas().
|
inline virtual |
Reimplemented from quda::Tunable.
Definition at line 125 of file blas_quda.cu.
References quda::TuneParam::grid, quda::Tunable::initTuneParam(), and quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::nParity.
|
inline virtual |
Reimplemented from quda::Tunable.
Definition at line 116 of file blas_quda.cu.
References quda::ColorSpinorField::Bytes(), quda::ColorSpinorField::NormBytes(), quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::V, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::W, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::X, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::Y, and quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::Z.
|
inline virtual |
Reimplemented from quda::Tunable.
Definition at line 107 of file blas_quda.cu.
References quda::ColorSpinorField::Bytes(), quda::ColorSpinorField::NormBytes(), quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::V, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::W, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::X, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::Y, and quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::Z.
|
inline private virtual |
Implements quda::Tunable.
Definition at line 44 of file blas_quda.cu.
Referenced by quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::advanceSharedBytes().
|
inline private virtual |
Implements quda::Tunable.
Definition at line 43 of file blas_quda.cu.
Referenced by quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::advanceSharedBytes().
|
inline virtual |
Implements quda::Tunable.
Definition at line 91 of file blas_quda.cu.
References quda::Tunable::aux, quda::blas::BlasArg< SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::f, and quda::LatticeField::VolString().
|
inline virtual |
Reimplemented from quda::Tunable.
Definition at line 144 of file blas_quda.cu.
|
mutable private |
Definition at line 34 of file blas_quda.cu.
Referenced by quda::blas::BlasCuda< FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Functor >::apply().
|
private |
|
private |
Definition at line 36 of file blas_quda.cu.
Referenced by quda::blas::mixed_blas(), and quda::blas::uni_blas().
|
private |
Definition at line 40 of file blas_quda.cu.
|
private |
Definition at line 41 of file blas_quda.cu.
|
private |
Definition at line 36 of file blas_quda.cu.
Referenced by quda::blas::caxpbypczw(), quda::blas::caxpbypzYmbw(), quda::blas::mixed_blas(), quda::blas::tripleCGUpdate(), and quda::blas::uni_blas().
|
private |
Definition at line 40 of file blas_quda.cu.
|
private |
Definition at line 41 of file blas_quda.cu.
|
private |
Definition at line 36 of file blas_quda.cu.
Referenced by quda::blas::ax(), quda::blas::axpbyz(), quda::blas::axpyBzpcx(), quda::blas::axpyZpbx(), quda::blas::cabxpyAx(), quda::blas::caxpby(), quda::blas::caxpbypczw(), quda::blas::caxpbypzYmbw(), quda::blas::caxpy(), quda::blas::caxpyBxpz(), quda::blas::caxpyBzpx(), quda::blas::caxpyXmaz(), quda::blas::caxpyXmazMR(), quda::blas::cxpaypbz(), quda::blas::doubleCG3Init(), quda::blas::doubleCG3Update(), quda::blas::mixed_blas(), quda::blas::tripleCGUpdate(), and quda::blas::uni_blas().
|
private |
Definition at line 40 of file blas_quda.cu.
|
private |
Definition at line 41 of file blas_quda.cu.
|
private |
Definition at line 36 of file blas_quda.cu.
Referenced by quda::blas::axpbyz(), quda::blas::axpyBzpcx(), quda::blas::axpyZpbx(), quda::blas::cabxpyAx(), quda::blas::caxpby(), quda::blas::caxpbypczw(), quda::blas::caxpbypzYmbw(), quda::blas::caxpy(), quda::blas::caxpyBxpz(), quda::blas::caxpyBzpx(), quda::blas::caxpyXmaz(), quda::blas::caxpyXmazMR(), quda::blas::cxpaypbz(), quda::blas::doubleCG3Init(), quda::blas::doubleCG3Update(), quda::blas::mixed_blas(), quda::blas::tripleCGUpdate(), and quda::blas::uni_blas().
|
private |
Definition at line 40 of file blas_quda.cu.
|
private |
Definition at line 41 of file blas_quda.cu.
|
private |
Definition at line 36 of file blas_quda.cu.
Referenced by quda::blas::axpbyz(), quda::blas::axpyBzpcx(), quda::blas::axpyZpbx(), quda::blas::caxpbypczw(), quda::blas::caxpbypzYmbw(), quda::blas::caxpyBxpz(), quda::blas::caxpyBzpx(), quda::blas::caxpyXmaz(), quda::blas::caxpyXmazMR(), quda::blas::cxpaypbz(), quda::blas::doubleCG3Init(), quda::blas::doubleCG3Update(), quda::blas::mixed_blas(), quda::blas::tripleCGUpdate(), and quda::blas::uni_blas().
|
private |
Definition at line 40 of file blas_quda.cu.
|
private |
Definition at line 41 of file blas_quda.cu.