QUDA v0.4.0
A library for QCD on GPUs
|
#include <tune_quda.h>
Public Member Functions | |
Tunable () | |
virtual | ~Tunable () |
virtual TuneKey | tuneKey () const =0 |
virtual void | apply (const cudaStream_t &stream)=0 |
virtual void | preTune () |
virtual void | postTune () |
virtual int | tuningIter () const |
virtual std::string | paramString (const TuneParam &param) const |
virtual std::string | perfString (float time) const |
virtual void | initTuneParam (TuneParam &param) const |
virtual void | defaultTuneParam (TuneParam &param) const |
virtual bool | advanceTuneParam (TuneParam &param) const |
Protected Member Functions | |
virtual long long | flops () const |
virtual long long | bytes () const |
virtual int | sharedBytesPerThread () const =0 |
virtual int | sharedBytesPerBlock () const =0 |
virtual bool | advanceGridDim (TuneParam &param) const |
virtual bool | advanceBlockDim (TuneParam &param) const |
virtual bool | advanceSharedBytes (TuneParam &param) const |
Definition at line 66 of file tune_quda.h.
Tunable::Tunable | ( | ) | [inline] |
Definition at line 133 of file tune_quda.h.
virtual Tunable::~Tunable | ( | ) | [inline, virtual] |
Definition at line 134 of file tune_quda.h.
virtual bool Tunable::advanceBlockDim | ( | TuneParam & | param | ) | const [inline, protected, virtual] |
Reimplemented in DslashCuda.
Definition at line 91 of file tune_quda.h.
virtual bool Tunable::advanceGridDim | ( | TuneParam & | param | ) | const [inline, protected, virtual] |
Reimplemented in DslashCuda, and CloverCuda< sFloat, cFloat >.
Definition at line 78 of file tune_quda.h.
virtual bool Tunable::advanceSharedBytes | ( | TuneParam & | param | ) | const [inline, protected, virtual] |
The goal here is to throttle the number of thread blocks per SM by over-allocating shared memory (in order to improve L2 utilization, etc.). Note that:
Definition at line 113 of file tune_quda.h.
virtual bool Tunable::advanceTuneParam | ( | TuneParam & | param | ) | const [inline, virtual] |
Definition at line 176 of file tune_quda.h.
virtual void Tunable::apply | ( | const cudaStream_t & | stream | ) | [pure virtual] |
Implemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, WilsonDslashCuda< sFloat, gFloat >, CloverDslashCuda< sFloat, gFloat, cFloat >, TwistedDslashCuda< sFloat, gFloat >, DomainWallDslashCuda< sFloat, gFloat >, StaggeredDslashCuda< sFloat, fatGFloat, longGFloat >, CloverCuda< sFloat, cFloat >, TwistGamma5Cuda< sFloat >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
virtual long long Tunable::bytes | ( | ) | const [inline, protected, virtual] |
Reimplemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
Definition at line 70 of file tune_quda.h.
virtual void Tunable::defaultTuneParam | ( | TuneParam & | param | ) | const [inline, virtual] |
sets default values for when tuning is disabled
Reimplemented in DslashCuda.
Definition at line 170 of file tune_quda.h.
virtual long long Tunable::flops | ( | ) | const [inline, protected, virtual] |
Reimplemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
Definition at line 69 of file tune_quda.h.
virtual void Tunable::initTuneParam | ( | TuneParam & | param | ) | const [inline, virtual] |
Reimplemented in DslashCuda.
Definition at line 160 of file tune_quda.h.
virtual std::string Tunable::paramString | ( | const TuneParam & | param | ) | const [inline, virtual] |
Reimplemented in DslashCuda, CloverCuda< sFloat, cFloat >, and TwistGamma5Cuda< sFloat >.
Definition at line 141 of file tune_quda.h.
virtual std::string Tunable::perfString | ( | float | time | ) | const [inline, virtual] |
Definition at line 150 of file tune_quda.h.
virtual void Tunable::postTune | ( | ) | [inline, virtual] |
Reimplemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, WilsonDslashCuda< sFloat, gFloat >, CloverDslashCuda< sFloat, gFloat, cFloat >, TwistedDslashCuda< sFloat, gFloat >, DomainWallDslashCuda< sFloat, gFloat >, StaggeredDslashCuda< sFloat, fatGFloat, longGFloat >, CloverCuda< sFloat, cFloat >, TwistGamma5Cuda< sFloat >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
Definition at line 138 of file tune_quda.h.
virtual void Tunable::preTune | ( | ) | [inline, virtual] |
Reimplemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, WilsonDslashCuda< sFloat, gFloat >, CloverDslashCuda< sFloat, gFloat, cFloat >, TwistedDslashCuda< sFloat, gFloat >, DomainWallDslashCuda< sFloat, gFloat >, StaggeredDslashCuda< sFloat, fatGFloat, longGFloat >, CloverCuda< sFloat, cFloat >, TwistGamma5Cuda< sFloat >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
Definition at line 137 of file tune_quda.h.
virtual int Tunable::sharedBytesPerBlock | ( | ) | const [protected, pure virtual] |
Implemented in DslashCuda, and CloverCuda< sFloat, cFloat >.
virtual int Tunable::sharedBytesPerThread | ( | ) | const [protected, pure virtual] |
virtual TuneKey Tunable::tuneKey | ( | ) | const [pure virtual] |
Implemented in BlasCuda< FloatN, M, writeX, writeY, writeZ, writeW, InputX, InputY, InputZ, InputW, OutputX, OutputY, OutputZ, OutputW, Functor >, CopyCuda< FloatN, N, Output, Input >, DslashCuda, WilsonDslashCuda< sFloat, gFloat >, CloverDslashCuda< sFloat, gFloat, cFloat >, TwistedDslashCuda< sFloat, gFloat >, DomainWallDslashCuda< sFloat, gFloat >, StaggeredDslashCuda< sFloat, fatGFloat, longGFloat >, CloverCuda< sFloat, cFloat >, TwistGamma5Cuda< sFloat >, and ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, writeX, writeY, writeZ, InputX, InputY, InputZ, InputW, InputV, Reducer, OutputX, OutputY, OutputZ >.
virtual int Tunable::tuningIter | ( | ) | const [inline, virtual] |
Definition at line 139 of file tune_quda.h.