#include <tune_quda.h>

Inheritance diagram for quda::Tunable:

Public Member Functions
	Tunable ()

virtual	~Tunable ()

virtual TuneKey	tuneKey () const =0

virtual void	apply (const qudaStream_t &stream)=0

virtual void	preTune ()

virtual void	postTune ()

virtual int	tuningIter () const

virtual std::string	paramString (const TuneParam &param) const

virtual std::string	perfString (float time) const

virtual void	initTuneParam (TuneParam &param) const

virtual void	defaultTuneParam (TuneParam &param) const

virtual bool	advanceTuneParam (TuneParam &param) const

void	checkLaunchParam (TuneParam &param)

CUresult	jitifyError () const

CUresult &	jitifyError ()

Protected Member Functions
virtual long long	flops () const =0

virtual long long	bytes () const

virtual unsigned int	sharedBytesPerThread () const =0

virtual unsigned int	sharedBytesPerBlock (const TuneParam &param) const =0

virtual unsigned int	minThreads () const

virtual bool	tuneGridDim () const

virtual bool	tuneAuxDim () const

virtual bool	tuneSharedBytes () const

virtual bool	advanceGridDim (TuneParam &param) const

virtual unsigned int	maxBlockSize (const TuneParam &param) const

virtual unsigned int	maxGridSize () const

virtual unsigned int	minGridSize () const

virtual int	gridStep () const
	gridStep sets the step size when iterating the grid size in advanceGridDim. More...

virtual int	blockStep () const

virtual int	blockMin () const

virtual void	resetBlockDim (TuneParam &param) const

virtual bool	advanceBlockDim (TuneParam &param) const

unsigned int	maxBlocksPerSM () const
	Returns the maximum number of simultaneously resident blocks per SM. We can directly query this as of CUDA 11, but previously this needed to be hand coded. More...

unsigned int	maxDynamicSharedBytesPerBlock () const
	Returns the maximum dynamic shared memory per block. More...

virtual unsigned int	maxSharedBytesPerBlock () const
	The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need explicit opt-in to enable (by calling setMaxDynamicSharedBytes for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overridden to return maxDynamicSharedBytesPerBlock. More...

virtual bool	advanceSharedBytes (TuneParam &param) const

virtual bool	advanceAux (TuneParam &param) const

int	writeAuxString (const char *format,...)

bool	tuned ()
	Whether the present instance has already been tuned or not. More...

Protected Attributes
char	aux [TuneKey::aux_n]

CUresult	jitify_error

Detailed Description

Definition at line 95 of file tune_quda.h.

Constructor & Destructor Documentation

◆ Tunable()

quda::Tunable::Tunable ( )

inline

Definition at line 305 of file tune_quda.h.

◆ ~Tunable()

virtual quda::Tunable::~Tunable ( )

inline virtual

Definition at line 306 of file tune_quda.h.

Member Function Documentation

◆ advanceAux()

virtual bool quda::Tunable::advanceAux ( TuneParam & param ) const

inline protected virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 267 of file tune_quda.h.

◆ advanceBlockDim()

virtual bool quda::Tunable::advanceBlockDim ( TuneParam & param ) const

inline protected virtual

Reimplemented in quda::TunableVectorYZ, quda::TunableVectorY, and quda::TunableLocalParityReduction.

Definition at line 159 of file tune_quda.h.

◆ advanceGridDim()

virtual bool quda::Tunable::advanceGridDim ( TuneParam & param ) const

inline protected virtual

Definition at line 113 of file tune_quda.h.

◆ advanceSharedBytes()

virtual bool quda::Tunable::advanceSharedBytes ( TuneParam & param ) const

inline protected virtual

The goal here is to throttle the number of thread blocks per SM by over-allocating shared memory (in order to improve L2 utilization, etc.). We thus request the smallest amount of dynamic shared memory that guarantees throttling to a given number of blocks, in order to allow some extra leeway.

Definition at line 242 of file tune_quda.h.

◆ advanceTuneParam()

virtual bool quda::Tunable::advanceTuneParam ( TuneParam & param ) const

inline virtual

Reimplemented in quda::QudaMem, and quda::Dslash< D, Arg >.

Definition at line 363 of file tune_quda.h.

◆ apply()

virtual void quda::Tunable::apply ( const qudaStream_t & stream )

pure virtual

Implemented in quda::QudaMem, and quda::TransformReduce< reduce_t, T, I, transformer, reducer >.

◆ blockMin()

virtual int quda::Tunable::blockMin ( ) const

inline protected virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 141 of file tune_quda.h.

◆ blockStep()

virtual int quda::Tunable::blockStep ( ) const

inline protected virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 140 of file tune_quda.h.

◆ bytes()

virtual long long quda::Tunable::bytes ( ) const

inline protected virtual

Reimplemented in quda::QudaMem, quda::TransformReduce< reduce_t, T, I, transformer, reducer >, and quda::Dslash< D, Arg >.

Definition at line 99 of file tune_quda.h.

◆ checkLaunchParam()

void quda::Tunable::checkLaunchParam ( TuneParam & param )

inline

Check the launch parameters of the kernel to ensure that they are valid for the current device.

Definition at line 372 of file tune_quda.h.

◆ defaultTuneParam()

virtual void quda::Tunable::defaultTuneParam ( TuneParam & param ) const

inline virtual

Sets default values for when tuning is disabled.

Reimplemented in quda::TunableVectorYZ, quda::TunableVectorY, quda::TunableLocalParityReduction, and quda::Dslash< D, Arg >.

Definition at line 357 of file tune_quda.h.

◆ flops()

virtual long long quda::Tunable::flops ( ) const

protected pure virtual

Implemented in quda::QudaMem, quda::TransformReduce< reduce_t, T, I, transformer, reducer >, and quda::Dslash< D, Arg >.

◆ gridStep()

virtual int quda::Tunable::gridStep ( ) const

inline protected virtual

gridStep sets the step size when iterating the grid size in advanceGridDim.

Returns: Grid step size

Reimplemented in quda::TunableLocalParityReduction, and quda::Dslash< D, Arg >.

Definition at line 138 of file tune_quda.h.

◆ initTuneParam()

virtual void quda::Tunable::initTuneParam ( TuneParam & param ) const

inline virtual

Reimplemented in quda::TunableVectorYZ, quda::TunableVectorY, quda::TunableLocalParityReduction, and quda::Dslash< D, Arg >.

Definition at line 332 of file tune_quda.h.

◆ jitifyError() [1/2]

CUresult& quda::Tunable::jitifyError ( )

inline

Definition at line 404 of file tune_quda.h.

◆ jitifyError() [2/2]

CUresult quda::Tunable::jitifyError ( ) const

inline

Definition at line 403 of file tune_quda.h.

◆ maxBlockSize()

virtual unsigned int quda::Tunable::maxBlockSize ( const TuneParam & param ) const

inline protected virtual

Reimplemented in quda::TunableLocalParityReduction.

Definition at line 129 of file tune_quda.h.

◆ maxBlocksPerSM()

unsigned int quda::Tunable::maxBlocksPerSM ( ) const

inline protected

Returns the maximum number of simultaneously resident blocks per SM. We can directly query this as of CUDA 11, but previously this needed to be hand coded.

Returns: The maximum number of simultaneously resident blocks per SM

Definition at line 186 of file tune_quda.h.

◆ maxDynamicSharedBytesPerBlock()

unsigned int quda::Tunable::maxDynamicSharedBytesPerBlock ( ) const

inline protected

Returns the maximum dynamic shared memory per block.

Returns: The maximum dynamic shared memory per CUDA thread block

Definition at line 220 of file tune_quda.h.

◆ maxGridSize()

virtual unsigned int quda::Tunable::maxGridSize ( ) const

inline protected virtual

Definition at line 130 of file tune_quda.h.

◆ maxSharedBytesPerBlock()

virtual unsigned int quda::Tunable::maxSharedBytesPerBlock ( ) const

inline protected virtual

The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need explicit opt-in to enable (by calling setMaxDynamicSharedBytes for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overridden to return maxDynamicSharedBytesPerBlock.

Returns: The maximum shared bytes limit per block that the autotuner will utilize.

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 233 of file tune_quda.h.

◆ minGridSize()

virtual unsigned int quda::Tunable::minGridSize ( ) const

inline protected virtual

Reimplemented in quda::TunableLocalParityReduction, and quda::Dslash< D, Arg >.

Definition at line 131 of file tune_quda.h.

◆ minThreads()

virtual unsigned int quda::Tunable::minThreads ( ) const

inline protected virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 108 of file tune_quda.h.

◆ paramString()

virtual std::string quda::Tunable::paramString ( const TuneParam & param ) const

inline virtual

Definition at line 314 of file tune_quda.h.

◆ perfString()

virtual std::string quda::Tunable::perfString ( float time ) const

inline virtual

Definition at line 321 of file tune_quda.h.

◆ postTune()

virtual void quda::Tunable::postTune ( )

inline virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 310 of file tune_quda.h.

◆ preTune()

virtual void quda::Tunable::preTune ( )

inline virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 309 of file tune_quda.h.

◆ resetBlockDim()

virtual void quda::Tunable::resetBlockDim ( TuneParam & param ) const

inline protected virtual

Definition at line 143 of file tune_quda.h.

◆ sharedBytesPerBlock()

virtual unsigned int quda::Tunable::sharedBytesPerBlock ( const TuneParam & param ) const

protected pure virtual

Implemented in quda::TunableVectorY, and quda::TunableLocalParityReduction.

◆ sharedBytesPerThread()

virtual unsigned int quda::Tunable::sharedBytesPerThread ( ) const

protected pure virtual

Implemented in quda::TunableVectorY, and quda::TunableLocalParityReduction.

◆ tuneAuxDim()

virtual bool quda::Tunable::tuneAuxDim ( ) const

inline protected virtual

Definition at line 110 of file tune_quda.h.

◆ tuned()

bool quda::Tunable::tuned ( )

inline protected

Whether the present instance has already been tuned or not.

Returns: True if tuned, false if not

Definition at line 289 of file tune_quda.h.

◆ tuneGridDim()

virtual bool quda::Tunable::tuneGridDim ( ) const

inline protected virtual

Reimplemented in quda::TunableLocalParityReduction, and quda::Dslash< D, Arg >.

Definition at line 109 of file tune_quda.h.

◆ tuneKey()

virtual TuneKey quda::Tunable::tuneKey ( ) const

pure virtual

Implemented in quda::QudaMem, quda::TransformReduce< reduce_t, T, I, transformer, reducer >, and quda::Dslash< D, Arg >.

◆ tuneSharedBytes()

virtual bool quda::Tunable::tuneSharedBytes ( ) const

inline protected virtual

Definition at line 111 of file tune_quda.h.

◆ tuningIter()

virtual int quda::Tunable::tuningIter ( ) const

inline virtual

Reimplemented in quda::Dslash< D, Arg >.

Definition at line 311 of file tune_quda.h.

◆ writeAuxString()

int quda::Tunable::writeAuxString ( const char * format, ... )

inline protected

Definition at line 271 of file tune_quda.h.

Member Data Documentation

◆ aux

char quda::Tunable::aux[TuneKey::aux_n]

protected

Definition at line 269 of file tune_quda.h.

◆ jitify_error

CUresult quda::Tunable::jitify_error

protected

This is the return result from kernels launched using jitify.

Definition at line 283 of file tune_quda.h.

The documentation for this class was generated from the following file:

quda/include/tune_quda.h

Public Member Functions

Protected Member Functions

Protected Attributes

Detailed Description

Constructor & Destructor Documentation

◆ Tunable()

◆ ~Tunable()

Member Function Documentation

◆ advanceAux()

◆ advanceBlockDim()

◆ advanceGridDim()

◆ advanceSharedBytes()

◆ advanceTuneParam()

◆ apply()

◆ blockMin()

◆ blockStep()

◆ bytes()

◆ checkLaunchParam()

◆ defaultTuneParam()

◆ flops()

◆ gridStep()

◆ initTuneParam()

◆ jitifyError() [1/2]

◆ jitifyError() [2/2]

◆ maxBlockSize()

◆ maxBlocksPerSM()

◆ maxDynamicSharedBytesPerBlock()

◆ maxGridSize()

◆ maxSharedBytesPerBlock()

◆ minGridSize()

◆ minThreads()

◆ paramString()

◆ perfString()

◆ postTune()

◆ preTune()

◆ resetBlockDim()

◆ sharedBytesPerBlock()

◆ sharedBytesPerThread()

◆ tuneAuxDim()

◆ tuned()

◆ tuneGridDim()

◆ tuneKey()

◆ tuneSharedBytes()

◆ tuningIter()

◆ writeAuxString()

Member Data Documentation

◆ aux

◆ jitify_error