Inheritance diagram for quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >:

[legend]

Collaboration diagram for quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >:

[legend]

Public Member Functions
	MultiReduceCuda (doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y, std::vector< ColorSpinorField * > &z, std::vector< ColorSpinorField * > &w, int NYW, int length)

TuneKey	tuneKey () const

void	apply (const cudaStream_t &stream)

bool	advanceGridDim (TuneParam &param) const

void	initTuneParam (TuneParam &param) const

void	defaultTuneParam (TuneParam &param) const

void	preTune ()

void	postTune ()

long long	flops () const

long long	bytes () const

int	tuningIter () const

Public Member Functions inherited from quda::Tunable
	Tunable ()

virtual	~Tunable ()

virtual std::string	paramString (const TuneParam &param) const

virtual std::string	perfString (float time) const

virtual bool	advanceTuneParam (TuneParam &param) const

void	checkLaunchParam (TuneParam &param)

CUresult	jitifyError () const

CUresult &	jitifyError ()

Private Member Functions
unsigned int	sharedBytesPerThread () const

unsigned int	sharedBytesPerBlock (const TuneParam &param) const

virtual bool	advanceSharedBytes (TuneParam &param) const

unsigned int	maxBlockSize (const TuneParam &param) const

Private Attributes
const int	NYW

int	nParity

MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >	arg

doubleN *	result

std::vector< ColorSpinorField * > &	x

std::vector< ColorSpinorField * > &	y

std::vector< ColorSpinorField * > &	z

std::vector< ColorSpinorField * > &	w

char *	Y_h [MAX_MULTI_BLAS_N]

char *	W_h [MAX_MULTI_BLAS_N]

char *	Ynorm_h [MAX_MULTI_BLAS_N]

char *	Wnorm_h [MAX_MULTI_BLAS_N]

Additional Inherited Members
Protected Member Functions inherited from quda::Tunable
virtual unsigned int	minThreads () const

virtual bool	tuneGridDim () const

virtual bool	tuneAuxDim () const

virtual bool	tuneSharedBytes () const

virtual unsigned int	maxGridSize () const

virtual unsigned int	minGridSize () const

virtual int	gridStep () const
	gridStep sets the step size when iterating the grid size in advanceGridDim. More...

virtual int	blockStep () const

virtual int	blockMin () const

virtual void	resetBlockDim (TuneParam &param) const

virtual bool	advanceBlockDim (TuneParam &param) const

unsigned int	maxBlocksPerSM () const
	For some reason this can't be queried from the device properties, so we set it here, based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More...

template<typename F >
void	setMaxDynamicSharedBytesPerBlock (F *func) const
	Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesPerBlock()). More...

unsigned int	maxDynamicSharedBytesPerBlock () const
	This can't be correctly queried in CUDA for all architectures, so we set it here, based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More...

virtual unsigned int	maxSharedBytesPerBlock () const
	The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock(), since that limit may need an explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock() for the kernel in question). If the CUDA kernel in question does opt in, then this function can be overridden to return maxDynamicSharedBytesPerBlock(). More...

virtual bool	advanceAux (TuneParam &param) const

int	writeAuxString (const char *format,...)

Protected Attributes inherited from quda::Tunable
char	aux [TuneKey::aux_n]

CUresult	jitify_error

Detailed Description

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
class quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >

Definition at line 111 of file multi_reduce_quda.cu.

Constructor & Destructor Documentation

◆ MultiReduceCuda()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::MultiReduceCuda	(	doubleN	result[],
		SpinorX	X[],
		SpinorY	Y[],
		SpinorZ	Z[],
		SpinorW	W[],
		Reducer &	r,
		std::vector< ColorSpinorField *> &	x,
		std::vector< ColorSpinorField *> &	y,
		std::vector< ColorSpinorField *> &	z,
		std::vector< ColorSpinorField *> &	w,
		int	NYW,
		int	length
	)

inline

Definition at line 144 of file multi_reduce_quda.cu.

References quda::blas::getFastReduce(), and V.

Here is the call graph for this function:

Member Function Documentation

◆ advanceGridDim()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

bool quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::advanceGridDim ( TuneParam & param ) const

inline virtual

Reimplemented from quda::Tunable.

Definition at line 220 of file multi_reduce_quda.cu.

References quda::Tunable::advanceGridDim(), deviceProp, and errorQuda.

Here is the call graph for this function:

◆ advanceSharedBytes()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

virtual bool quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::advanceSharedBytes ( TuneParam & param ) const

inline private virtual

The goal here is to throttle the number of thread blocks per SM by over-allocating shared memory (in order to improve L2 utilization, etc.). We thus request the smallest amount of dynamic shared memory that guarantees throttling to a given number of blocks, in order to allow some extra leeway.

Reimplemented from quda::Tunable.

Definition at line 128 of file multi_reduce_quda.cu.

References quda::TuneParam::block, and quda::TuneParam::shared_bytes.

◆ apply()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

void quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::apply ( const cudaStream_t & stream )

inline virtual

Implements quda::Tunable.

Definition at line 191 of file multi_reduce_quda.cu.

References quda::arg(), quda::TuneParam::block, getTuning(), getVerbosity(), quda::TuneParam::grid, param, quda::stream, and quda::tuneLaunch().

Referenced by quda::blas::multiReduce().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ bytes()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

long long quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::bytes ( ) const

inline virtual

Reimplemented from quda::Tunable.

Definition at line 264 of file multi_reduce_quda.cu.

References quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::r.

Referenced by quda::blas::multiReduce().

Here is the caller graph for this function:

◆ defaultTuneParam()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

void quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::defaultTuneParam ( TuneParam & param ) const

inline virtual

Sets default values for when tuning is disabled.

Reimplemented from quda::Tunable.

Definition at line 235 of file multi_reduce_quda.cu.

References quda::TuneParam::block, quda::Tunable::defaultTuneParam(), and quda::TuneParam::grid.

Here is the call graph for this function:

◆ flops()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

long long quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::flops ( ) const

inline virtual

Implements quda::Tunable.

Definition at line 259 of file multi_reduce_quda.cu.

References quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::length, and quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::r.

Referenced by quda::blas::multiReduce().

Here is the caller graph for this function:

◆ initTuneParam()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

void quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::initTuneParam ( TuneParam & param ) const

inline virtual

Reimplemented from quda::Tunable.

Definition at line 227 of file multi_reduce_quda.cu.

References quda::TuneParam::block, quda::TuneParam::grid, and quda::Tunable::initTuneParam().

Here is the call graph for this function:

◆ maxBlockSize()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

unsigned int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::maxBlockSize ( const TuneParam & param ) const

inline private virtual

Reimplemented from quda::Tunable.

Definition at line 141 of file multi_reduce_quda.cu.

References deviceProp.

◆ postTune()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

void quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::postTune ( )

inline virtual

Reimplemented from quda::Tunable.

Definition at line 251 of file multi_reduce_quda.cu.

References quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::W, and quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::Y.

◆ preTune()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

void quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::preTune ( )

inline virtual

Reimplemented from quda::Tunable.

Definition at line 243 of file multi_reduce_quda.cu.

References quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::W, and quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::Y.

◆ sharedBytesPerBlock()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

unsigned int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::sharedBytesPerBlock ( const TuneParam & param ) const

inline private virtual

Implements quda::Tunable.

Definition at line 126 of file multi_reduce_quda.cu.

◆ sharedBytesPerThread()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

unsigned int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::sharedBytesPerThread ( ) const

inline private virtual

Implements quda::Tunable.

Definition at line 125 of file multi_reduce_quda.cu.

◆ tuneKey()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

TuneKey quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::tuneKey ( ) const

inline virtual

Implements quda::Tunable.

Definition at line 182 of file multi_reduce_quda.cu.

References quda::TuneKey::name_n, and quda::blas::MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::r.

◆ tuningIter()

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::tuningIter ( ) const

inline virtual

Reimplemented from quda::Tunable.

Definition at line 270 of file multi_reduce_quda.cu.

Member Data Documentation

◆ arg

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

MultiReduceArg<NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer> quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::arg

private

Definition at line 117 of file multi_reduce_quda.cu.

◆ nParity

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::nParity

private

Definition at line 116 of file multi_reduce_quda.cu.

◆ NYW

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

const int quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::NYW

private

Definition at line 115 of file multi_reduce_quda.cu.

◆ result

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

doubleN* quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::result

private

Definition at line 118 of file multi_reduce_quda.cu.

◆ w

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

std::vector<ColorSpinorField *> & quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::w

private

Definition at line 120 of file multi_reduce_quda.cu.

◆ W_h

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

char * quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::W_h[MAX_MULTI_BLAS_N]

private

Definition at line 123 of file multi_reduce_quda.cu.

◆ Wnorm_h

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

char * quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::Wnorm_h[MAX_MULTI_BLAS_N]

private

Definition at line 123 of file multi_reduce_quda.cu.

◆ x

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

std::vector<ColorSpinorField *>& quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::x

private

Definition at line 120 of file multi_reduce_quda.cu.

◆ y

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

std::vector<ColorSpinorField *> & quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::y

private

Definition at line 120 of file multi_reduce_quda.cu.

◆ Y_h

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

char* quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::Y_h[MAX_MULTI_BLAS_N]

private

Definition at line 123 of file multi_reduce_quda.cu.

◆ Ynorm_h

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

char * quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::Ynorm_h[MAX_MULTI_BLAS_N]

private

Definition at line 123 of file multi_reduce_quda.cu.

◆ z

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>

std::vector<ColorSpinorField *> & quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >::z

private

Definition at line 120 of file multi_reduce_quda.cu.

The documentation for this class was generated from the following file:

lib/multi_reduce_quda.cu

Public Member Functions

Private Member Functions

Private Attributes

Additional Inherited Members

Detailed Description

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer> class quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >

Constructor & Destructor Documentation

◆ MultiReduceCuda()

Member Function Documentation

◆ advanceGridDim()

◆ advanceSharedBytes()

◆ apply()

◆ bytes()

◆ defaultTuneParam()

◆ flops()

◆ initTuneParam()

◆ maxBlockSize()

◆ postTune()

◆ preTune()

◆ sharedBytesPerBlock()

◆ sharedBytesPerThread()

◆ tuneKey()

◆ tuningIter()

Member Data Documentation

◆ arg

◆ nParity

◆ NYW

◆ result

◆ w

◆ W_h

◆ Wnorm_h

◆ x

◆ y

◆ Y_h

◆ Ynorm_h

◆ z

template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
class quda::blas::MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >