#include <dslash.h>

Inheritance diagram for quda::Dslash< Float >:

[legend]

Collaboration diagram for quda::Dslash< Float >:

[legend]

Public Member Functions
template<typename T , typename Arg >
void	launch (T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool dagger, bool xpay, typename Arg >
void	instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream)
	This instantiate function is used to instantiate the KernelType template required for the multi-GPU dslash kernels. More...

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool xpay, typename Arg >
void	instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream)
	This instantiate function is used to instantiate the dagger template. More...

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, bool xpay, typename Arg >
void	instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream)
	This instantiate function is used to instantiate the nParity template. More...

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, typename Arg >
void	instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream)
	This instantiate function is used to instantiate the xpay template. More...

	Dslash (DslashArg< Float > &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src)

int	Nface () const

int	Dagger () const

const char *	getAux (KernelType type) const

void	setAux (KernelType type, const char *aux_)

void	augmentAux (KernelType type, const char *extra)

virtual void	preTune ()
	Save the output field since the output field is both read from and written to in the exterior kernels. More...

virtual void	postTune ()
	Restore the output field if doing exterior kernel. More...

virtual long long	flops () const

virtual long long	bytes () const

Public Member Functions inherited from quda::TunableVectorYZ
	TunableVectorYZ (unsigned int vector_length_y, unsigned int vector_length_z)

bool	advanceBlockDim (TuneParam &param) const

void	initTuneParam (TuneParam &param) const

void	defaultTuneParam (TuneParam &param) const

void	resizeVector (int y, int z) const

void	resizeStep (int y, int z) const

Public Member Functions inherited from quda::TunableVectorY
	TunableVectorY (unsigned int vector_length_y)

void	resizeVector (int y) const

void	resizeStep (int y) const

Public Member Functions inherited from quda::Tunable
	Tunable ()

virtual	~Tunable ()

virtual TuneKey	tuneKey () const =0

virtual void	apply (const cudaStream_t &stream)=0

virtual std::string	paramString (const TuneParam &param) const

virtual std::string	perfString (float time) const

virtual bool	advanceTuneParam (TuneParam &param) const

void	checkLaunchParam (TuneParam &param)

CUresult	jitifyError () const

CUresult &	jitifyError ()

Public Attributes
DslashArg< Float > &	dslashParam

Protected Member Functions
void	fillAuxBase ()
	Set the base strings used by the different dslash kernel types for autotuning. More...

void	fillAux (KernelType kernel_type, const char *kernel_str)
	Specialize the auxiliary strings for each kernel type. More...

bool	tuneGridDim () const

unsigned int	minThreads () const

template<typename Arg >
void	setParam (Arg &arg)

virtual int	tuningIter () const

int	blockStep () const

int	blockMin () const

unsigned int	maxSharedBytesPerBlock () const
	The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overloaded to return maxDynamicSharedBytesPerBlock. More...

Protected Member Functions inherited from quda::TunableVectorY
virtual unsigned int	sharedBytesPerThread () const

virtual unsigned int	sharedBytesPerBlock (const TuneParam &param) const

Protected Member Functions inherited from quda::Tunable
virtual bool	tuneAuxDim () const

virtual bool	tuneSharedBytes () const

virtual bool	advanceGridDim (TuneParam &param) const

virtual unsigned int	maxBlockSize (const TuneParam &param) const

virtual unsigned int	maxGridSize () const

virtual unsigned int	minGridSize () const

virtual int	gridStep () const
	gridStep sets the step size when iterating the grid size in advanceGridDim. More...

virtual void	resetBlockDim (TuneParam &param) const

unsigned int	maxBlocksPerSM () const
	For some reason this can't be queried from the device properties, so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More...

template<typename F >
void	setMaxDynamicSharedBytesPerBlock (F *func) const
	Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesPerBlock()). More...

unsigned int	maxDynamicSharedBytesPerBlock () const
	This can't be correctly queried in CUDA for all architectures, so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More...

virtual bool	advanceSharedBytes (TuneParam &param) const

virtual bool	advanceAux (TuneParam &param) const

int	writeAuxString (const char *format,...)

Protected Attributes
DslashArg< Float > &	arg

const ColorSpinorField &	out

const ColorSpinorField &	in

const int	nDimComms

char	aux_base [TuneKey::aux_n - 32]

char	aux [8][TuneKey::aux_n]

Protected Attributes inherited from quda::TunableVectorY
unsigned int	vector_length_y

unsigned int	step_y

bool	tune_block_x

Protected Attributes inherited from quda::Tunable
char	aux [TuneKey::aux_n]

CUresult	jitify_error

Detailed Description

template<typename Float>
class quda::Dslash< Float >

Definition at line 12 of file dslash.h.

Constructor & Destructor Documentation

◆ Dslash()

template<typename Float>

quda::Dslash< Float >::Dslash	(	DslashArg< Float > &	arg,
		const ColorSpinorField &	out,
		const ColorSpinorField &	in,
		const char *	src
	)

inline

Definition at line 237 of file dslash.h.

References checkLocation, quda::DslashArg< Float >::commDim, errorQuda, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::Dslash< Float >::fillAux(), quda::Dslash< Float >::fillAuxBase(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, QUDA_CPU_FIELD_LOCATION, and quda::setPackComms().

Here is the call graph for this function:

Member Function Documentation

◆ augmentAux()

template<typename Float>

void quda::Dslash< Float >::augmentAux	(	KernelType	type,
		const char *	extra
	)

inline

Definition at line 281 of file dslash.h.

Referenced by quda::dslash::setMappedGhost().

Here is the caller graph for this function:

◆ blockMin()

template<typename Float>

int quda::Dslash< Float >::blockMin ( ) const

inline protected virtual

Reimplemented from quda::Tunable.

Definition at line 95 of file dslash.h.

◆ blockStep()

template<typename Float>

int quda::Dslash< Float >::blockStep ( ) const

inline protected virtual

Reimplemented from quda::Tunable.

Definition at line 94 of file dslash.h.

◆ bytes()

template<typename Float>

virtual long long quda::Dslash< Float >::bytes ( ) const

inline virtual

Reimplemented from quda::Tunable.

Reimplemented in quda::Laplace< Float, nDim, nColor, Arg >, quda::Staggered< Float, nDim, nColor, Arg >, quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >, quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >, quda::DomainWall5D< Float, nDim, nColor, Arg >, quda::TwistedClover< Float, nDim, nColor, Arg >, and quda::WilsonClover< Float, nDim, nColor, Arg >.

Definition at line 364 of file dslash.h.

References quda::DslashArg< Float >::commDim, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::ColorSpinorField::GhostFace(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, quda::DslashArg< Float >::kernel_type, quda::ColorSpinorField::Ncolor(), quda::ColorSpinorField::Nspin(), quda::LatticeField::Precision(), quda::DslashArg< Float >::reconstruct, quda::ColorSpinorField::Volume(), and quda::DslashArg< Float >::xpay.

Referenced by quda::DomainWall5D< Float, nDim, nColor, Arg >::bytes(), quda::WilsonClover< Float, nDim, nColor, Arg >::bytes(), quda::TwistedClover< Float, nDim, nColor, Arg >::bytes(), quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >::bytes(), quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >::bytes(), and quda::dslash::DslashPolicyTune< Dslash >::bytes().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ Dagger()

template<typename Float>

int quda::Dslash< Float >::Dagger ( ) const

inline

Definition at line 275 of file dslash.h.

References quda::DslashArg< Float >::dagger.

Here is the caller graph for this function:

◆ fillAux()

template<typename Float>

void quda::Dslash< Float >::fillAux	(	KernelType	kernel_type,
		const char *	kernel_str
	)

inline protected

Specialize the auxiliary strings for each kernel type.

Parameters

[in]	kernel_type	The kernel_type we are generating the string for
[in]	kernel_str	String corresponding to the kernel type

Definition at line 56 of file dslash.h.

References quda::TuneKey::aux_n, comm_dim_partitioned_string(), and quda::INTERIOR_KERNEL.

Referenced by quda::Dslash< Float >::Dslash().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ fillAuxBase()

template<typename Float>

void quda::Dslash< Float >::fillAuxBase ( )

inline protected

Set the base strings used by the different dslash kernel types for autotuning.

Definition at line 36 of file dslash.h.

References quda::DslashArg< Float >::commDim, quda::DslashArg< Float >::dagger, and quda::DslashArg< Float >::xpay.

Referenced by quda::Dslash< Float >::Dslash().

Here is the caller graph for this function:

◆ flops()

template<typename Float>

virtual long long quda::Dslash< Float >::flops ( ) const

inline virtual

Implements quda::Tunable.

Definition at line 316 of file dslash.h.

References quda::DslashArg< Float >::commDim, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::ColorSpinorField::GhostFace(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, quda::DslashArg< Float >::kernel_type, quda::ColorSpinorField::Ncolor(), quda::ColorSpinorField::Nspin(), quda::ColorSpinorField::Volume(), and quda::DslashArg< Float >::xpay.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ getAux()

template<typename Float>

const char* quda::Dslash< Float >::getAux ( KernelType type ) const

inline

Definition at line 277 of file dslash.h.

Referenced by quda::dslash::setMappedGhost().

Here is the caller graph for this function:

◆ instantiate() [1/4]

template<typename Float>

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool dagger, bool xpay, typename Arg >

void quda::Dslash< Float >::instantiate	(	TuneParam &	tp,
		Arg &	arg,
		const cudaStream_t &	stream
	)

inline

This instantiate function is used to instantiate the KernelType template required for the multi-GPU dslash kernels.

Parameters

[in]	tp	The tuning parameters to use for this kernel
[in,out]	arg	The argument struct for the kernel
[in]	stream	The cudaStream_t where the kernel will run

Definition at line 119 of file dslash.h.

References errorQuda, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::INTERIOR_KERNEL, quda::LatticeField::Location(), nColor, QUDA_CPU_FIELD_LOCATION, and quda::blas::xpay().

Referenced by quda::DomainWall4D< Float, nDim, nColor, Arg >::apply().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ instantiate() [2/4]

template<typename Float>

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool xpay, typename Arg >

void quda::Dslash< Float >::instantiate	(	TuneParam &	tp,
		Arg &	arg,
		const cudaStream_t &	stream
	)

inline

This instantiate function is used to instantiate the dagger template.

Parameters

[in]	tp	The tuning parameters to use for this kernel
[in,out]	arg	The argument struct for the kernel
[in]	stream	The cudaStream_t where the kernel will run

Definition at line 162 of file dslash.h.

References quda::Dslash< Float >::arg, quda::TuneParam::block, quda::TuneParam::grid, quda::Tunable::jitify_error, nColor, quda::TuneParam::shared_bytes, quda::stream, and quda::blas::xpay().

Here is the call graph for this function:

◆ instantiate() [3/4]

template<typename Float>

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, bool xpay, typename Arg >

void quda::Dslash< Float >::instantiate	(	TuneParam &	tp,
		Arg &	arg,
		const cudaStream_t &	stream
	)

inline

This instantiate function is used to instantiate the nParity template.

Parameters

[in]	tp	The tuning parameters to use for this kernel
[in,out]	arg	The argument struct for the kernel
[in]	stream	The cudaStream_t where the kernel will run

Definition at line 189 of file dslash.h.

References quda::Dslash< Float >::arg, quda::TuneParam::block, errorQuda, quda::TuneParam::grid, quda::Tunable::jitify_error, nColor, quda::Arg< real, Ns, Nc, order >::nParity, quda::TuneParam::shared_bytes, and quda::stream.

◆ instantiate() [4/4]

template<typename Float>

template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, typename Arg >

void quda::Dslash< Float >::instantiate	(	TuneParam &	tp,
		Arg &	arg,
		const cudaStream_t &	stream
	)

inline

This instantiate function is used to instantiate the xpay template.

Parameters

[in]	tp	The tuning parameters to use for this kernel
[in,out]	arg	The argument struct for the kernel
[in]	stream	The cudaStream_t where the kernel will run

Definition at line 217 of file dslash.h.

References quda::Dslash< Float >::arg, quda::TuneParam::block, quda::TuneParam::grid, quda::Tunable::jitify_error, quda::Arg< real, Ns, Nc, order >::nParity, quda::TuneParam::shared_bytes, and quda::stream.

◆ launch()

template<typename Float>

template<typename T , typename Arg >

void quda::Dslash< Float >::launch	(	T *	f,
		const TuneParam &	tp,
		Arg &	arg,
		const cudaStream_t &	stream
	)

inline

Definition at line 101 of file dslash.h.

References quda::TuneParam::block, dagger, deviceProp, quda::TuneParam::grid, nColor, quda::qudaLaunchKernel(), quda::Tunable::setMaxDynamicSharedBytesPerBlock(), quda::TuneParam::shared_bytes, and quda::blas::xpay().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ maxSharedBytesPerBlock()

template<typename Float>

unsigned int quda::Dslash< Float >::maxSharedBytesPerBlock ( ) const

inline protected virtual

The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overloaded to return maxDynamicSharedBytesPerBlock.