Classes
struct	DslashAsync

struct	DslashBasic

struct	DslashCommsPattern

struct	DslashFactory

struct	DslashFusedExterior

struct	DslashFusedExteriorAsync

struct	DslashFusedGDR

struct	DslashFusedGDRRecv

struct	DslashFusedZeroCopy

struct	DslashFusedZeroCopyPack

struct	DslashFusedZeroCopyPackGDRRecv

struct	DslashGDR

struct	DslashGDRRecv

struct	DslashNC

struct	DslashPolicyImp

class	DslashPolicyTune

struct	DslashZeroCopy

struct	DslashZeroCopyPack

struct	DslashZeroCopyPackGDRRecv

Enumerations
enum	QudaDslashPolicy { QudaDslashPolicy::QUDA_DSLASH, QudaDslashPolicy::QUDA_FUSED_DSLASH, QudaDslashPolicy::QUDA_GDR_DSLASH, QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH, QudaDslashPolicy::QUDA_GDR_RECV_DSLASH, QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH, QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH, QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH, QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH, QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH, QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH, QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH, QudaDslashPolicy::QUDA_DSLASH_ASYNC, QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC, QudaDslashPolicy::QUDA_DSLASH_NC, QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED }

enum	QudaP2PPolicy { QudaP2PPolicy::QUDA_P2P_DEFAULT, QudaP2PPolicy::QUDA_P2P_COPY_ENGINE, QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE, QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED }

Functions
template<typename Arg , typename Dslash >
void	setFusedParam (Arg &param, Dslash &dslash, const int *faceVolumeCB)

template<typename Dslash >
void	issueRecv (cudaColorSpinorField &input, const Dslash &dslash, cudaStream_t *stream, bool gdr)
	This helper function simply posts all receives in all directions. More...

template<typename Dslash >
void	issuePack (cudaColorSpinorField &in, const Dslash &dslash, int parity, MemoryLocation location, int packIndex)
	This helper function simply posts the packing kernel needed for halo exchange. More...

template<typename Dslash >
void	issueGather (cudaColorSpinorField &in, const Dslash &dslash)
	This helper function simply posts the device-host memory copies of all halos in all dimensions and directions. More...

template<typename T >
int	getStreamIndex (const T &dslashParam)
	Returns a stream index for posting the pack/scatters to. We desire a stream index that is not being used for peer-to-peer communication. This is used by the fused halo dslash kernels where we post all scatters to the same stream so we only have a single event to wait on before the exterior kernel is applied, and by the zero-copy dslash kernels where we want to post the packing kernel to an unused stream. More...

template<typename Dslash >
bool	commsComplete (cudaColorSpinorField &in, const Dslash &dslash, int dim, int dir, bool gdr_send, bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex=-1)
	Wrapper for querying if communication is finished in the dslash and, if it is, taking the appropriate action: More...

template<typename T >
void	completeDslash (const ColorSpinorField &in, const T &dslashParam)
	Ensure that the dslash is complete. By construction, the dslash will have completed (or is in flight) on this process; however, we must also ensure that no local work begins until any communication in flight from this process to another has completed. This prevents a race condition where we could start updating the local buffers on a subsequent computation before we have finished sending. More...

template<typename Dslash >
void	setMappedGhost (Dslash &dslash, ColorSpinorField &in, bool to_mapped)
	Set the ghosts to the mapped CPU ghost buffer, or unsets if already set. Note this must not be called until after the interior dslash has been called, since that sets the peer-to-peer ghost pointers, and this needs to be done without the mapped ghost enabled. More...

void	enable_policy (QudaDslashPolicy p)

void	disable_policy (QudaDslashPolicy p)

Variables
int	it = 0

cudaEvent_t	packEnd [2]

cudaEvent_t	gatherStart [Nstream]

cudaEvent_t	gatherEnd [Nstream]

cudaEvent_t	scatterStart [Nstream]

cudaEvent_t	scatterEnd [Nstream]

cudaEvent_t	dslashStart [2]

Worker *	aux_worker

bool	dslash_pack_compute

bool	dslash_interior_compute

bool	dslash_exterior_compute

bool	dslash_comms

bool	dslash_copy

static cudaColorSpinorField *	inSpinor

bool	dslash_policy_init

int	first_active_policy

int	first_active_p2p_policy

std::vector< QudaDslashPolicy >	policies

char	policy_string [TuneKey::aux_n]

std::vector< QudaP2PPolicy >	p2p_policies

Enumeration Type Documentation

◆ QudaDslashPolicy

enum quda::dslash::QudaDslashPolicy

strong

Enumerator
QUDA_DSLASH
QUDA_FUSED_DSLASH
QUDA_GDR_DSLASH
QUDA_FUSED_GDR_DSLASH
QUDA_GDR_RECV_DSLASH
QUDA_FUSED_GDR_RECV_DSLASH
QUDA_ZERO_COPY_PACK_DSLASH
QUDA_FUSED_ZERO_COPY_PACK_DSLASH
QUDA_ZERO_COPY_DSLASH
QUDA_FUSED_ZERO_COPY_DSLASH
QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH
QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH
QUDA_DSLASH_ASYNC
QUDA_FUSED_DSLASH_ASYNC
QUDA_DSLASH_NC
QUDA_DSLASH_POLICY_DISABLED

Definition at line 1670 of file dslash_policy.cuh.

◆ QudaP2PPolicy

enum quda::dslash::QudaP2PPolicy

strong

Enumerator
QUDA_P2P_DEFAULT
QUDA_P2P_COPY_ENGINE
QUDA_P2P_REMOTE_WRITE
QUDA_P2P_POLICY_DISABLED

Definition at line 1695 of file dslash_policy.cuh.

Function Documentation

◆ commsComplete()

template<typename Dslash >

bool quda::dslash::commsComplete	(	cudaColorSpinorField &	in,
		const Dslash &	dslash,
		int	dim,
		int	dir,
		bool	gdr_send,
		bool	gdr_recv,
		bool	zero_copy_recv,
		bool	async,
		int	scatterIndex = `-1`
	)

inline

Wrapper for querying if communication is finished in the dslash and, if it is, taking the appropriate action:

if peer-to-peer then we now know that the peer-to-peer copy is now in flight and we are safe to post the cudaStreamWaitEvent in our GPU context
if gdr or zero-copy for the receive buffer then we have nothing else to do, it is now safe to post halo kernel
if staging with async, we release the scatter by setting the appropriate commsEnd_h flag
if basic staging, we post the scatter (host to device memory copy)

Parameters

[in,out]	in	Field being communicated
[in]	dslash	The dslash object
[in]	dim	Dimension we are working on
[in]	dir	Direction we are working on
[in]	gdr_send	Whether GPU Direct RDMA is being used for sending
[in]	gdr_recv	Whether GPU Direct RDMA is being used for receiving
[in]	zero_copy_recv	Whether we are using zero-copy on the receive end (and hence do not need to do CPU->GPU copy)
[in]	async	Whether GPU Direct Async is being used
[in]	scatterIndex	The stream index used for posting the host-to-device memory copy

Definition at line 253 of file dslash_policy.cuh.

References comm_peer2peer_enabled(), quda::cudaColorSpinorField::commsQuery(), quda::Dslash< Float >::Dagger(), errorQuda, quda::LatticeField::getIPCRemoteCopyEvent(), quda::Dslash< Float >::Nface(), quda::Nstream, PROFILE, quda::QUDA_PROFILE_COMMS_QUERY, quda::QUDA_PROFILE_SCATTER, quda::QUDA_PROFILE_STREAM_WAIT_EVENT, quda::qudaStreamWaitEvent(), quda::cudaColorSpinorField::scatter(), quda::stream, and streams.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ completeDslash()

template<typename T >

void quda::dslash::completeDslash	(	const ColorSpinorField &	in,
		const T &	dslashParam
	)

inline

Ensure that the dslash is complete. By construction, the dslash will have completed (or is in flight) on this process; however, we must also ensure that no local work begins until any communication in flight from this process to another has completed. This prevents a race condition where we could start updating the local buffers on a subsequent computation before we have finished sending.

Definition at line 304 of file dslash_policy.cuh.

References comm_peer2peer_enabled(), quda::LatticeField::getIPCCopyEvent(), quda::Nstream, PROFILE, quda::QUDA_PROFILE_STREAM_WAIT_EVENT, quda::qudaStreamWaitEvent(), and streams.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ disable_policy()

void quda::dslash::disable_policy ( QudaDslashPolicy p )

inline

Definition at line 1765 of file dslash_policy.cuh.

References QUDA_DSLASH_POLICY_DISABLED.

◆ enable_policy()

void quda::dslash::enable_policy ( QudaDslashPolicy p )

inline

Definition at line 1763 of file dslash_policy.cuh.

Referenced by quda::dslash::DslashPolicyTune< Dslash >::DslashPolicyTune().

Here is the caller graph for this function:

◆ getStreamIndex()

template<typename T >

int quda::dslash::getStreamIndex ( const T & dslashParam )

inline

Returns a stream index for posting the pack/scatters to. We desire a stream index that is not being used for peer-to-peer communication. This is used by the fused halo dslash kernels where we post all scatters to the same stream so we only have a single event to wait on before the exterior kernel is applied, and by the zero-copy dslash kernels where we want to post the packing kernel to an unused stream.

Returns: stream index

Definition at line 213 of file dslash_policy.cuh.

References comm_peer2peer_enabled(), and index().

Referenced by quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopy< Dslash >::operator()(), and quda::dslash::DslashFusedZeroCopy< Dslash >::operator()().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ issueGather()

template<typename Dslash >

void quda::dslash::issueGather	(	cudaColorSpinorField &	in,
		const Dslash &	dslash
	)

inline

This helper function simply posts the device-host memory copies of all halos in all dimensions and directions.

Parameters

[out]	in	Field whose halos we are communicating
[in]	dslash	The dslash object

Definition at line 180 of file dslash_policy.cuh.

References quda::LatticeField::bufferIndex, comm_peer2peer_enabled(), quda::Dslash< Float >::Dagger(), quda::Dslash< Float >::dslashParam, quda::cudaColorSpinorField::gather(), quda::getKernelPackT(), quda::Dslash< Float >::Nface(), PROFILE, quda::QUDA_PROFILE_EVENT_RECORD, quda::QUDA_PROFILE_GATHER, quda::QUDA_PROFILE_STREAM_WAIT_EVENT, quda::qudaEventRecord(), quda::qudaStreamWaitEvent(), and streams.

Referenced by quda::dslash::DslashBasic< Dslash >::operator()(), quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashGDRRecv< Dslash >::operator()(), and quda::dslash::DslashFusedGDRRecv< Dslash >::operator()().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ issuePack()

template<typename Dslash >

void quda::dslash::issuePack	(	cudaColorSpinorField &	in,
		const Dslash &	dslash,
		int	parity,
		MemoryLocation	location,
		int	packIndex
	)

inline

This helper function simply posts the packing kernel needed for halo exchange.

Parameters

[out]	in	Field that we are packing
[in]	dslash	The dslash object
[in]	parity	Field parity
[in]	location	Memory location where we are packing to: if Host is requested, the non-p2p halos will be sent to host; if Remote is requested, the p2p halos will be written directly
[in]	packIndex	Stream index where the packing kernel will run

Definition at line 139 of file dslash_policy.cuh.

References quda::arg(), quda::LatticeField::bufferIndex, comm_peer2peer_enabled(), quda::Dslash< Float >::Dagger(), quda::Device, quda::Dslash< Float >::dslashParam, errorQuda, quda::getKernelPackT(), quda::Host, quda::Dslash< Float >::Nface(), quda::pack(), quda::cudaColorSpinorField::pack(), parity, PROFILE, QUDA_MAX_DIM, quda::QUDA_PROFILE_EVENT_RECORD, quda::QUDA_PROFILE_PACK_KERNEL, quda::qudaEventRecord(), quda::Remote, and streams.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ issueRecv()

template<typename Dslash >

void quda::dslash::issueRecv	(	cudaColorSpinorField &	input,
		const Dslash &	dslash,
		cudaStream_t *	stream,
		bool	gdr
	)

inline

This helper function simply posts all receives in all directions.

Parameters

[out]	input	Field that we are doing halo exchange
[in]	dslash	The dslash object
[in]	stream	Stream where the receive is being posted (effectively ignored)
[in]	gdr	Whether we are using GPU Direct RDMA or not

Definition at line 118 of file dslash_policy.cuh.

References quda::Dslash< Float >::Dagger(), quda::Dslash< Float >::dslashParam, quda::Dslash< Float >::Nface(), PROFILE, quda::QUDA_PROFILE_COMMS_START, quda::cudaColorSpinorField::recvStart(), and quda::stream.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ setFusedParam()

template<typename Arg , typename Dslash >

void quda::dslash::setFusedParam	(	Arg &	param,
		Dslash &	dslash,
		const int *	faceVolumeCB
	)

inline

Definition at line 81 of file dslash_policy.cuh.

References quda::Dslash< Float >::dslashParam, quda::EXTERIOR_KERNEL_ALL, and quda::Dslash< Float >::Nface().

Referenced by quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashFusedGDR< Dslash >::operator()(), quda::dslash::DslashFusedGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPackGDRRecv< Dslash >::operator()(), and quda::dslash::DslashFusedZeroCopy< Dslash >::operator()().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ setMappedGhost()

template<typename Dslash >

void quda::dslash::setMappedGhost	(	Dslash &	dslash,
		ColorSpinorField &	in,
		bool	to_mapped
	)

inline

Set the ghosts to the mapped CPU ghost buffer, or unsets if already set. Note this must not be called until after the interior dslash has been called, since that sets the peer-to-peer ghost pointers, and this needs to be done without the mapped ghost enabled.

Parameters