This graph shows which files directly or indirectly include this file:

Enumerations
enum	IndexType { EVEN_X = 0, EVEN_Y = 1, EVEN_Z = 2, EVEN_T = 3 }

Functions
template<int nDim, QudaDWFPCType type, int dim, int nLayers, int face_num, typename Param >
static __device__ int	indexFromFaceIndex (int face_idx, const Param &param)
	Compute global checkerboard index from face index. The following indexing routines work for arbitrary (including odd) lattice dimensions. Specifically, we compute an index into the local volume from an index into the face. This is used by the Wilson-like face packing routines. This routine can handle 4-d or 5-d fields, and well as 4-d or 5-d preconditioning. More...

template<int dim, int nLayers, int face_num, typename Param >
static __device__ int	indexFromFaceIndexExtended (int face_idx, const Param &param)
	Compute global extended checkerboard index from face index. The following indexing routines work for arbitrary (including odd) lattice dimensions. Specifically, we compute an index into the local volume from an index into the face. This is used by the Wilson-like face packing routines. More...

template<int dim, int nLayers, int face_num, typename Param >
static __device__ int	indexFromFaceIndexStaggered (int face_idx_in, const Param &param)
	Compute global checkerboard index from face index. The following indexing routines work for arbitrary lattice dimensions (though perhaps not odd like thw Wilson variant?) Specifically, we compute an index into the local volume from an index into the face. This is used by the staggered-like face packing routines, and is different from the Wilson variant since here the halo depth is tranversed in a different order - here the halo depth is the faster running dimension. More...

template<int dim, int nLayers, int face_num, typename Param >
static __device__ int	indexFromFaceIndexExtendedStaggered (int face_idx, const Param &param)
	Compute global extended checkerboard index from face index. The following indexing routines work for arbitrary lattice dimensions (though perhaps not odd like thw Wilson variant?) Specifically, we compute an index into the local volume from an index into the face. This is used by the staggered-like face packing routines, and is different from the Wilson variant since here the halo depth is tranversed in a different order - here the halo depth is the faster running dimension. More...

template<KernelType dim, int nLayers, int Dir, typename Param >
static __device__ void	coordsFromFaceIndexStaggered (int x[], int idx, const Param &param)
	Compute the full-lattice coordinates from the input face index. This is used by the staggered halo update kernels. More...

template<int nDim, QudaDWFPCType type, int dim_, int nLayers, typename Int , typename Param >
static __device__ void	coordsFromFaceIndex (int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, const Param &param)
	Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo update kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning. More...

template<int nDim, QudaDWFPCType pc_type, IndexType idxType, typename T , typename Param >
static __device__ __forceinline__ void	coordsFromIndex (int &idx, T *x, int &cb_idx, const Param &param)
	Compute coordinates from index into the checkerboard (used by the interior Dslash kernels). This is used by the Wilson-like interior update kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning. More...

template<IndexType idxType, typename Int , typename Param >
static __device__ __forceinline__ void	coordsFromIndex3D (int &idx, Int *const x, int &cb_idx, const Param &param)
	Compute coordinates from index into the checkerboard (used by the interior Dslash kernels). This is the variant used by the shared memory wilson dslash. More...

template<int dim, typename T >
static __device__ bool	inBoundary (const int depth, const int coord[], const T X[])
	Compute whether the provided coordinate is within the halo region boundary of a given dimension. More...

template<typename T >
static __device__ bool	isActive (const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
	Compute whether this thread should be active for updating the a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at en edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread. More...

template<int nDim = 4, typename Param >
__device__ int	dimFromFaceIndex (int &face_idx, const int tid, const Param &param)
	Determines which face a given thread is computing. Also rescale face_idx so that is relative to a given dimension. If 5-d variant if called, then it is assumed that param.threads contains only the 3-d surface of threads but face_idx is a 4-d index (surface * fifth dimension). At present multi-src staggered uses the 4-d variant since the face_idx that is passed in is the 3-d surface not the 4-d one. More...

template<int nDim = 4, typename Param >
__device__ int	dimFromFaceIndex (int &face_idx, const Param &param)

template<int nDim, int nLayers, typename I , typename Param >
static __device__ void	faceIndexFromCoords (int &face_idx, I *const x, int face_dim, const Param &param)
	Compute the face index from the lattice coordinates. More...

template<typename T >
__device__ int	block_idx (const T &swizzle)
	Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotuning to find the optimum swizzle factor. Specfically, the thread block id is remapped by transposing its coordinates: if the original order can be parametrized by. More...

__device__ float	__fast_pow (float a, int b)

Enumeration Type Documentation

◆ IndexType

enum IndexType

Enumerator
EVEN_X
EVEN_Y
EVEN_Z
EVEN_T

Definition at line 530 of file dslash_index.cuh.

Function Documentation

◆ __fast_pow()

__device__ float __fast_pow	(	float	a,
		int	b
	)

inline

Definition at line 911 of file dslash_index.cuh.

References a, b, fabsf(), and deg_tm_dslash_cuda_gen::sign().

Here is the call graph for this function:

◆ block_idx()

template<typename T >

__device__ int block_idx ( const T & swizzle )

inline

Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotuning to find the optimum swizzle factor. Specfically, the thread block id is remapped by transposing its coordinates: if the original order can be parametrized by.

blockIdx.x = j * swizzle + i,

then the new order is

block_idx = i * (gridDim.x / swizzle) + j

We need to factor out any remainder and leave this in original ordering.

Parameters

[in] swizzle Swizzle factor to be applied

Returns: Swizzled block index

Definition at line 883 of file dslash_index.cuh.

References gridDim, and fused_exterior_ndeg_tm_dslash_cuda_gen::i.

◆ coordsFromFaceIndex()

template<int nDim, QudaDWFPCType type, int dim_, int nLayers, typename Int , typename Param >

static __device__ void coordsFromFaceIndex	(	int &	idx,
		int &	cb_idx,
		Int *const	x,
		int	face_idx,
		const int &	face_num,
		const Param &	param
	)

inlinestatic

Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo update kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning.

Parameters

	idx[out]	The full lattice coordinate
	cb_idx[out]	The checkboarded lattice coordinate
	x[out]	Coordinates we are computing
	.dc.face_idx[in]	Input checkerboarded face index
[in]	param	Parameter struct with required meta data

Definition at line 442 of file dslash_index.cuh.

References dim, face_idx, face_num, idx, INTERIOR_KERNEL, param, QUDA_4D_PC, QUDA_5D_PC, QudaGaugeParam_s::X, X, and x.

◆ coordsFromFaceIndexStaggered()

template<KernelType dim, int nLayers, int Dir, typename Param >

static __device__ void coordsFromFaceIndexStaggered	(	int	x[],
		int	idx,
		const Param &	param
	)

inlinestatic

Compute the full-lattice coordinates from the input face index. This is used by the staggered halo update kernels.

Parameters

	x[out]	Coordinates we are computing
	idx[in]	Input checkerboard face index
[in]	param	Parameter struct with required meta data

Definition at line 362 of file dslash_index.cuh.

References dim, EXTERIOR_KERNEL_T, EXTERIOR_KERNEL_X, EXTERIOR_KERNEL_Y, EXTERIOR_KERNEL_Z, idx, param, QudaGaugeParam_s::X, X, x, x0h, Xh, za, and zb.

◆ coordsFromIndex()

template<int nDim, QudaDWFPCType pc_type, IndexType idxType, typename T , typename Param >

static __device__ __forceinline__ void coordsFromIndex	(	int &	idx,
		T *	x,
		int &	cb_idx,
		const Param &	param
	)

static

Compute coordinates from index into the checkerboard (used by the interior Dslash kernels). This is used by the Wilson-like interior update kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning.

Parameters

	idx[out]	The full lattice coordinate
	cb_idx[out]	The checkboarded lattice coordinate
	x[out]	Coordinates we are computing
	idx[in]	Input checkerboarded face index
[in]	param	Parameter struct with required meta data

(X[0] & 1)

(X[1] & 1)

(X[2] & 1)

Definition at line 550 of file dslash_index.cuh.

References deg_tm_dslash_cuda_gen::block(), EVEN_X, EVEN_Y, EVEN_Z, fused_exterior_ndeg_tm_dslash_cuda_gen::i, idx, param, QUDA_4D_PC, s, t, QudaGaugeParam_s::X, X, x, y, and z.

Referenced by quda::neighborIndex().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ coordsFromIndex3D()

template<IndexType idxType, typename Int , typename Param >

static __device__ __forceinline__ void coordsFromIndex3D	(	int &	idx,
		Int *const	x,
		int &	cb_idx,
		const Param &	param
	)

static

Compute coordinates from index into the checkerboard (used by the interior Dslash kernels). This is the variant used by the shared memory wilson dslash.

Parameters

[out]	idx	Linear index
[out]	x	Compute coordinates
[out]	ch_idx	Linear checkboard index
[in]	param	Parameter struct with required meta data

Definition at line 697 of file dslash_index.cuh.

References blockDim, EVEN_X, idx, param, X, and x.

◆ dimFromFaceIndex() [1/2]

template<int nDim = 4, typename Param >

__device__ int dimFromFaceIndex	(	int &	face_idx,
		const int	tid,
		const Param &	param
	)

inline

Determines which face a given thread is computing. Also rescale face_idx so that is relative to a given dimension. If 5-d variant if called, then it is assumed that param.threads contains only the 3-d surface of threads but face_idx is a 4-d index (surface * fifth dimension). At present multi-src staggered uses the 4-d variant since the face_idx that is passed in is the 3-d surface not the 4-d one.

Parameters

[in]	face_idx	Face index
[in]	param	Input parameters

Returns: dimension this face_idx corresponds to

Definition at line 808 of file dslash_index.cuh.

References face_idx, param, and s.

◆ dimFromFaceIndex() [2/2]

template<int nDim = 4, typename Param >

__device__ int dimFromFaceIndex	(	int &	face_idx,
		const Param &	param
	)

inline

Definition at line 834 of file dslash_index.cuh.

References face_idx, and param.

◆ faceIndexFromCoords()

template<int nDim, int nLayers, typename I , typename Param >

static __device__ void faceIndexFromCoords	(	int &	face_idx,
		I *const	x,
		int	face_dim,
		const Param &	param
	)

inlinestatic

Compute the face index from the lattice coordinates.

Parameters

[in]	face_idx	Face index
[in]	x	Lattice coordinates
[in]	face_dim	Which dimension
[in]	param	Input parameters

Returns: dimension this face_idx corresponds to

Definition at line 846 of file dslash_index.cuh.

References face_idx, param, QudaGaugeParam_s::X, x, and y.

◆ inBoundary()

template<int dim, typename T >

static __device__ bool inBoundary	(	const int	depth,
		const int	coord[],
		const T	X[]
	)

inlinestatic

Compute whether the provided coordinate is within the halo region boundary of a given dimension.

Parameters

[in]	depth	Depth of halo
[in]	coord	Coordinates
[in]	X	Lattice dimensions

Returns: True if in boundary, else false

Definition at line 727 of file dslash_index.cuh.

References coord, depth, dim, and X.

◆ indexFromFaceIndex()

template<int nDim, QudaDWFPCType type, int dim, int nLayers, int face_num, typename Param >

static __device__ int indexFromFaceIndex	(	int	face_idx,
		const Param &	param
	)

inlinestatic

Compute global checkerboard index from face index. The following indexing routines work for arbitrary (including odd) lattice dimensions. Specifically, we compute an index into the local volume from an index into the face. This is used by the Wilson-like face packing routines. This routine can handle 4-d or 5-d fields, and well as 4-d or 5-d preconditioning.

Parameters

[in]	face_idx	Checkerboarded face index
[in]	param	Parameter struct with required meta data

Returns: Global checkerboard coordinate

Definition at line 14 of file dslash_index.cuh.

References dim, face_idx, face_num, idx, if(), param, QUDA_4D_PC, QUDA_5D_PC, s, t, QudaGaugeParam_s::X, y, and z.

Here is the call graph for this function:

◆ indexFromFaceIndexExtended()

template<int dim, int nLayers, int face_num, typename Param >

static __device__ int indexFromFaceIndexExtended	(	int	face_idx,
		const Param &	param
	)

inlinestatic

Compute global extended checkerboard index from face index. The following indexing routines work for arbitrary (including odd) lattice dimensions. Specifically, we compute an index into the local volume from an index into the face. This is used by the Wilson-like face packing routines.

Parameters

[in]	face_idx	Checkerboarded face index
[in]	param	Parameter struct with required meta data

Returns: Global extended checkerboard coordinate

Definition at line 110 of file dslash_index.cuh.

References dim, face_idx, face_num, idx, param, R, t, QudaGaugeParam_s::X, X, y, and z.

◆ indexFromFaceIndexExtendedStaggered()

template<int dim, int nLayers, int face_num, typename Param >

static __device__ int indexFromFaceIndexExtendedStaggered	(	int	face_idx,
		const Param &	param
	)

inlinestatic

Compute global extended checkerboard index from face index. The following indexing routines work for arbitrary lattice dimensions (though perhaps not odd like thw Wilson variant?) Specifically, we compute an index into the local volume from an index into the face. This is used by the staggered-like face packing routines, and is different from the Wilson variant since here the halo depth is tranversed in a different order - here the halo depth is the faster running dimension.

Parameters

[in]	face_idx_in	Checkerboarded face index
[in]	param	Parameter struct with required meta data

Returns: Global extended checkerboard coordinate

Definition at line 276 of file dslash_index.cuh.

References dim, face_idx, face_num, idx, param, R, t, V, QudaGaugeParam_s::X, X, y, and z.

◆ indexFromFaceIndexStaggered()

template<int dim, int nLayers, int face_num, typename Param >

static __device__ int indexFromFaceIndexStaggered	(	int	face_idx_in,
		const Param &	param
	)

inlinestatic

Compute global checkerboard index from face index. The following indexing routines work for arbitrary lattice dimensions (though perhaps not odd like thw Wilson variant?) Specifically, we compute an index into the local volume from an index into the face. This is used by the staggered-like face packing routines, and is different from the Wilson variant since here the halo depth is tranversed in a different order - here the halo depth is the faster running dimension.

Parameters

[in]	face_idx_in	Checkerboarded face index
[in]	param	Parameter struct with required meta data

Returns: Global checkerboard coordinate

Definition at line 207 of file dslash_index.cuh.

References dim, face_idx, face_num, idx, param, s, t, QudaGaugeParam_s::X, X, y, and z.

◆ isActive()

template<typename T >

static __device__ bool isActive	(	const int	threadDim,
		int	offsetDim,
		int	offset,
		const int	y[],
		const int	partitioned[],
		const T	X[]
	)

inlinestatic

Compute whether this thread should be active for updating the a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at en edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.

Parameters

[in]	threadDim	Prescribed dimension of this thread
[in]	offsetDim	The dimension we are querying whether this thread should be responsible
[in]	offset	The size of the hop
[in]	y	Site coordinate
[in]	partitioned	Array of which dimensions have been partitioned
[in]	X	Lattice dimensions

Returns: True if this thread is active

Definition at line 753 of file dslash_index.cuh.

References offset, width, X, and y.

Referenced by for().

Here is the caller graph for this function:

Enumerations

Functions

Enumeration Type Documentation

◆ IndexType

Function Documentation

◆ __fast_pow()

◆ block_idx()

◆ coordsFromFaceIndex()

◆ coordsFromFaceIndexStaggered()

◆ coordsFromIndex()

◆ coordsFromIndex3D()

◆ dimFromFaceIndex() [1/2]

◆ dimFromFaceIndex() [2/2]

◆ faceIndexFromCoords()

◆ inBoundary()

◆ indexFromFaceIndex()

◆ indexFromFaceIndexExtended()

◆ indexFromFaceIndexExtendedStaggered()

◆ indexFromFaceIndexStaggered()

◆ isActive()