QUDA
1.0.0
|
Go to the source code of this file.
Namespaces | |
quda | |
Enumerations | |
enum | quda::KernelType { quda::INTERIOR_KERNEL = 5, quda::EXTERIOR_KERNEL_ALL = 6, quda::EXTERIOR_KERNEL_X = 0, quda::EXTERIOR_KERNEL_Y = 1, quda::EXTERIOR_KERNEL_Z = 2, quda::EXTERIOR_KERNEL_T = 3, quda::KERNEL_POLICY = 7 } |
Functions | |
template<typename I , typename J , typename K > | |
static __device__ __host__ int | quda::linkIndexShift (const I x[], const J dx[], const K X[4]) |
template<typename I , typename J , typename K > | |
static __device__ __host__ int | quda::linkIndexShift (I y[], const I x[], const J dx[], const K X[4]) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndex (const int x[], const I X[4]) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndex (int y[], const int x[], const I X[4]) |
template<typename I , int n> | |
static __device__ __host__ int | quda::linkIndexDn (const int x[], const I X[4], const int mu) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndexM1 (const int x[], const I X[4], const int mu) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndexM3 (const int x[], const I X[4], const int mu) |
template<typename I > | |
static __device__ __host__ int | quda::linkNormalIndexP1 (const int x[], const I X[4], const int mu) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndexP1 (const int x[], const I X[4], const int mu) |
template<typename I > | |
static __device__ __host__ int | quda::linkIndexP3 (const int x[], const I X[4], const int mu) |
template<int nDim = 4, typename Arg > | |
static __device__ __host__ int | quda::getNeighborIndexCB (const int x[], int mu, int dir, const Arg &arg) |
Compute the checkerboard 1-d index for the nearest neighbor. More... | |
template<typename I , typename J > | |
static __device__ __host__ void | quda::getCoordsCB (int x[], int cb_index, const I X[], J X0h, int parity) |
template<typename I > | |
static __device__ __host__ void | quda::getCoords (int x[], int cb_index, const I X[], int parity) |
template<typename I , typename J > | |
static __device__ __host__ void | quda::getCoordsExtended (I x[], int cb_index, const J X[], int parity, const int R[]) |
template<typename I , typename J > | |
static __device__ __host__ void | quda::getCoords5CB (int x[5], int cb_index, const I X[5], J X0h, int parity, QudaPCType pc_type) |
template<typename I > | |
static __device__ __host__ void | quda::getCoords5 (int x[5], int cb_index, const I X[5], int parity, QudaPCType pc_type) |
template<typename I > | |
static __device__ __host__ int | quda::getIndexFull (int cb_index, const I X[4], int parity) |
template<int dir, int nDim = 4, typename I > | |
__device__ __host__ int | quda::ghostFaceIndex (const int x_[], const I X_[], int dim, int nFace) |
template<int dir, int nDim = 4, typename I > | |
__device__ __host__ int | quda::ghostFaceIndexStaggered (const int x_[], const I X_[], int dim, int nFace) |
template<int nDim, QudaPCType type, int dim_, int nLayers, typename Int , typename Arg > | |
__device__ __host__ void | quda::coordsFromFaceIndex (int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, int parity, const Arg &arg) |
Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo update kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning. More... | |
template<int nDim, QudaPCType type, int dim_, int nLayers, typename Int , typename Arg > | |
__device__ __host__ void | quda::coordsFromFaceIndex (int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, const Arg &arg) |
Overloaded variant of coordsFromFaceIndex where we use the parity declared in arg. More... | |
template<int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg > | |
__device__ __host__ int | quda::indexFromFaceIndex (int face_idx, int parity, const Arg &arg) |
Compute the checkerboard lattice index from the input face index. This is used by the Wilson-like halo packing kernels, and can deal with 4-d or 5-d field and 4-d or 5-d preconditioning. More... | |
template<int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg > | |
__device__ __host__ int | quda::indexFromFaceIndex (int face_idx, const Arg &arg) |
Overloaded variant of indexFromFaceIndex where we use the parity declared in arg. More... | |
template<int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg > | |
static __device__ int | quda::indexFromFaceIndexStaggered (int face_idx_in, int parity, const Arg &arg) |
Compute global checkerboard index from face index. The following indexing routines work for arbitrary lattice dimensions (though perhaps not odd, like the Wilson variant?). Specifically, we compute an index into the local volume from an index into the face. This is used by the staggered-like face packing routines, and is different from the Wilson variant since here the halo depth is traversed in a different order - here the halo depth is the faster running dimension. More... | |
template<int nDim = 4, typename Arg > | |
__host__ __device__ int | quda::dimFromFaceIndex (int &face_idx, int tid, const Arg &arg) |
Determines which face a given thread is computing. Also rescales face_idx so that it is relative to a given dimension. If the 5-d variant is called, then it is assumed that arg.threads contains only the 3-d surface of threads but face_idx is a 4-d index (surface * fifth dimension). At present multi-src staggered uses the 4-d variant since the face_idx that is passed in is the 3-d surface not the 4-d one. More... | |
template<int nDim = 4, typename Arg > | |
__host__ __device__ int | quda::dimFromFaceIndex (int &face_idx, const Arg &arg) |
template<typename T > | |
__device__ int | quda::block_idx (const T &swizzle) |
Swizzler for reordering the (x) thread block indices - use in conjunction with swizzle-factor autotuning to find the optimum swizzle factor. Specifically, the thread block id is remapped by transposing its coordinates: if the original order can be parametrized by. More... | |
template<typename Arg > | |
__device__ __host__ auto | quda::StaggeredPhase (const int coords[], int dim, int dir, const Arg &arg) -> typename Arg::real |
Compute the staggered phase factor at unit shift from the current lattice coordinates. The routine below optimizes out the shift where possible, hence is only visible where we need to consider the boundary condition. More... | |