17 template <KernelType type> __host__ __device__
inline bool doHalo(
int dim = -1)
35 template <KernelType type> __host__ __device__
inline bool doBulk()
55 template <KernelType type,
typename Arg> __host__ __device__
inline bool isComplete(
const Arg &
arg,
int coord[])
63 incomplete = incomplete || (arg.commDim[3] && (coord[3] == 0 || coord[3] == (arg.dc.X[3] - 1)));
65 incomplete = incomplete || (arg.commDim[2] && (coord[2] == 0 || coord[2] == (arg.dc.X[2] - 1)));
67 incomplete = incomplete || (arg.commDim[1] && (coord[1] == 0 || coord[1] == (arg.dc.X[1] - 1)));
69 incomplete = incomplete || (arg.commDim[0] && (coord[0] == 0 || coord[0] == (arg.dc.X[0] - 1)));
87 template <
int nDim, QudaPCType pc_type, KernelType kernel_type,
typename Arg,
int nface_ = 1>
95 const int Ls = (nDim == 5 && pc_type ==
QUDA_5D_PC ? (int)arg.dim[4] : 1);
100 getCoords5CB(coord, idx, arg.dim, arg.X0h, parity, pc_type);
106 const int face_size = nface_ * arg.dc.ghostFaceCB[kernel_type] *
Ls;
107 const int face_num = idx >= face_size;
108 idx -= face_num * face_size;
109 coordsFromFaceIndex<nDim, pc_type, kernel_type, nface_>(
X, x_cb, coord, idx, face_num,
parity,
arg);
114 if (idx < arg.threadDimMapUpper[0] * Ls) {
116 const int face_size = nface_ * arg.dc.ghostFaceCB[dim] *
Ls;
117 const int face_num = idx >= face_size;
118 idx -= face_num * face_size;
119 coordsFromFaceIndex<nDim, pc_type, 0, nface_>(
X, x_cb, coord, idx, face_num,
parity,
arg);
120 }
else if (idx < arg.threadDimMapUpper[1] * Ls) {
122 idx -= arg.threadDimMapLower[1] *
Ls;
123 const int face_size = nface_ * arg.dc.ghostFaceCB[dim] *
Ls;
124 const int face_num = idx >= face_size;
125 idx -= face_num * face_size;
126 coordsFromFaceIndex<nDim, pc_type, 1, nface_>(
X, x_cb, coord, idx, face_num,
parity,
arg);
127 }
else if (idx < arg.threadDimMapUpper[2] * Ls) {
129 idx -= arg.threadDimMapLower[2] *
Ls;
130 const int face_size = nface_ * arg.dc.ghostFaceCB[dim] *
Ls;
131 const int face_num = idx >= face_size;
132 idx -= face_num * face_size;
133 coordsFromFaceIndex<nDim, pc_type, 2, nface_>(
X, x_cb, coord, idx, face_num,
parity,
arg);
136 idx -= arg.threadDimMapLower[3] *
Ls;
137 const int face_size = nface_ * arg.dc.ghostFaceCB[dim] *
Ls;
138 const int face_num = idx >= face_size;
139 idx -= face_num * face_size;
140 coordsFromFaceIndex<nDim, pc_type, 3, nface_>(
X, x_cb, coord, idx, face_num,
parity,
arg);
155 template <
int dim,
typename Arg>
inline __host__ __device__
bool inBoundary(
const int coord[],
const Arg &
arg)
157 return ((coord[dim] >= arg.dim[dim] - arg.nFace) || (coord[dim] < arg.nFace));
187 template <KernelType kernel_type,
typename Arg>
188 inline __device__
bool isActive(
bool &active,
int threadDim,
int offsetDim,
const int coord[],
const Arg &
arg)
194 if (!arg.ghostDim[offsetDim])
return false;
197 if (threadDim < offsetDim)
return false;
204 if (!arg.ghostDim[3])
break;
205 if (arg.ghostDim[3] && inBoundary<3>(coord, arg))
return false;
209 if ((!arg.ghostDim[3]) && (!arg.ghostDim[2]))
break;
210 if (arg.ghostDim[3] && inBoundary<3>(coord, arg))
return false;
211 if (arg.ghostDim[2] && inBoundary<2>(coord, arg))
return false;
215 if ((!arg.ghostDim[3]) && (!arg.ghostDim[2]) && (!arg.ghostDim[1]))
break;
216 if (arg.ghostDim[3] && inBoundary<3>(coord, arg))
return false;
217 if (arg.ghostDim[2] && inBoundary<2>(coord, arg))
return false;
218 if (arg.ghostDim[1] && inBoundary<1>(coord, arg))
return false;
267 int spin_project,
const int *comm_override) :
269 nParity(in.SiteSubset()),
272 X0h(nParity == 2 ? in.
X(0) / 2 : in.
X(0)),
273 dim {(3 -
nParity) * in.
X(0), in.
X(1), in.
X(2), in.
X(3), in.
Ndim() == 5 ? in.
X(4) : 1},
279 threadDimMapLower {},
280 threadDimMapUpper {},
286 for (
int d = 0; d < 4; d++) {
296 dc =
in.getDslashConstant();
300 template <
typename Float> std::ostream &operator<<(std::ostream &out, const DslashArg<Float> &
arg)
302 out <<
"parity = " <<
arg.parity << std::endl;
303 out <<
"nParity = " <<
arg.nParity << std::endl;
304 out <<
"nFace = " <<
arg.nFace << std::endl;
305 out <<
"reconstruct = " <<
arg.reconstruct << std::endl;
306 out <<
"X0h = " <<
arg.X0h << std::endl;
308 for (
int i = 0; i < 5; i++)
out <<
arg.dim[i] << (i < 4 ?
", " :
" }");
310 out <<
"commDim = { ";
311 for (
int i = 0; i < 4; i++)
out <<
arg.commDim[i] << (i < 3 ?
", " :
" }");
313 out <<
"ghostDim = { ";
314 for (
int i = 0; i < 4; i++)
out <<
arg.ghostDim[i] << (i < 3 ?
", " :
" }");
316 out <<
"volumeCB = " <<
arg.volumeCB << std::endl;
317 out <<
"dagger = " <<
arg.dagger << std::endl;
318 out <<
"xpay = " <<
arg.xpay << std::endl;
319 out <<
"kernel_type = " <<
arg.kernel_type << std::endl;
320 out <<
"remote_write = " <<
arg.remote_write << std::endl;
321 out <<
"threads = " <<
arg.threads << std::endl;
322 out <<
"threadDimMapLower = { ";
323 for (
int i = 0; i < 4; i++)
out <<
arg.threadDimMapLower[i] << (i < 3 ?
", " :
" }");
325 out <<
"threadDimMapUpper = { ";
326 for (
int i = 0; i < 4; i++)
out <<
arg.threadDimMapUpper[i] << (i < 3 ?
", " :
" }");
328 out <<
"twist_a = " <<
arg.twist_a;
329 out <<
"twist_b = " <<
arg.twist_b;
330 out <<
"twist_c = " <<
arg.twist_c;
Constants used by dslash and packing kernels.
static __device__ __host__ void getCoords5CB(int x[5], int cb_index, const I X[5], J X0h, int parity, QudaPCType pc_type)
static __device__ __host__ void getCoordsCB(int x[], int cb_index, const I X[], J X0h, int parity)
__host__ __device__ bool doBulk()
Helper function to determine if we should do interior computation.
const QudaReconstructType reconstruct
mapper< Float >::type real
__device__ bool isActive(bool &active, int threadDim, int offsetDim, const int coord[], const Arg &arg)
Compute whether this thread should be active for updating a given offsetDim halo. For non-fused halo update kernels this is a trivial kernel that just checks if the given dimension is partitioned and, if so, returns true.
Generic reconstruction helper with no reconstruction.
__host__ __device__ bool doHalo(int dim=-1)
Helper function to determine if we should do halo computation.
Provides precision abstractions and defines the register precision given the storage precision using ...
__host__ __device__ bool inBoundary(const int coord[], const Arg &arg)
Compute whether the provided coordinate is within the halo region boundary of a given dimension...
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
DslashArg(const ColorSpinorField &in, const GaugeField &U, int parity, bool dagger, bool xpay, int nFace, int spin_project, const int *comm_override)
int comm_dim_partitioned(int dim)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
__host__ __device__ bool isComplete(const Arg &arg, int coord[])
Helper function to determine if the application of the derivative in the dslash is complete...