16 for (
int i = 0; i < 4; i++)
commDim[i] = comm_dim[i];
20 template <
typename Float,
int nSpin,
int nColor,
bool spin_project>
21 std::ostream &operator<<(std::ostream &out, const PackArg<Float, nSpin, nColor, spin_project> &
arg)
23 out <<
"parity = " <<
arg.parity << std::endl;
24 out <<
"nParity = " <<
arg.nParity << std::endl;
25 out <<
"pc_type = " <<
arg.pc_type << std::endl;
26 out <<
"nFace = " <<
arg.nFace << std::endl;
27 out <<
"dagger = " <<
arg.dagger << std::endl;
28 out <<
"a = " <<
arg.a << std::endl;
29 out <<
"b = " <<
arg.b << std::endl;
30 out <<
"c = " <<
arg.c << std::endl;
31 out <<
"twist = " <<
arg.twist << std::endl;
32 out <<
"threads = " <<
arg.threads << std::endl;
33 out <<
"threadDimMapLower = { ";
34 for (
int i = 0; i < 4; i++)
out <<
arg.threadDimMapLower[i] << (i < 3 ?
", " :
" }");
36 out <<
"threadDimMapUpper = { ";
37 for (
int i = 0; i < 4; i++)
out <<
arg.threadDimMapUpper[i] << (i < 3 ?
", " :
" }");
39 out <<
"sites_per_block = " <<
arg.sites_per_block << std::endl;
66 if (location &
Host) {
77 for (
int d = 0; d < in.
Ndim(); d++) nDimComms +=
commDim[d];
78 return max * nDimComms;
86 if (location &
Host) {
97 for (
int d = 0; d < in.
Ndim(); d++) nDimComms +=
commDim[d];
98 return min * nDimComms;
109 if (location &
Host) {
114 for (
int d = 0; d < in.
Ndim(); d++) nDimComms +=
commDim[d];
115 return 2 * nDimComms;
127 strcpy(
aux,
"policy_kernel,");
130 for (
int i = 0; i < 4; i++) comm[i] = (
commDim[i] ?
'1' :
'0');
132 strcat(
aux,
",comm=");
136 if (dagger && in.
Nspin() == 4) { strcat(
aux,
",dagger"); }
139 case 1: strcat(
aux,
",nFace=1");
break;
140 case 3: strcat(
aux,
",nFace=3");
break;
141 default:
errorQuda(
"Number of faces not supported");
144 twist = ((b != 0.0) ? (c != 0.0 ? 2 : 1) : 0);
145 if (twist && a == 0.0)
errorQuda(
"Twisted packing requires non-zero scale factor a");
146 if (twist) strcat(
aux, twist == 2 ?
",twist-doublet" :
",twist-singlet");
149 if (location &
Host) strcat(
aux,
",shmem");
154 switch ((
int)location) {
156 case Host |
Remote: strcat(
aux,
",host-remote");
break;
157 case Device: strcat(
aux,
",device-device");
break;
159 default:
errorQuda(
"Unknown pack target location %d\n", location);
165 double a,
double b,
double c) :
173 nParity(in.SiteSubset()),
182 for (
int i = 0; i < 4; i++) {
191 template <
typename T,
typename Arg>
198 void *args[] = {&arg};
206 if (in.
Nspin() == 4) {
208 Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
209 arg.swizzle = tp.
aux.x;
210 arg.sites_per_block = (arg.threads + tp.
grid.x - 1) / tp.
grid.x;
211 arg.blocks_per_dir = tp.
grid.x / (2 * arg.active_dims);
217 case 0:
launch(packKernel<true, 0, QUDA_4D_PC, Arg>, tp, arg, stream);
break;
218 case 1:
launch(packKernel<true, 1, QUDA_4D_PC, Arg>, tp, arg, stream);
break;
219 case 2:
launch(packKernel<true, 2, QUDA_4D_PC, Arg>, tp, arg, stream);
break;
223 case 0:
launch(packKernel<false, 0, QUDA_4D_PC, Arg>, tp, arg, stream);
break;
224 default:
errorQuda(
"Twisted packing only for dagger");
228 if (arg.twist)
errorQuda(
"Twist packing not defined");
230 launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
232 launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
242 launch(location &
Host ? packShmemKernel<true, 0, QUDA_4D_PC, Arg> : packKernel<true, 0, QUDA_4D_PC, Arg>,
246 launch(location &
Host ? packShmemKernel<true, 1, QUDA_4D_PC, Arg> : packKernel<true, 0, QUDA_4D_PC, Arg>,
250 launch(location &
Host ? packShmemKernel<true, 2, QUDA_4D_PC, Arg> : packKernel<true, 2, QUDA_4D_PC, Arg>,
257 launch(location &
Host ? packShmemKernel<false, 0, QUDA_4D_PC, Arg> : packKernel<false, 0, QUDA_4D_PC, Arg>,
260 default:
errorQuda(
"Twisted packing only for dagger");
264 if (arg.twist)
errorQuda(
"Twist packing not defined");
266 launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
268 launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
272 }
else if (in.
Nspin() == 1) {
274 Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
275 arg.swizzle = tp.
aux.x;
276 arg.sites_per_block = (arg.threads + tp.
grid.x - 1) / tp.
grid.x;
277 arg.blocks_per_dir = tp.
grid.x / (2 * arg.active_dims);
280 launch(packStaggeredKernel<Arg>, tp, arg, stream);
282 launch(location &
Host ? packStaggeredShmemKernel<Arg> : packStaggeredKernel<Arg>, tp, arg, stream);
333 size_t precision =
sizeof(Float);
336 faceBytes += 2 *
sizeof(float);
341 template <
typename Float,
int nColor>
343 bool spin_project,
double a,
double b,
double c,
const cudaStream_t &
stream)
346 Pack<Float, nColor, true> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
349 Pack<Float, nColor, false> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
355 template <
typename Float>
357 bool spin_project,
double a,
double b,
double c,
const cudaStream_t &
stream)
360 PackGhost<Float, 3>(
ghost,
in,
location,
nFace,
dagger,
parity, spin_project,
a,
b,
c,
stream);
368 bool dagger,
int parity,
bool spin_project,
double a,
double b,
double c,
const cudaStream_t &
stream)
371 for (
int d = 0; d < 4; d++) {
376 if (!nDimPack)
return;
379 PackGhost<double>(
ghost,
in,
location,
nFace,
dagger,
parity, spin_project,
a,
b,
c,
stream);
381 PackGhost<float>(
ghost,
in,
location,
nFace,
dagger,
parity, spin_project,
a,
b,
c,
stream);
383 PackGhost<short>(
ghost,
in,
location,
nFace,
dagger,
parity, spin_project,
a,
b,
c,
stream);
385 PackGhost<char>(
ghost,
in,
location,
nFace,
dagger,
parity, spin_project,
a,
b,
c,
stream);
const char * AuxString() const
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
unsigned int maxGridSize() const
const char * VolString() const
void defaultTuneParam(TuneParam ¶m) const
void apply(const cudaStream_t &stream)
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
int gridStep() const
gridStep sets the step size when iterating the grid size in advanceGridDim.
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
virtual unsigned int maxGridSize() const
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
virtual int gridStep() const
gridStep sets the step size when iterating the grid size in advanceGridDim.
bool tuneSharedBytes() const
QudaPCType PCType() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void PackGhost(void *ghost[2 *QUDA_MAX_DIM], const ColorSpinorField &field, MemoryLocation location, int nFace, bool dagger, int parity, bool spin_project, double a, double b, double c, const cudaStream_t &stream)
Dslash face packing routine.
int ghostFaceCB[QUDA_MAX_DIM+1]
void initTuneParam(TuneParam ¶m) const
const ColorSpinorField & in
const DslashConstant & getDslashConstant() const
Get the dslash_constant structure from this field.
unsigned int minThreads() const
virtual unsigned int minGridSize() const
static int commDim[QUDA_MAX_DIM]
cpuColorSpinorField * out
__device__ __host__ void pack(Arg &arg, int ghost_idx, int s, int parity)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void initTuneParam(TuneParam ¶m) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
Pack(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity, double a, double b, double c)
unsigned int minGridSize() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
int comm_peer2peer_enabled_global()
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
void defaultTuneParam(TuneParam ¶m) const
void setPackComms(const int *dim_pack)
Helper function that sets which dimensions the packing kernel should be packing for.