// dslash_pack2.cu (QUDA 1.0.0)
#include <color_spinor_field.h>

// STRIPED - spread the blocks throughout the workload to ensure we
// work on all directions/dimensions simultaneously to maximize NVLink saturation
#define STRIPED
// if not STRIPED then this means we assign one thread block per direction / dimension

#include <dslash_quda.h>
#include <kernels/dslash_pack.cuh>

namespace quda
{

  void setPackComms(const int *comm_dim)
  {
    for (int i = 0; i < 4; i++) commDim[i] = comm_dim[i];
    for (int i = 4; i < QUDA_MAX_DIM; i++) commDim[i] = 0;
  }
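
  // Example usage (a hedged sketch, not part of this file): callers are
  // expected to seed commDim with the partitioned communication dimensions
  // before any packing kernel is launched, e.g.
  //
  //   int comms[QUDA_MAX_DIM];
  //   for (int d = 0; d < 4; d++) comms[d] = comm_dim_partitioned(d);
  //   setPackComms(comms);
  //
  // where comm_dim_partitioned(d) returns 1 if dimension d is split across
  // ranks and 0 otherwise.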

  template <typename Float, int nSpin, int nColor, bool spin_project>
  std::ostream &operator<<(std::ostream &out, const PackArg<Float, nSpin, nColor, spin_project> &arg)
  {
    out << "parity = " << arg.parity << std::endl;
    out << "nParity = " << arg.nParity << std::endl;
    out << "pc_type = " << arg.pc_type << std::endl;
    out << "nFace = " << arg.nFace << std::endl;
    out << "dagger = " << arg.dagger << std::endl;
    out << "a = " << arg.a << std::endl;
    out << "b = " << arg.b << std::endl;
    out << "c = " << arg.c << std::endl;
    out << "twist = " << arg.twist << std::endl;
    out << "threads = " << arg.threads << std::endl;
    out << "threadDimMapLower = { ";
    for (int i = 0; i < 4; i++) out << arg.threadDimMapLower[i] << (i < 3 ? ", " : " }");
    out << std::endl;
    out << "threadDimMapUpper = { ";
    for (int i = 0; i < 4; i++) out << arg.threadDimMapUpper[i] << (i < 3 ? ", " : " }");
    out << std::endl;
    out << "sites_per_block = " << arg.sites_per_block << std::endl;
    return out;
  }

  // FIXME - add CPU variant

  template <typename Float, int nColor, bool spin_project> class Pack : TunableVectorYZ
  {

  protected:
    void **ghost;
    const ColorSpinorField &in;
    MemoryLocation location;
    const int nFace;
    const bool dagger; // only has meaning for nSpin=4
    const int parity;
    const int nParity;
    int threads;
    const double a;
    const double b;
    const double c;
    int twist; // only has meaning for nSpin=4

    bool tuneGridDim() const { return true; } // If striping, always tune grid dimension

    unsigned int maxGridSize() const
    {
      if (location & Host) {
#ifdef STRIPED
        // if zero-copy policy then set a maximum number of blocks to be
        // the 3 * number of dimensions we are communicating
        int max = 3;
#else
        // if zero-copy policy then assign exactly up to four thread blocks
        // per direction per dimension (effectively no grid-size tuning)
        int max = 2 * 4;
#endif
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return max * nDimComms;
      } else {
        return TunableVectorYZ::maxGridSize();
      }
    } // use no more than a quarter of the GPU

    unsigned int minGridSize() const
    {
      if (location & Host) {
#ifdef STRIPED
        // if zero-copy policy then set a minimum number of blocks to be
        // the 3 * number of dimensions we are communicating
        int min = 3;
#else
        // if zero-copy policy then assign exactly one thread block
        // per direction per dimension (effectively no grid-size tuning)
        int min = 2;
#endif
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return min * nDimComms;
      } else {
        return TunableVectorYZ::minGridSize();
      }
    }

    int gridStep() const
    {
#ifdef STRIPED
      return TunableVectorYZ::gridStep();
#else
      if (location & Host) {
        // the shmem kernel must ensure the grid size autotuner
        // increments in steps of 2 * the number of partitioned dimensions
        // for equal division of blocks to each direction/dimension
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return 2 * nDimComms;
      } else {
        return TunableVectorYZ::gridStep();
      }
#endif
    }

    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
    unsigned int minThreads() const { return threads; }

    void fillAux()
    {
      strcpy(aux, "policy_kernel,");
      strcat(aux, in.AuxString());
      char comm[5];
      for (int i = 0; i < 4; i++) comm[i] = (commDim[i] ? '1' : '0');
      comm[4] = '\0';
      strcat(aux, ",comm=");
      strcat(aux, comm);
      strcat(aux, comm_dim_topology_string());
      if (in.PCType() == QUDA_5D_PC) { strcat(aux, ",5D_pc"); }
      if (dagger && in.Nspin() == 4) { strcat(aux, ",dagger"); }
      if (getKernelPackT()) { strcat(aux, ",kernelPackT"); }
      switch (nFace) {
      case 1: strcat(aux, ",nFace=1"); break;
      case 3: strcat(aux, ",nFace=3"); break;
      default: errorQuda("Number of faces not supported");
      }

      twist = ((b != 0.0) ? (c != 0.0 ? 2 : 1) : 0);
      if (twist && a == 0.0) errorQuda("Twisted packing requires non-zero scale factor a");
      if (twist) strcat(aux, twist == 2 ? ",twist-doublet" : ",twist-singlet");

#ifndef STRIPED
      if (location & Host) strcat(aux, ",shmem");
#endif

      // label the locations we are packing to
      // location label is nonp2p-p2p
      switch ((int)location) {
      case Device | Remote: strcat(aux, ",device-remote"); break;
      case Host | Remote: strcat(aux, ",host-remote"); break;
      case Device: strcat(aux, ",device-device"); break;
      case Host: strcat(aux, comm_peer2peer_enabled_global() ? ",host-device" : ",host-host"); break;
      default: errorQuda("Unknown pack target location %d\n", location);
      }
    }
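
    // For illustration (a hedged example, not emitted by the code): with all
    // four dimensions partitioned, nFace=1, and a device-to-device pack, the
    // aux string built above might look like
    //   policy_kernel,<field-aux>,comm=1111,<topology>,nFace=1,device-device
    // where <field-aux> and <topology> stand in for in.AuxString() and
    // comm_dim_topology_string() respectively.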

  public:
    Pack(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
         double a, double b, double c) :
      TunableVectorYZ((in.Ndim() == 5 ? in.X(4) : 1), in.SiteSubset()),
      ghost(ghost),
      in(in),
      location(location),
      nFace(nFace),
      dagger(dagger),
      parity(parity),
      nParity(in.SiteSubset()),
      threads(0),
      a(a),
      b(b),
      c(c)
    {
      fillAux();

      // compute number of threads - really number of active work items we have to do
      for (int i = 0; i < 4; i++) {
        if (!commDim[i]) continue;
        if (i == 3 && !getKernelPackT()) continue;
        threads += 2 * nFace * in.getDslashConstant().ghostFaceCB[i]; // 2 for forwards and backwards faces
      }
    }
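
    // A hedged worked example (assuming a 32^4 local volume, all four
    // dimensions partitioned, nFace = 1, and kernel T-packing enabled): each
    // checkerboarded face has 32^3 / 2 = 16384 sites, so the loop above gives
    //   threads = 4 dims * 2 directions * 1 face * 16384 = 131072
    // work items for the packing kernel.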

    virtual ~Pack() {}

    template <typename T, typename Arg>
    inline void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
    {
      if (deviceProp.major >= 7) { // enable max shared memory mode on GPUs that support it
        this->setMaxDynamicSharedBytesPerBlock(f);
      }

      void *args[] = {&arg};
      qudaLaunchKernel((const void *)f, tp.grid, tp.block, args, tp.shared_bytes, stream);
    }

    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if (in.Nspin() == 4) {
        using Arg = PackArg<Float, 4, nColor, spin_project>;
        Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
        arg.swizzle = tp.aux.x;
        arg.sites_per_block = (arg.threads + tp.grid.x - 1) / tp.grid.x;
        arg.blocks_per_dir = tp.grid.x / (2 * arg.active_dims); // set number of blocks per direction

#ifdef STRIPED
        if (in.PCType() == QUDA_4D_PC) {
          if (arg.dagger) {
            switch (arg.twist) {
            case 0: launch(packKernel<true, 0, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            case 1: launch(packKernel<true, 1, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            case 2: launch(packKernel<true, 2, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            }
          } else {
            switch (arg.twist) {
            case 0: launch(packKernel<false, 0, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            default: errorQuda("Twisted packing only for dagger");
            }
          }
        } else if (arg.pc_type == QUDA_5D_PC) {
          if (arg.twist) errorQuda("Twist packing not defined");
          if (arg.dagger) {
            launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          } else {
            launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          }
        } else {
          errorQuda("Unexpected preconditioning type %d", in.PCType());
        }
#else
        if (in.PCType() == QUDA_4D_PC) {
          if (arg.dagger) {
            switch (arg.twist) {
            case 0:
              launch(location & Host ? packShmemKernel<true, 0, QUDA_4D_PC, Arg> : packKernel<true, 0, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            case 1:
              launch(location & Host ? packShmemKernel<true, 1, QUDA_4D_PC, Arg> : packKernel<true, 1, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            case 2:
              launch(location & Host ? packShmemKernel<true, 2, QUDA_4D_PC, Arg> : packKernel<true, 2, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            }
          } else {
            switch (arg.twist) {
            case 0:
              launch(location & Host ? packShmemKernel<false, 0, QUDA_4D_PC, Arg> : packKernel<false, 0, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            default: errorQuda("Twisted packing only for dagger");
            }
          }
        } else if (arg.pc_type == QUDA_5D_PC) {
          if (arg.twist) errorQuda("Twist packing not defined");
          if (arg.dagger) {
            launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          } else {
            launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          }
        }
#endif
      } else if (in.Nspin() == 1) {
        using Arg = PackArg<Float, 1, nColor, false>;
        Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
        arg.swizzle = tp.aux.x;
        arg.sites_per_block = (arg.threads + tp.grid.x - 1) / tp.grid.x;
        arg.blocks_per_dir = tp.grid.x / (2 * arg.active_dims); // set number of blocks per direction

#ifdef STRIPED
        launch(packStaggeredKernel<Arg>, tp, arg, stream);
#else
        launch(location & Host ? packStaggeredShmemKernel<Arg> : packStaggeredKernel<Arg>, tp, arg, stream);
#endif
      } else {
        errorQuda("Unsupported nSpin = %d\n", in.Nspin());
      }
    }

    bool tuneSharedBytes() const { return false; }

#if 0
    // not used at present, but if tuneSharedBytes is enabled then
    // this allows tuning up the full dynamic shared memory if needed
    unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
#endif

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::initTuneParam(param);
      // if doing a zero-copy policy then ensure that each thread block
      // runs exclusively on a given SM - this is to ensure quality of
      // service for the packing kernel when running concurrently.
      if (location & Host) param.shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1;
#ifndef STRIPED
      if (location & Host) param.grid.x = minGridSize();
#endif
    }
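
    // Why shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1: requesting
    // just over half of the maximum dynamic shared memory means two such
    // blocks cannot co-reside on one SM. A hedged example: if the device
    // exposes 96 KiB of dynamic shared memory per SM, each block requests
    // 48 KiB + 1 byte, limiting occupancy to one block per SM.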

    void defaultTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::defaultTuneParam(param);
      // if doing a zero-copy policy then ensure that each thread block
      // runs exclusively on a given SM - this is to ensure quality of
      // service for the packing kernel when running concurrently.
      if (location & Host) param.shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1;
#ifndef STRIPED
      if (location & Host) param.grid.x = minGridSize();
#endif
    }

    TuneKey tuneKey() const { return TuneKey(in.VolString(), typeid(*this).name(), aux); }

    int tuningIter() const { return 3; }

    long long flops() const
    {
      // unless we are spin projecting (nSpin = 4), there are no flops to do
      return in.Nspin() == 4 ? 2 * in.Nspin() / 2 * nColor * nParity * in.getDslashConstant().Ls * threads : 0;
    }

    long long bytes() const
    {
      size_t precision = sizeof(Float);
      size_t faceBytes = 2 * ((in.Nspin() == 4 ? in.Nspin() / 2 : in.Nspin()) + in.Nspin()) * nColor * precision;
      // comparing sizeof(Float) against QudaPrecision works because the enum
      // values are defined to equal the precision's size in bytes
      if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION)
        faceBytes += 2 * sizeof(float); // 2 is from input and output
      return faceBytes * nParity * in.getDslashConstant().Ls * threads;
    }
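
    // A hedged worked example for bytes(): nSpin = 4, nColor = 3, single
    // precision (4 bytes). The kernel reads a full 4-spin spinor and writes
    // a spin-projected 2-spin spinor, so per work item
    //   faceBytes = 2 (complex) * (2 + 4) * 3 * 4 = 144 bytes,
    // which is then multiplied by nParity * Ls * threads.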
  };

  template <typename Float, int nColor>
  void PackGhost(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
                 bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    if (spin_project) {
      Pack<Float, nColor, true> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
      pack.apply(stream);
    } else {
      Pack<Float, nColor, false> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
      pack.apply(stream);
    }
  }

  // template on the number of colors
  template <typename Float>
  void PackGhost(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
                 bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    if (in.Ncolor() == 3) {
      PackGhost<Float, 3>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else {
      errorQuda("Unsupported number of colors %d\n", in.Ncolor());
    }
  }

  // Pack the ghost for the Dslash operator
  void PackGhost(void *ghost[2 * QUDA_MAX_DIM], const ColorSpinorField &in, MemoryLocation location, int nFace,
                 bool dagger, int parity, bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    int nDimPack = 0;
    for (int d = 0; d < 4; d++) {
      if (!commDim[d]) continue;
      if (d != 3 || getKernelPackT()) nDimPack++;
    }

    if (!nDimPack) return; // if zero then we have nothing to pack

    if (in.Precision() == QUDA_DOUBLE_PRECISION) {
      PackGhost<double>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
      PackGhost<float>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_HALF_PRECISION) {
      PackGhost<short>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
      PackGhost<char>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else {
      errorQuda("Unsupported precision %d\n", in.Precision());
    }
  }

} // namespace quda
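
// Example call site (a hedged sketch, not part of this file): the dslash
// policy code is expected to invoke the packing routine along the lines of
//
//   void *packBuffer[2 * QUDA_MAX_DIM] = { /* per-direction ghost buffers */ };
//   quda::PackGhost(packBuffer, in, quda::Device, 1 /*nFace*/, dagger, parity,
//                   true /*spin_project*/, 1.0, 0.0, 0.0, stream);
//
// The MemoryLocation value, the coefficients a/b/c, and the buffer setup here
// are illustrative assumptions; consult dslash_quda.h for the real interface.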