dslash_policy.cuh (QUDA 1.0.0)
1 #include <tune_quda.h>
2 
3 namespace quda
4 {
5 
6  namespace dslash
7  {
8 
9  extern int it;
10 
11  extern cudaEvent_t packEnd[2]; // double buffered
12  extern cudaEvent_t gatherStart[Nstream];
13  extern cudaEvent_t gatherEnd[Nstream];
14  extern cudaEvent_t scatterStart[Nstream];
15  extern cudaEvent_t scatterEnd[Nstream];
16  extern cudaEvent_t dslashStart[2]; // double buffered
17 
18  // FIX this is a hack from hell
19  // Auxiliary work that can be done while waiting on comms to finish
20  extern Worker *aux_worker;
21 
22 #if CUDA_VERSION >= 8000
23  extern cuuint32_t *commsEnd_h;
24  extern CUdeviceptr commsEnd_d[Nstream];
25 #endif
26 
27  // these variables are used for benchmarking the dslash components in isolation
28  extern bool dslash_pack_compute;
29  extern bool dslash_interior_compute;
30  extern bool dslash_exterior_compute;
31  extern bool dslash_comms;
32  extern bool dslash_copy;
33 
 35 
 36  struct DslashCommsPattern {
 37  int gatherCompleted[Nstream];
 38  int previousDir[Nstream];
 39  int commsCompleted[Nstream];
 40  int dslashCompleted[Nstream];
 41  int commDimTotal;
 42  int completeSum;
 46 
47  inline DslashCommsPattern(const int commDim[], bool gdr_send = false) :
48  commsCompleted {},
49  dslashCompleted {},
50  completeSum(0)
51  {
52 
53  for (int i = 0; i < Nstream - 1; i++) gatherCompleted[i] = gdr_send ? 1 : 0;
54  gatherCompleted[Nstream - 1] = 1;
55  commsCompleted[Nstream - 1] = 1;
56  dslashCompleted[Nstream - 1] = 1;
57 
58  // We need to know which was the previous direction in which
59  // communication was issued, since we only query a given event /
 60  // comms call after the previous one has successfully
61  // completed.
62  for (int i = 3; i >= 0; i--) {
63  if (commDim[i]) {
64  int prev = Nstream - 1;
65  for (int j = 3; j > i; j--)
66  if (commDim[j]) prev = 2 * j;
67  previousDir[2 * i + 1] = prev;
68  previousDir[2 * i + 0] = 2 * i + 1; // always valid
69  }
70  }
71 
 72  // this tells us how many events / comms occurrences there are in
73  // total. Used for exiting the while loop
74  commDimTotal = 0;
75  for (int i = 3; i >= 0; i--) { commDimTotal += commDim[i]; }
76  commDimTotal *= gdr_send ? 2 : 4; // 2 from pipe length, 2 from direction
77  }
78  };
79 
80  template <typename Arg, typename Dslash>
81  inline void setFusedParam(Arg &param, Dslash &dslash, const int *faceVolumeCB)
82  {
83  int prev = -1;
84 
85  param.threads = 0;
86  for (int i = 0; i < 4; ++i) {
87  param.threadDimMapLower[i] = 0;
88  param.threadDimMapUpper[i] = 0;
89  if (!dslash.dslashParam.commDim[i]) continue;
90  param.threadDimMapLower[i] = (prev >= 0 ? param.threadDimMapUpper[prev] : 0);
91  param.threadDimMapUpper[i] = param.threadDimMapLower[i] + dslash.Nface() * faceVolumeCB[i];
92  param.threads = param.threadDimMapUpper[i];
93  prev = i;
94  }
95 
96  param.kernel_type = EXTERIOR_KERNEL_ALL;
97  }
98 
99 #undef DSLASH_PROFILE
100 #ifdef DSLASH_PROFILE
101 #define PROFILE(f, profile, idx) \
102  profile.TPSTART(idx); \
103  f; \
104  profile.TPSTOP(idx);
105 #else
106 #define PROFILE(f, profile, idx) f;
107 #endif
108 
109 
117  template <typename Dslash>
118  inline void issueRecv(cudaColorSpinorField &input, const Dslash &dslash, cudaStream_t *stream, bool gdr)
119  {
120  for(int i=3; i>=0; i--){
121  if (!dslash.dslashParam.commDim[i]) continue;
122  for(int dir=1; dir>=0; dir--) {
123  PROFILE(if (dslash_comms) input.recvStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), stream, gdr), profile, QUDA_PROFILE_COMMS_START);
124  }
125  }
126  }
127 
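 129  /**
 130     @brief Pack the halo faces of the field on the stream given by
 131     packIndex. Depending on the MemoryLocation each face is packed
 132     directly into a peer's memory (Remote), into pinned host memory
 133     (Host), or into local device memory (Device). The packEnd event
 134     is recorded unless the requested location is Host.
 135   */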
138  template <typename Dslash>
139  inline void issuePack(cudaColorSpinorField &in, const Dslash &dslash, int parity, MemoryLocation location, int packIndex)
140  {
141 
142  auto &arg = dslash.dslashParam;
 143  if ((location & Device) && (location & Host)) errorQuda("MemoryLocation cannot be both Device and Host");
144 
145  bool pack = false;
146  for (int i=3; i>=0; i--)
147  if (arg.commDim[i] && (i != 3 || getKernelPackT())) {
148  pack = true;
149  break;
150  }
151 
152  MemoryLocation pack_dest[2*QUDA_MAX_DIM];
153  for (int dim=0; dim<4; dim++) {
154  for (int dir=0; dir<2; dir++) {
155  if ( (location & Remote) && comm_peer2peer_enabled(dir,dim) ) {
156  pack_dest[2*dim+dir] = Remote; // pack to p2p remote
157  } else if ( location & Host && !comm_peer2peer_enabled(dir,dim) ) {
158  pack_dest[2*dim+dir] = Host; // pack to cpu memory
159  } else {
160  pack_dest[2*dim+dir] = Device; // pack to local gpu memory
161  }
162  }
163  }
164  if (pack) {
165  PROFILE(if (dslash_pack_compute) in.pack(dslash.Nface() / 2, parity, dslash.Dagger(), packIndex, pack_dest,
166  location, arg.spin_project, arg.twist_a, arg.twist_b, arg.twist_c),
167  profile, QUDA_PROFILE_PACK_KERNEL);
168 
169  // Record the end of the packing
170  PROFILE(if (location != Host) qudaEventRecord(packEnd[in.bufferIndex], streams[packIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
171  }
172  }
173 
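 174  /**
 175     @brief Initiate the device-to-host gather of the packed halo faces
 176     in each communication stream, after waiting on the relevant pack /
 177     dslash-start event; gatherEnd is recorded for directions that are
 178     not using peer-to-peer.
 179   */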
180  template <typename Dslash> inline void issueGather(cudaColorSpinorField &in, const Dslash &dslash)
181  {
182 
183  for (int i = 3; i >=0; i--) {
184  if (!dslash.dslashParam.commDim[i]) continue;
185 
186  for (int dir=1; dir>=0; dir--) { // forwards gather
187  cudaEvent_t &event = (i!=3 || getKernelPackT()) ? packEnd[in.bufferIndex] : dslashStart[in.bufferIndex];
188 
189  PROFILE(qudaStreamWaitEvent(streams[2*i+dir], event, 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
190 
191  // Initialize host transfer from source spinor
192  PROFILE(if (dslash_copy) in.gather(dslash.Nface()/2, dslash.Dagger(), 2*i+dir), profile, QUDA_PROFILE_GATHER);
193 
194  // Record the end of the gathering if not peer-to-peer
195  if (!comm_peer2peer_enabled(dir,i)) {
196  PROFILE(qudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]), profile, QUDA_PROFILE_EVENT_RECORD);
197  }
198  }
199  }
200  }
201 
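 203  /**
 204     @brief Return the index of a stream that is not being used for
 205     peer-to-peer communication, for use as an independent packing /
 206     scattering stream. Falls back to stream 0 if all partitioned
 207     directions are peer-to-peer connected.
 208   */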
212  template <typename T>
213  inline int getStreamIndex(const T &dslashParam) {
214  // set index to a stream index not being used for p2p
215  int index = -1;
216  for (int i = 3; i >=0; i--) {
217  if (!dslashParam.commDim[i]) continue;
218  if (!comm_peer2peer_enabled(0,i)) index = 2*i+0;
219  else if (!comm_peer2peer_enabled(1,i)) index = 2*i+1;
220  }
221  // make sure we pick a valid index, in case we are fully p2p connected
222  if (index == -1) index = 0;
223  return index;
224  }
225 
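 227  /**
 228     @brief Query whether the communication in the given dimension and
 229     direction has completed. If it has, and the receive was staged
 230     through host memory (neither peer-to-peer, GDR nor zero-copy), the
 231     host-to-device scatter is issued, or triggered via a stream memory
 232     operation for the async policies. Returns the result of the
 233     communications query.
 234   */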
252  template <typename Dslash>
253  inline bool commsComplete(cudaColorSpinorField &in, const Dslash &dslash, int dim, int dir, bool gdr_send,
254  bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex = -1)
255  {
256 
257  cudaStream_t *stream = nullptr;
258 
259  PROFILE(int comms_test = dslash_comms ? in.commsQuery(dslash.Nface()/2, 2*dim+dir, dslash.Dagger(), stream, gdr_send, gdr_recv) : 1, profile, QUDA_PROFILE_COMMS_QUERY);
260  if (comms_test) {
261  // now we are receive centric
262  int dir2 = 1-dir;
263 
264  // if peer-2-peer in a given direction then we need to insert a wait on that copy event
265  if (comm_peer2peer_enabled(dir2,dim)) {
 266  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], in.getIPCRemoteCopyEvent(dir2, dim), 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT); // wait on the remote-copy event for this receive
 267  } else {
268 
269  if (!gdr_recv && !zero_copy_recv) { // Issue CPU->GPU copy if not GDR
270 
271  if (async) {
272 #if (CUDA_VERSION >= 8000) && 0
273  // this will trigger the copy asynchronously
274  *((volatile cuuint32_t*)(commsEnd_h+2*dim+dir2)) = 1;
275 #else
276  errorQuda("Async dslash policy variants require CUDA 8.0 and above");
277 #endif
278  } else {
279  // note the ColorSpinorField::scatter transforms from
280  // scatter centric to gather centric (e.g., flips
281  // direction) so here just use dir not dir2
282  if (scatterIndex == -1) scatterIndex = 2*dim+dir;
283  PROFILE(if (dslash_copy) in.scatter(dslash.Nface()/2, dslash.Dagger(), 2*dim+dir, streams+scatterIndex), profile, QUDA_PROFILE_SCATTER);
284  }
285 
286  }
287 
288  }
289 
290  }
291  return comms_test;
292  }
293 
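 295  /**
 296     @brief Ensure that any peer-to-peer sends have completed before
 297     subsequent work is issued on the compute stream, by waiting on the
 298     copy event of each peer-to-peer direction.
 299   */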
303  template <typename T>
304  inline void completeDslash(const ColorSpinorField &in, const T&dslashParam) {
305  // this ensures that the p2p sending is completed before any
306  // subsequent work is done on the compute stream
307  for (int dim=3; dim>=0; dim--) {
308  if (!dslashParam.commDim[dim]) continue;
309  for (int dir=0; dir<2; dir++) {
310  if (comm_peer2peer_enabled(dir,dim)) {
 311  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], in.getIPCCopyEvent(dir, dim), 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
 312  }
313  }
314  }
315  }
316 
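 318  /**
 319     @brief Switch the ghost buffers of the field to (or back from) the
 320     host-mapped zero-copy buffers, and update the kernel tuning aux
 321     string so that zero-copy and regular variants are tuned separately.
 322   */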
328  template <typename Dslash> inline void setMappedGhost(Dslash &dslash, ColorSpinorField &in, bool to_mapped)
329  {
330 
331  static char aux_copy[TuneKey::aux_n];
332  static bool set_mapped = false;
333 
334  if (to_mapped) {
335  if (set_mapped) errorQuda("set_mapped already set");
336  // in the below we switch to the mapped ghost buffer and update the tuneKey to reflect this
337  in.bufferIndex += 2;
338  strcpy(aux_copy,dslash.getAux(dslash.dslashParam.kernel_type));
 339  if (comm_peer2peer_enabled_global())
 340  dslash.augmentAux(dslash.dslashParam.kernel_type, ",zero_copy,p2p=1");
341  else
342  dslash.augmentAux(dslash.dslashParam.kernel_type, ",zero_copy,p2p=0");
343  set_mapped = true;
344  } else {
345  if (!set_mapped) errorQuda("set_mapped not set");
346  // reset to default
347  dslash.setAux(dslash.dslashParam.kernel_type, aux_copy);
348  in.bufferIndex -= 2;
349  set_mapped = false;
350  }
351  }
352 
353  template <typename Dslash> struct DslashPolicyImp {
354 
355  virtual void operator()(
356  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
357  = 0;
358 
359  virtual ~DslashPolicyImp(){}
360  };
361 
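 362  /**
 363     @brief Standard dslash policy with host staging for both sends and receives
 364   */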
365  template <typename Dslash> struct DslashBasic : DslashPolicyImp<Dslash> {
366 
 367  void operator()(
 368  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
369  {
370 
371  profile.TPSTART(QUDA_PROFILE_TOTAL);
372 
373  auto &dslashParam = dslash.dslashParam;
374  dslashParam.kernel_type = INTERIOR_KERNEL;
375  dslashParam.threads = volume;
376 
377  // Record the start of the dslash if doing communication in T and not kernel packing
378  if (dslashParam.commDim[3] && !getKernelPackT()) {
379  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
380  }
381 
382  issueRecv(*in, dslash, 0, false); // Prepost receives
383 
384  const int packIndex = Nstream - 1;
385  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
386  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
387  packIndex);
388 
389  issueGather(*in, dslash);
390 
391  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
392  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
393 
394  DslashCommsPattern pattern(dslashParam.commDim);
395  while (pattern.completeSum < pattern.commDimTotal) {
396  for (int i = 3; i >= 0; i--) {
397  if (!dslashParam.commDim[i]) continue;
398 
399  for (int dir = 1; dir >= 0; dir--) {
400  // Query if gather has completed
401  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
402 
403  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
404  if (event_test != cudaSuccess)
405  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
406 
407  if (cudaSuccess == event_test) {
408  pattern.gatherCompleted[2 * i + dir] = 1;
409  pattern.completeSum++;
410  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
411  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
412  profile, QUDA_PROFILE_COMMS_START);
413  }
414  }
415 
416  // Query if comms has finished
417  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
418  if (commsComplete(*in, dslash, i, dir, false, false, false, false)) {
419  pattern.commsCompleted[2 * i + dir] = 1;
420  pattern.completeSum++;
421  }
422  }
423 
424  } // dir=0,1
425 
426  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
427  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
428 
429  for (int dir = 1; dir >= 0; dir--) {
 430  if (!comm_peer2peer_enabled(
 431  1 - dir, i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
432  // Record the end of the scattering
433  PROFILE(
434  qudaEventRecord(scatterEnd[2 * i + dir], streams[2 * i + dir]), profile, QUDA_PROFILE_EVENT_RECORD);
435  // wait for scattering to finish and then launch dslash
436  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[2 * i + dir], 0), profile,
 437  QUDA_PROFILE_STREAM_WAIT_EVENT);
 438  }
439  }
440 
441  dslashParam.kernel_type = static_cast<KernelType>(i);
442  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
443 
444  // all faces use this stream
445  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
446 
447  pattern.dslashCompleted[2 * i] = 1;
448  }
449  }
450  }
451 
452  completeDslash(*in, dslashParam);
453  in->bufferIndex = (1 - in->bufferIndex);
454  profile.TPSTOP(QUDA_PROFILE_TOTAL);
455  }
456  };
457 
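 458  /**
 459     @brief Host-staged sends and receives, with the exterior halo-update kernels fused into a single launch
 460   */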
461  template <typename Dslash> struct DslashFusedExterior : DslashPolicyImp<Dslash> {
462 
 463  void operator()(
 464  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
465  {
466 
467  profile.TPSTART(QUDA_PROFILE_TOTAL);
468 
469  auto &dslashParam = dslash.dslashParam;
470  dslashParam.kernel_type = INTERIOR_KERNEL;
471  dslashParam.threads = volume;
472 
473  // Record the start of the dslash if doing communication in T and not kernel packing
474  if (dslashParam.commDim[3] && !getKernelPackT()) {
475  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
476  }
477 
478  issueRecv(*in, dslash, 0, false); // Prepost receives
479 
480  const int packIndex = Nstream - 1;
481  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
482  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
483  packIndex);
484 
485  issueGather(*in, dslash);
486 
487  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
488  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
489 
490  const int scatterIndex = getStreamIndex(dslashParam);
491  DslashCommsPattern pattern(dslashParam.commDim);
492  while (pattern.completeSum < pattern.commDimTotal) {
493  for (int i = 3; i >= 0; i--) {
494  if (!dslashParam.commDim[i]) continue;
495 
496  for (int dir = 1; dir >= 0; dir--) {
497  // Query if gather has completed
498  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
499  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
500  if (event_test != cudaSuccess)
501  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
502 
503  if (cudaSuccess == event_test) {
504  pattern.gatherCompleted[2 * i + dir] = 1;
505  pattern.completeSum++;
506  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
507  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
508  profile, QUDA_PROFILE_COMMS_START);
509  }
510  }
511 
512  // Query if comms has finished
513  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
514  if (commsComplete(*in, dslash, i, dir, false, false, false, false, scatterIndex)) {
515  pattern.commsCompleted[2 * i + dir] = 1;
516  pattern.completeSum++;
517  }
518  }
519  } // dir=0,1
520  } // i
521  } // while(pattern.completeSum < commDimTotal)
522 
523  for (int i = 3; i >= 0; i--) {
524  if (dslashParam.commDim[i]
525  && (!comm_peer2peer_enabled(0, i)
 526  || !comm_peer2peer_enabled(
 527  1, i))) { // if not peer-to-peer we post an event in the scatter stream and wait on that
528  PROFILE(qudaEventRecord(scatterEnd[0], streams[scatterIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
529  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[0], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
530  break;
531  }
532  }
533 
534  // Launch exterior kernel
535  if (pattern.commDimTotal) {
536  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
537  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
538  }
539 
540  completeDslash(*in, dslashParam);
541  in->bufferIndex = (1 - in->bufferIndex);
542  profile.TPSTOP(QUDA_PROFILE_TOTAL);
543  }
544  };
545 
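 546  /**
 547     @brief Dslash policy using GPU Direct RDMA for both sends and receives
 548   */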
549  template <typename Dslash> struct DslashGDR : DslashPolicyImp<Dslash> {
550 
 551  void operator()(
 552  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
553  {
554 
555  profile.TPSTART(QUDA_PROFILE_TOTAL);
556 
557  auto &dslashParam = dslash.dslashParam;
558  dslashParam.kernel_type = INTERIOR_KERNEL;
559  dslashParam.threads = volume;
560 
561  issueRecv(*in, dslash, 0, true); // Prepost receives
562 
563  const int packIndex = Nstream - 1;
564  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
565  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
566  packIndex);
567 
568  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
569  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
570 
571  bool pack_event = false;
572  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
573  for (int i = 3; i >= 0; i--) {
574  if (!dslashParam.commDim[i]) continue;
575 
576  if (!pack_event) {
577  cudaEventSynchronize(packEnd[in->bufferIndex]);
578  pack_event = true;
579  }
580 
581  for (int dir = 1; dir >= 0; dir--) {
582  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
583  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
584  dslashParam.remote_write ? streams + packIndex : nullptr, true, dslashParam.remote_write),
585  profile, QUDA_PROFILE_COMMS_START);
586  } // is p2p?
587  } // dir
588  } // i
589  } // p2p
590 
591  DslashCommsPattern pattern(dslashParam.commDim, true);
592  while (pattern.completeSum < pattern.commDimTotal) {
593  for (int i = 3; i >= 0; i--) {
594  if (!dslashParam.commDim[i]) continue;
595 
596  for (int dir = 1; dir >= 0; dir--) {
597 
598  // Query if comms has finished
599  if (!pattern.commsCompleted[2 * i + dir]) {
600  if (commsComplete(*in, dslash, i, dir, true, true, false, false)) {
602  pattern.commsCompleted[2 * i + dir] = 1;
603  pattern.completeSum++;
604  }
605  }
606 
607  } // dir=0,1
608 
609  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
610  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
611  dslashParam.kernel_type = static_cast<KernelType>(i);
612  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
613 
614  // all faces use this stream
615  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
616 
617  pattern.dslashCompleted[2 * i] = 1;
618  }
619  }
620  }
621 
622  completeDslash(*in, dslashParam);
623  in->bufferIndex = (1 - in->bufferIndex);
624  profile.TPSTOP(QUDA_PROFILE_TOTAL);
625  }
626  };
627 
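 628  /**
 629     @brief GPU Direct RDMA for sends and receives, with a fused exterior halo-update kernel
 630   */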
631  template <typename Dslash> struct DslashFusedGDR : DslashPolicyImp<Dslash> {
632 
 633  void operator()(
 634  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
635  {
636 
637  profile.TPSTART(QUDA_PROFILE_TOTAL);
638 
639  auto &dslashParam = dslash.dslashParam;
640  dslashParam.kernel_type = INTERIOR_KERNEL;
641  dslashParam.threads = volume;
642 
643  issueRecv(*in, dslash, 0, true); // Prepost receives
644 
645  const int packIndex = Nstream - 1;
646  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
647  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
648  packIndex);
649 
650  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
651  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
652 
653  bool pack_event = false;
654  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
655  for (int i = 3; i >= 0; i--) {
656  if (!dslashParam.commDim[i]) continue;
657 
658  if (!pack_event) {
659  cudaEventSynchronize(packEnd[in->bufferIndex]);
660  pack_event = true;
661  }
662 
663  for (int dir = 1; dir >= 0; dir--) {
664  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
665  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
666  dslashParam.remote_write ? streams + packIndex : nullptr, true, dslashParam.remote_write),
667  profile, QUDA_PROFILE_COMMS_START);
668  } // is p2p?
669  }
670  }
671  } // p2p
672 
673  DslashCommsPattern pattern(dslashParam.commDim, true);
674  while (pattern.completeSum < pattern.commDimTotal) {
675  for (int i = 3; i >= 0; i--) {
676  if (!dslashParam.commDim[i]) continue;
677 
678  for (int dir = 1; dir >= 0; dir--) {
679 
680  // Query if comms has finished
681  if (!pattern.commsCompleted[2 * i + dir]) {
682  if (commsComplete(*in, dslash, i, dir, true, true, false, false)) {
683  pattern.commsCompleted[2 * i + dir] = 1;
684  pattern.completeSum++;
685  }
686  }
687  } // dir=0,1
688  } // i
 689  } // pattern.completeSum < pattern.commDimTotal
690 
691  // Launch exterior kernel
692  if (pattern.commDimTotal) {
693  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
694  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
695  }
696 
697  completeDslash(*in, dslashParam);
698  in->bufferIndex = (1 - in->bufferIndex);
699  profile.TPSTOP(QUDA_PROFILE_TOTAL);
700  }
701  };
702 
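 703  /**
 704     @brief Host-staged sends with GPU Direct RDMA receives
 705   */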
706  template <typename Dslash> struct DslashGDRRecv : DslashPolicyImp<Dslash> {
707 
 708  void operator()(
 709  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
710  {
711 
712  profile.TPSTART(QUDA_PROFILE_TOTAL);
713 
714  auto &dslashParam = dslash.dslashParam;
715  dslashParam.kernel_type = INTERIOR_KERNEL;
716  dslashParam.threads = volume;
717 
718  // Record the start of the dslash if doing communication in T and not kernel packing
719  if (dslashParam.commDim[3] && !getKernelPackT()) {
720  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
721  }
722 
723  issueRecv(*in, dslash, 0, true); // Prepost receives
724 
725  const int packIndex = Nstream - 1;
726  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
727  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
728  packIndex);
729 
730  issueGather(*in, dslash);
731 
732  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
733  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
734 
735  DslashCommsPattern pattern(dslashParam.commDim);
736  while (pattern.completeSum < pattern.commDimTotal) {
737  for (int i = 3; i >= 0; i--) {
738  if (!dslashParam.commDim[i]) continue;
739 
740  for (int dir = 1; dir >= 0; dir--) {
741  // Query if gather has completed
742  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
743  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
744  if (event_test != cudaSuccess)
745  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
746 
747  if (cudaSuccess == event_test) {
748  pattern.gatherCompleted[2 * i + dir] = 1;
749  pattern.completeSum++;
750  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
751  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
752  profile, QUDA_PROFILE_COMMS_START);
753  }
754  }
755 
756  // Query if comms has finished
757  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
758  if (commsComplete(*in, dslash, i, dir, false, true, false, false)) {
759  pattern.commsCompleted[2 * i + dir] = 1;
760  pattern.completeSum++;
761  }
762  }
763 
764  } // dir=0,1
765 
766  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
767  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
768  dslashParam.kernel_type = static_cast<KernelType>(i);
769  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
770 
771  // all faces use this stream
772  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
773 
774  pattern.dslashCompleted[2 * i] = 1;
775  }
776  }
777  }
778 
779  completeDslash(*in, dslashParam);
780  in->bufferIndex = (1 - in->bufferIndex);
781  profile.TPSTOP(QUDA_PROFILE_TOTAL);
782  }
783  };
784 
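 785  /**
 786     @brief Host-staged sends with GPU Direct RDMA receives, and a fused exterior halo-update kernel
 787   */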
788  template <typename Dslash> struct DslashFusedGDRRecv : DslashPolicyImp<Dslash> {
789 
 790  void operator()(
 791  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
792  {
793 
794  profile.TPSTART(QUDA_PROFILE_TOTAL);
795 
796  auto &dslashParam = dslash.dslashParam;
797  dslashParam.kernel_type = INTERIOR_KERNEL;
798  dslashParam.threads = volume;
799 
800  // Record the start of the dslash if doing communication in T and not kernel packing
801  if (dslashParam.commDim[3] && !getKernelPackT()) {
802  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
803  }
804 
805  issueRecv(*in, dslash, 0, true); // Prepost receives
806 
807  const int packIndex = Nstream - 1;
808  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
809  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
810  packIndex);
811 
812  issueGather(*in, dslash);
813 
814  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
815  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
816 
817  DslashCommsPattern pattern(dslashParam.commDim);
818  while (pattern.completeSum < pattern.commDimTotal) {
819  for (int i = 3; i >= 0; i--) {
820  if (!dslashParam.commDim[i]) continue;
821 
822  for (int dir = 1; dir >= 0; dir--) {
823  // Query if gather has completed
824  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
825  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
826  if (event_test != cudaSuccess)
827  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
828 
829  if (cudaSuccess == event_test) {
830  pattern.gatherCompleted[2 * i + dir] = 1;
831  pattern.completeSum++;
832  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
833  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
834  profile, QUDA_PROFILE_COMMS_START);
835  }
836  }
837 
838  // Query if comms has finished
839  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
840  if (commsComplete(*in, dslash, i, dir, false, true, false, false)) {
841  pattern.commsCompleted[2 * i + dir] = 1;
842  pattern.completeSum++;
843  }
844  }
845  } // dir=0,1
846  } // i
847  } // while(pattern.completeSum < commDimTotal)
848 
849  // Launch exterior kernel
850  if (pattern.commDimTotal) {
851  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
852  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
853  }
854 
855  completeDslash(*in, dslashParam);
856  in->bufferIndex = (1 - in->bufferIndex);
857  profile.TPSTOP(QUDA_PROFILE_TOTAL);
858  }
859  };
860 
861 #ifdef HOST_DEBUG
862 #define CUDA_CALL( call ) \
863  { \
864  CUresult cudaStatus = call; \
865  if ( CUDA_SUCCESS != cudaStatus ) { \
866  const char *err_str = nullptr; \
867  cuGetErrorString(cudaStatus, &err_str); \
868  fprintf(stderr, "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", #call, __LINE__, __FILE__, err_str, cudaStatus); \
869  } \
870 }
871 #else
872 #define CUDA_CALL( call ) call
873 #endif
874 
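 875  /**
 876     @brief Experimental host-staged policy that uses stream memory operations (cuStreamWaitValue32) to trigger the scatter copies
 877   */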
878  template <typename Dslash> struct DslashAsync : DslashPolicyImp<Dslash> {
879 
880 #if (CUDA_VERSION >= 8000) && 0
881 
882  void operator()(
883  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
884  {
885 
886  profile.TPSTART(QUDA_PROFILE_TOTAL);
887 
888  auto &dslashParam = dslash.dslashParam;
889  dslashParam.kernel_type = INTERIOR_KERNEL;
890  dslashParam.threads = volume;
891 
892  // Record the start of the dslash if doing communication in T and not kernel packing
893  if (dslashParam.commDim[3] && !getKernelPackT()) {
894  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
895  }
896 
897  issueRecv(*in, dslash, 0, false); // Prepost receives
898 
899  const int packIndex = Nstream - 1;
900  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
901  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
902  packIndex);
903 
904  issueGather(*in, dslash);
905 
906  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
907  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
908 
909  DslashCommsPattern pattern(dslashParam.commDim);
910  while (pattern.completeSum < pattern.commDimTotal) {
911  for (int i = 3; i >= 0; i--) {
912  if (!dslashParam.commDim[i]) continue;
913 
914  for (int dir = 1; dir >= 0; dir--) {
915  // Query if gather has completed
916  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
917  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
918  if (event_test != cudaSuccess)
919  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
920 
921  if (cudaSuccess == event_test) {
922  pattern.gatherCompleted[2 * i + dir] = 1;
923  pattern.completeSum++;
924  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
925  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
926  profile, QUDA_PROFILE_COMMS_START);
927 
928  // schedule post comms work (scatter into the end zone)
929  if (!comm_peer2peer_enabled(1 - dir, i)) {
930  *((volatile cuuint32_t *)(commsEnd_h + 2 * i + 1 - dir)) = 0;
931  CUDA_CALL(cuStreamWaitValue32(
932  streams[2 * i + dir], commsEnd_d[2 * i + 1 - dir], 1, CU_STREAM_WAIT_VALUE_EQ));
933  PROFILE(if (dslash_copy)
934  in->scatter(dslash.Nface() / 2, dslash.Dagger(), 2 * i + dir, &streams[2 * i + dir]),
935  profile, QUDA_PROFILE_SCATTER);
936  }
937  }
938  }
939 
940  // Query if comms has finished
941  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
942  if (commsComplete(*in, dslash, i, dir, false, false, false, true)) {
943  pattern.commsCompleted[2 * i + dir] = 1;
944  pattern.completeSum++;
945  }
946  }
947 
948  } // dir=0,1
949 
950  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
951  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
952 
953  for (int dir = 1; dir >= 0; dir--) {
 954  if (!comm_peer2peer_enabled(
 955  1 - dir, i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
956  // Record the end of the scattering
957  PROFILE(
958  qudaEventRecord(scatterEnd[2 * i + dir], streams[2 * i + dir]), profile, QUDA_PROFILE_EVENT_RECORD);
959  // wait for scattering to finish and then launch dslash
960  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[2 * i + dir], 0), profile,
 961  QUDA_PROFILE_STREAM_WAIT_EVENT);
 962  }
963  }
964 
965  dslashParam.kernel_type = static_cast<KernelType>(i);
966  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
967 
968  // all faces use this stream
969  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
970 
971  pattern.dslashCompleted[2 * i] = 1;
972  }
973  }
974  }
975 
976  completeDslash(*in, dslashParam);
977  in->bufferIndex = (1 - in->bufferIndex);
978  profile.TPSTOP(QUDA_PROFILE_TOTAL);
979  }
980 #else
981 
 982  void operator()(
 983  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
984  {
985  errorQuda("Async dslash policy variants require CUDA 8.0 and above");
986  }
987 
988 #endif // CUDA_VERSION >= 8000
989  };
990 
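 991  /**
 992     @brief Experimental host-staged policy using stream memory operations to
 993     trigger the scatter copies, with a fused exterior halo-update kernel
 994   */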
995  template <typename Dslash> struct DslashFusedExteriorAsync : DslashPolicyImp<Dslash> {
996 
997 #if (CUDA_VERSION >= 8000) && 0
998 
999  void operator()(
1000  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1001  {
1002 
1003  profile.TPSTART(QUDA_PROFILE_TOTAL);
1004 
1005  auto &dslashParam = dslash.dslashParam;
1006  dslashParam.kernel_type = INTERIOR_KERNEL;
1007  dslashParam.threads = volume;
1008 
1009  // Record the start of the dslash if doing communication in T and not kernel packing
1010  if (dslashParam.commDim[3] && !getKernelPackT()) {
1011  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1012  }
1013 
1014  issueRecv(*in, dslash, 0, false); // Prepost receives
1015 
1016  const int packIndex = Nstream - 1;
1017  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1018  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
1019  packIndex);
1020 
1021  issueGather(*in, dslash);
1022 
1023  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1024  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1025 
1026  const int scatterIndex = getStreamIndex(dslashParam);
1027  DslashCommsPattern pattern(dslashParam.commDim);
1028  while (pattern.completeSum < pattern.commDimTotal) {
1029  for (int i = 3; i >= 0; i--) {
1030  if (!dslashParam.commDim[i]) continue;
1031 
1032  for (int dir = 1; dir >= 0; dir--) {
1033 
1034  // Query if gather has completed
1035  if (!pattern.gatherCompleted[2 * i + dir] && pattern.gatherCompleted[pattern.previousDir[2 * i + dir]]) {
1036  cudaError_t event_test = comm_peer2peer_enabled(dir, i) ? cudaSuccess : cudaErrorNotReady;
1037  if (event_test != cudaSuccess)
1038  PROFILE(event_test = qudaEventQuery(gatherEnd[2 * i + dir]), profile, QUDA_PROFILE_EVENT_QUERY);
1039 
1040  if (cudaSuccess == event_test) {
1041  pattern.gatherCompleted[2 * i + dir] = 1;
1042  pattern.completeSum++;
1043  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1044  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1045  profile, QUDA_PROFILE_COMMS_START);
1046 
1047  // schedule post comms work (scatter into the end zone)
1048  if (!comm_peer2peer_enabled(1 - dir, i)) { // gather centric
1049  *((volatile cuuint32_t *)(commsEnd_h + 2 * i + 1 - dir)) = 0;
1050  CUDA_CALL(cuStreamWaitValue32(
1051  streams[scatterIndex], commsEnd_d[2 * i + 1 - dir], 1, CU_STREAM_WAIT_VALUE_EQ));
1052  PROFILE(if (dslash_copy)
1053  in->scatter(dslash.Nface() / 2, dslash.Dagger(), 2 * i + dir, streams + scatterIndex),
1054  profile, QUDA_PROFILE_SCATTER);
1055  }
1056  }
1057  }
1058 
1059  // Query if comms has finished
1060  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
1061  if (commsComplete(*in, dslash, i, dir, false, false, false, true, scatterIndex)) {
1062  pattern.commsCompleted[2 * i + dir] = 1;
1063  pattern.completeSum++;
1064  }
1065  }
1066 
1067  } // dir=0,1
1068  } // i
1069  } // while(pattern.completeSum < commDimTotal)
1070 
1071  for (int i = 3; i >= 0; i--) {
1072  if (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i))) {
1073  // if not peer-to-peer we post an event in the scatter stream and wait on that
1074  PROFILE(qudaEventRecord(scatterEnd[0], streams[scatterIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
1075  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[0], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1076  break;
1077  }
1078  }
1079 
1080  if (pattern.commDimTotal) {
1081  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
1082  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1083  }
1084 
1085  completeDslash(*in, dslashParam);
1086  in->bufferIndex = (1 - in->bufferIndex);
1087  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1088  }
1089 
1090 #else
1091 
 1092  void operator()(
 1093  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1094  {
1095  errorQuda("Async dslash policy variants require CUDA 8.0 and above");
1096  }
1097 
1098 #endif // CUDA_VERSION >= 8000
1099  };
1100 
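 1101  /**
 1102     @brief Policy that packs the halos directly into pinned host memory
 1103     (zero-copy), with host staging for the receives
 1104   */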
1105  template <typename Dslash> struct DslashZeroCopyPack : DslashPolicyImp<Dslash> {
1106 
 1107  void operator()(
 1108  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1109  {
1110 
1111  profile.TPSTART(QUDA_PROFILE_TOTAL);
1112 
1113  auto &dslashParam = dslash.dslashParam;
1114  dslashParam.kernel_type = INTERIOR_KERNEL;
1115  dslashParam.threads = volume;
1116 
1117  // record start of the dslash
1118  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1119 
1120  issueRecv(*in, dslash, 0, false); // Prepost receives
1121 
1122  const int packIndex = getStreamIndex(dslashParam);
1123  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile,
 1124  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1125  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1126  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1127  packIndex);
1128 
1129  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1130  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1131 
1132  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1133  if (!dslashParam.remote_write
1134  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1135  qudaStreamSynchronize(streams[packIndex]);
1136  break;
1137  }
1138  }
1139 
1140  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1141  for (int i = 3; i >= 0; i--) {
1142  if (!dslashParam.commDim[i]) continue;
1143 
1144  for (int dir = 1; dir >= 0; dir--) {
1145  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1146  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1147  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1148  profile, QUDA_PROFILE_COMMS_START);
1149  } // is p2p?
1150  } // dir
1151  } // i
1152  } // p2p
1153 
1154  DslashCommsPattern pattern(dslashParam.commDim, true);
1155  while (pattern.completeSum < pattern.commDimTotal) {
1156 
1157  for (int i = 3; i >= 0; i--) {
1158  if (!dslashParam.commDim[i]) continue;
1159 
1160  for (int dir = 1; dir >= 0; dir--) {
1161 
1162  // Query if comms have finished
1163  if (!pattern.commsCompleted[2 * i + dir]) {
1164  if (commsComplete(*in, dslash, i, dir, false, false, false, false)) {
1165  pattern.commsCompleted[2 * i + dir] = 1;
1166  pattern.completeSum++;
1167  }
1168  }
1169  }
1170 
1171  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
1172  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
1173  for (int dir = 1; dir >= 0; dir--) {
 1174  if (!comm_peer2peer_enabled(
 1175  1 - dir, i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
1176  // Record the end of the scattering
1177  PROFILE(
1178  qudaEventRecord(scatterEnd[2 * i + dir], streams[2 * i + dir]), profile, QUDA_PROFILE_EVENT_RECORD);
1179  // wait for scattering to finish and then launch dslash
1180  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[2 * i + dir], 0), profile,
 1181  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1182  }
1183  }
1184 
1185  dslashParam.kernel_type = static_cast<KernelType>(i);
1186  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
1187 
1188  // all faces use this stream
1189  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1190 
1191  pattern.dslashCompleted[2 * i] = 1;
1192  }
1193  }
1194  }
1195 
1196  completeDslash(*in, dslashParam);
1197  in->bufferIndex = (1 - in->bufferIndex);
1198  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1199  }
1200  };
1201 
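 1202  /**
 1203     @brief Zero-copy packing with host-staged receives and a fused
 1204     exterior halo-update kernel
 1205   */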
1206  template <typename Dslash> struct DslashFusedZeroCopyPack : DslashPolicyImp<Dslash> {
1207 
 1208  void operator()(
 1209  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1210  {
1211 
1212  profile.TPSTART(QUDA_PROFILE_TOTAL);
1213 
1214  auto &dslashParam = dslash.dslashParam;
1215  dslashParam.kernel_type = INTERIOR_KERNEL;
1216  dslashParam.threads = volume;
1217 
1218  // record start of the dslash
1219  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1220 
1221  const int packScatterIndex = getStreamIndex(dslashParam);
1222  PROFILE(qudaStreamWaitEvent(streams[packScatterIndex], dslashStart[in->bufferIndex], 0), profile,
 1223  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1224  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1225  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1226  packScatterIndex);
1227 
1228  issueRecv(*in, dslash, 0, false); // Prepost receives
1229 
1230  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1231  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1232 
1233  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1234  if (!dslashParam.remote_write
1235  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1236  qudaStreamSynchronize(streams[packScatterIndex]);
1237  break;
1238  }
1239  }
1240 
1241  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1242  for (int i = 3; i >= 0; i--) {
1243  if (!dslashParam.commDim[i]) continue;
1244 
1245  for (int dir = 1; dir >= 0; dir--) {
1246  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1247  PROFILE(
1248  if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1249  dslashParam.remote_write ? streams + packScatterIndex : nullptr, false, dslashParam.remote_write),
1250  profile, QUDA_PROFILE_COMMS_START);
1251  } // is p2p?
1252  } // dir
1253  } // i
1254  } // p2p
1255 
1256  DslashCommsPattern pattern(dslashParam.commDim, true);
1257  while (pattern.completeSum < pattern.commDimTotal) {
1258 
1259  for (int i = 3; i >= 0; i--) {
1260  if (!dslashParam.commDim[i]) continue;
1261 
1262  for (int dir = 1; dir >= 0; dir--) {
1263 
1264  // Query if comms has finished
1265  if (!pattern.commsCompleted[2 * i + dir]) {
1266  if (commsComplete(*in, dslash, i, dir, false, false, false, false, packScatterIndex)) {
1267  pattern.commsCompleted[2 * i + dir] = 1;
1268  pattern.completeSum++;
1269  }
1270  }
1271 
1272  } // dir=0,1
1273  } // i
1274  } // pattern.completeSum
1275 
1276  for (int i = 3; i >= 0; i--) {
1277  if (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i))) {
1278  // if not peer-to-peer we post an event in the scatter stream and wait on that
1279  PROFILE(qudaEventRecord(scatterEnd[0], streams[packScatterIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
1280  PROFILE(qudaStreamWaitEvent(streams[Nstream - 1], scatterEnd[0], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1281  break;
1282  }
1283  }
1284 
1285  // Launch exterior kernel
1286  if (pattern.commDimTotal) {
1287  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
1288  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1289  }
1290 
1291  completeDslash(*in, dslashParam);
1292  in->bufferIndex = (1 - in->bufferIndex);
1293  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1294  }
1295  };
1296 
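 1297  /**
 1298     @brief Zero-copy packing with GPU Direct RDMA receives
 1299   */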
1300  template <typename Dslash> struct DslashZeroCopyPackGDRRecv : DslashPolicyImp<Dslash> {
1301 
 1302  void operator()(
 1303  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1304  {
1305 
1306  profile.TPSTART(QUDA_PROFILE_TOTAL);
1307 
1308  auto &dslashParam = dslash.dslashParam;
1309  dslashParam.kernel_type = INTERIOR_KERNEL;
1310  dslashParam.threads = volume;
1311 
1312  // record start of the dslash
1313  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1314 
1315  issueRecv(*in, dslash, 0, true); // Prepost receives
1316 
1317  const int packIndex = getStreamIndex(dslashParam);
1318  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile,
 1319  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1320  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1321  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1322  packIndex);
1323 
1324  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1325  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1326 
1327  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1328  if (!dslashParam.remote_write
1329  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1330  qudaStreamSynchronize(streams[packIndex]);
1331  break;
1332  }
1333  }
1334 
1335  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1336  for (int i = 3; i >= 0; i--) {
1337  if (!dslashParam.commDim[i]) continue;
1338 
1339  for (int dir = 1; dir >= 0; dir--) {
1340  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1341  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1342  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1343  profile, QUDA_PROFILE_COMMS_START);
1344  } // is p2p?
1345  } // dir
1346  } // i
1347  } // p2p
1348 
1349  DslashCommsPattern pattern(dslashParam.commDim, true);
1350  while (pattern.completeSum < pattern.commDimTotal) {
1351 
1352  for (int i = 3; i >= 0; i--) {
1353  if (!dslashParam.commDim[i]) continue;
1354 
1355  for (int dir = 1; dir >= 0; dir--) {
1356 
1357  // Query if comms has finished
1358  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
1359  if (commsComplete(*in, dslash, i, dir, false, true, false, false)) {
1360  pattern.commsCompleted[2 * i + dir] = 1;
1361  pattern.completeSum++;
1362  }
1363  }
1364 
1365  } // dir=0,1
1366 
1367  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
1368  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
1369  dslashParam.kernel_type = static_cast<KernelType>(i);
1370  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
1371 
1372  // all faces use this stream
1373  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1374 
1375  pattern.dslashCompleted[2 * i] = 1;
1376  }
1377  }
1378  }
1379 
1380  completeDslash(*in, dslashParam);
1381  in->bufferIndex = (1 - in->bufferIndex);
1382  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1383  }
1384  };
1385 
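 1386  /**
 1387     @brief Zero-copy packing with GPU Direct RDMA receives and a fused
 1388     exterior halo-update kernel
 1389   */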
1390  template <typename Dslash> struct DslashFusedZeroCopyPackGDRRecv : DslashPolicyImp<Dslash> {
1391 
 1392  void operator()(
 1393  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1394  {
1395 
1396  profile.TPSTART(QUDA_PROFILE_TOTAL);
1397 
1398  auto &dslashParam = dslash.dslashParam;
1399  dslashParam.kernel_type = INTERIOR_KERNEL;
1400  dslashParam.threads = volume;
1401 
1402  // record start of the dslash
1403  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1404 
1405  const int packIndex = getStreamIndex(dslashParam);
1406  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile,
 1407  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1408  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1409  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1410  packIndex);
1411 
1412  issueRecv(*in, dslash, 0, true); // Prepost receives
1413 
1414  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1415  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1416 
1417  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1418  if (!dslashParam.remote_write
1419  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1420  qudaStreamSynchronize(streams[packIndex]);
1421  break;
1422  }
1423  }
1424 
1425  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1426  for (int i = 3; i >= 0; i--) {
1427  if (!dslashParam.commDim[i]) continue;
1428 
1429  for (int dir = 1; dir >= 0; dir--) {
1430  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1431  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1432  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1433  profile, QUDA_PROFILE_COMMS_START);
1434  } // is p2p?
1435  } // dir
1436  } // i
1437  } // p2p
1438 
1439  DslashCommsPattern pattern(dslashParam.commDim, true);
1440  while (pattern.completeSum < pattern.commDimTotal) {
1441 
1442  for (int i = 3; i >= 0; i--) {
1443  if (!dslashParam.commDim[i]) continue;
1444 
1445  for (int dir = 1; dir >= 0; dir--) {
1446 
1447  // Query if comms has finished
1448  if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
1449  if (commsComplete(*in, dslash, i, dir, false, true, false, false)) {
1450  pattern.commsCompleted[2 * i + dir] = 1;
1451  pattern.completeSum++;
1452  }
1453  }
1454  } // dir=0,1
1455  } // i
1456  } // while(pattern.completeSum < commDimTotal)
1457 
1458  // Launch exterior kernel
1459  if (pattern.commDimTotal) {
1460  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
1461  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1462  }
1463 
1464  completeDslash(*in, dslashParam);
1465  in->bufferIndex = (1 - in->bufferIndex);
1466  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1467  }
1468  };
1469 
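 1470  /**
 1471     @brief Policy that uses zero-copy memory both for packing the halos and
 1472     for reading them back in the exterior kernels (no gather or scatter copies)
 1473   */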
1474  template <typename Dslash> struct DslashZeroCopy : DslashPolicyImp<Dslash> {
1475 
 1476  void operator()(
 1477  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1478  {
1479 
1480  profile.TPSTART(QUDA_PROFILE_TOTAL);
1481 
1482  auto &dslashParam = dslash.dslashParam;
1483  dslashParam.kernel_type = INTERIOR_KERNEL;
1484  dslashParam.threads = volume;
1485 
1486  // record start of the dslash
1487  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1488 
1489  issueRecv(*in, dslash, 0, false); // Prepost receives
1490 
1491  const int packIndex = getStreamIndex(dslashParam);
1492  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile,
 1493  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1494  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1495  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1496  packIndex);
1497 
1498  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1499  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1500 
1501  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1502  if (!dslashParam.remote_write
1503  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1504  qudaStreamSynchronize(streams[packIndex]);
1505  break;
1506  }
1507  }
1508 
1509  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1510  for (int i = 3; i >= 0; i--) {
1511  if (!dslashParam.commDim[i]) continue;
1512 
1513  for (int dir = 1; dir >= 0; dir--) {
1514  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1515  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1516  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1517  profile, QUDA_PROFILE_COMMS_START);
1518  } // is p2p?
1519  } // dir
1520  } // i
1521  } // p2p
1522 
1523  DslashCommsPattern pattern(dslashParam.commDim, true);
1524  while (pattern.completeSum < pattern.commDimTotal) {
1525 
1526  for (int i = 3; i >= 0; i--) {
1527  if (!dslashParam.commDim[i]) continue;
1528 
1529  for (int dir = 1; dir >= 0; dir--) {
1530 
1531  // Query if comms have finished
1532  if (!pattern.commsCompleted[2 * i + dir]) {
1533  if (commsComplete(*in, dslash, i, dir, false, false, true, false)) {
1534  pattern.commsCompleted[2 * i + dir] = 1;
1535  pattern.completeSum++;
1536  }
1537  }
1538  }
1539 
1540  // enqueue the boundary dslash kernel as soon as the scatters have been enqueued
1541  if (!pattern.dslashCompleted[2 * i] && pattern.dslashCompleted[pattern.previousDir[2 * i + 1]]
1542  && pattern.commsCompleted[2 * i] && pattern.commsCompleted[2 * i + 1]) {
1543  dslashParam.kernel_type = static_cast<KernelType>(i);
1544  dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
1545 
1546  setMappedGhost(dslash, *in, true);
1547  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1548  setMappedGhost(dslash, *in, false);
1549 
1550  pattern.dslashCompleted[2 * i] = 1;
1551  }
1552  }
1553  }
1554 
1555  in->bufferIndex = (1 - in->bufferIndex);
1556  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1557  }
1558  };
1559 
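 1560  /**
 1561     @brief Zero-copy packing and zero-copy exterior reads, with a fused
 1562     exterior halo-update kernel
 1563   */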
1564  template <typename Dslash> struct DslashFusedZeroCopy : DslashPolicyImp<Dslash> {
1565 
 1566  void operator()(
 1567  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1568  {
1569 
1570  profile.TPSTART(QUDA_PROFILE_TOTAL);
1571 
1572  auto &dslashParam = dslash.dslashParam;
1573  dslashParam.kernel_type = INTERIOR_KERNEL;
1574  dslashParam.threads = volume;
1575 
1576  // record start of the dslash
1577  PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], streams[Nstream - 1]), profile, QUDA_PROFILE_EVENT_RECORD);
1578 
1579  issueRecv(*in, dslash, 0, false); // Prepost receives
1580 
1581  const int packIndex = getStreamIndex(dslashParam);
1582  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile,
 1583  QUDA_PROFILE_STREAM_WAIT_EVENT);
 1584  const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
1585  issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
1586  packIndex);
1587 
1588  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1589  if (aux_worker) aux_worker->apply(streams[Nstream - 1]);
1590 
1591  for (int i = 3; i >= 0; i--) { // only synchronize if we need to
1592  if (!dslashParam.remote_write
1593  || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0, i) || !comm_peer2peer_enabled(1, i)))) {
1594  qudaStreamSynchronize(streams[packIndex]);
1595  break;
1596  }
1597  }
1598 
1599  for (int p2p = 0; p2p < 2; p2p++) { // schedule non-p2p traffic first, then do p2p
1600  for (int i = 3; i >= 0; i--) {
1601  if (!dslashParam.commDim[i]) continue;
1602 
1603  for (int dir = 1; dir >= 0; dir--) {
1604  if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
1605  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface() / 2, 2 * i + dir, dslash.Dagger(),
1606  dslashParam.remote_write ? streams + packIndex : nullptr, false, dslashParam.remote_write),
1607  profile, QUDA_PROFILE_COMMS_START);
1608  } // is p2p?
1609  } // dir
1610  } // i
1611  } // p2p
1612 
1613  DslashCommsPattern pattern(dslashParam.commDim, true);
1614  while (pattern.completeSum < pattern.commDimTotal) {
1615 
1616  for (int i = 3; i >= 0; i--) {
1617  if (!dslashParam.commDim[i]) continue;
1618 
1619  for (int dir = 1; dir >= 0; dir--) {
1620 
1621  // Query if comms have finished
1622  if (!pattern.commsCompleted[2 * i + dir]) {
1623  if (commsComplete(*in, dslash, i, dir, false, false, true, false)) {
1624  pattern.commsCompleted[2 * i + dir] = 1;
1625  pattern.completeSum++;
1626  }
1627  }
1628  }
1629  }
1630  }
1631 
1632  if (pattern.commDimTotal) {
1633  setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
1634  setMappedGhost(dslash, *in, true);
1635  PROFILE(if (dslash_exterior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1636  setMappedGhost(dslash, *in, false);
1637  }
1638 
1639  completeDslash(*in, dslashParam);
1640  in->bufferIndex = (1 - in->bufferIndex);
1641  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1642  }
1643  };
1644 
1645  template <typename Dslash> struct DslashNC : DslashPolicyImp<Dslash> {
1646 
1647  void operator()(
1648  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
1649  {
1650 
1651  profile.TPSTART(QUDA_PROFILE_TOTAL);
1652 
1653  auto &dslashParam = dslash.dslashParam;
1654  dslashParam.kernel_type = INTERIOR_KERNEL;
1655  dslashParam.threads = volume;
1656 
1657  PROFILE(if (dslash_interior_compute) dslash.apply(streams[Nstream - 1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
1658 
1659  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1660  }
1661  };
1662 
1663  // whether we have initialized the dslash policy tuner
1664  extern bool dslash_policy_init;
1665 
1666  // used to keep track of which policy to start the autotuning
1667  extern int first_active_policy;
1668  extern int first_active_p2p_policy;
1669 
1670  enum class QudaDslashPolicy {
1671  QUDA_DSLASH,
1672  QUDA_DSLASH_ASYNC,
1673  QUDA_FUSED_DSLASH,
1674  QUDA_FUSED_DSLASH_ASYNC,
1675  QUDA_GDR_DSLASH,
1676  QUDA_FUSED_GDR_DSLASH,
1677  QUDA_GDR_RECV_DSLASH,
1678  QUDA_FUSED_GDR_RECV_DSLASH,
1679  QUDA_ZERO_COPY_PACK_DSLASH,
1680  QUDA_FUSED_ZERO_COPY_PACK_DSLASH,
1681  QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH,
1682  QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH,
1683  QUDA_ZERO_COPY_DSLASH,
1684  QUDA_FUSED_ZERO_COPY_DSLASH,
1685  QUDA_DSLASH_NC,
1686  QUDA_DSLASH_POLICY_DISABLED // this MUST be the last element
1687  };
1688 
1689  // list of dslash policies that are enabled
1690  extern std::vector<QudaDslashPolicy> policies;
1691 
1692  // string used as a tunekey to ensure we retune if the dslash policy env changes
1693  extern char policy_string[TuneKey::aux_n];
1694 
1695  enum class QudaP2PPolicy {
1696  QUDA_P2P_DEFAULT, // no special handling for p2p
1697  QUDA_P2P_COPY_ENGINE, // use copy engine for p2p traffic
1698  QUDA_P2P_REMOTE_WRITE, // write packed halos directly to peers
1699  QUDA_P2P_POLICY_DISABLED, // this must be the last element
1700  };
1701 
1702  // list of p2p policies that are enabled
1703  extern std::vector<QudaP2PPolicy> p2p_policies;
1704 
1705  template <typename Dslash> struct DslashFactory {
1706 
1707  static DslashPolicyImp<Dslash> *create(const QudaDslashPolicy &dslashPolicy)
1708  {
1709  DslashPolicyImp<Dslash> *result = nullptr;
1710 
1711  switch (dslashPolicy) {
1712  case QudaDslashPolicy::QUDA_DSLASH: result = new DslashBasic<Dslash>; break;
1713  case QudaDslashPolicy::QUDA_DSLASH_ASYNC: result = new DslashAsync<Dslash>; break;
1716  case QudaDslashPolicy::QUDA_GDR_DSLASH:
1717  if (!comm_gdr_blacklist())
1718  result = new DslashGDR<Dslash>;
1719  else
1720  result = new DslashBasic<Dslash>;
1721  break;
1722  case QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH:
1723  if (!comm_gdr_blacklist())
1724  result = new DslashFusedGDR<Dslash>;
1725  else
1726  result = new DslashFusedExterior<Dslash>;
1727  break;
1728  case QudaDslashPolicy::QUDA_GDR_RECV_DSLASH:
1729  if (!comm_gdr_blacklist())
1730  result = new DslashGDRRecv<Dslash>;
1731  else
1732  result = new DslashBasic<Dslash>;
1733  break;
1734  case QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH:
1735  if (!comm_gdr_blacklist())
1736  result = new DslashFusedGDRRecv<Dslash>;
1737  else
1738  result = new DslashFusedExterior<Dslash>;
1739  break;
1742  case QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1743  if (!comm_gdr_blacklist())
1744  result = new DslashZeroCopyPackGDRRecv<Dslash>;
1745  else
1746  result = new DslashZeroCopyPack<Dslash>;
1747  break;
1748  case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1749  if (!comm_gdr_blacklist())
1750  result = new DslashFusedZeroCopyPackGDRRecv<Dslash>;
1751  else
1752  result = new DslashFusedZeroCopyPack<Dslash>;
1753  break;
1756  case QudaDslashPolicy::QUDA_DSLASH_NC: result = new DslashNC<Dslash>; break;
1757  default: errorQuda("Dslash policy %d not recognized", static_cast<int>(dslashPolicy)); break;
1758  }
1759  return result; // default
1760  }
1761  };
1762 
1763  inline void enable_policy(QudaDslashPolicy p) { policies[static_cast<std::size_t>(p)] = p; }
1764 
1765  inline void disable_policy(QudaDslashPolicy p)
1766  {
1767  policies[static_cast<std::size_t>(p)] = QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED;
1768  }
1769 
1770  template <typename Dslash> class DslashPolicyTune : public Tunable
1771  {
1772 
1773  Dslash &dslash;
1774  decltype(dslash.dslashParam) &dslashParam;
1775  cudaColorSpinorField *in;
1776  const int volume;
1777  const int *ghostFace;
1778  TimeProfile &profile;
1779 
1780  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
1781  bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
1782  unsigned int sharedBytesPerThread() const { return 0; }
1783  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
1784 
1785 public:
1786  DslashPolicyTune(
1787  Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *ghostFace, TimeProfile &profile) :
1788  dslash(dslash),
1789  dslashParam(dslash.dslashParam),
1790  in(in),
1791  volume(volume),
1792  ghostFace(ghostFace),
1793  profile(profile)
1794  {
1795  in->streamInit(streams);
1796 
1797  if (!dslash_policy_init) {
1798 
1799  first_active_policy = static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED);
1800  first_active_p2p_policy = static_cast<int>(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED);
1801 
1802  if (comm_peer2peer_enabled_global() & 2) { // enable/disable p2p remote-write policy tuning
1803  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE)]
1804  = QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE;
1805  first_active_p2p_policy = static_cast<int>(QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE);
1806  }
1807 
1808  if (comm_peer2peer_enabled_global() & 1) { // enable/disable p2p copy-engine policy tuning
1809  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_COPY_ENGINE)]
1810  = QudaP2PPolicy::QUDA_P2P_COPY_ENGINE;
1811  first_active_p2p_policy = static_cast<int>(QudaP2PPolicy::QUDA_P2P_COPY_ENGINE);
1812  }
1813 
1814  if (!(comm_peer2peer_enabled_global() & 4)) { // enable/disable non-p2p policy tuning
1815  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_DEFAULT)] = QudaP2PPolicy::QUDA_P2P_DEFAULT;
1816  first_active_p2p_policy = static_cast<int>(QudaP2PPolicy::QUDA_P2P_DEFAULT);
1817  }
1818 
1819  static char *dslash_policy_env = getenv("QUDA_ENABLE_DSLASH_POLICY");
1820  if (dslash_policy_env) { // set the policies to tune for explicitly
1821  std::stringstream policy_list(dslash_policy_env);
1822 
1823  int policy_;
1824  while (policy_list >> policy_) {
1825  QudaDslashPolicy dslash_policy = static_cast<QudaDslashPolicy>(policy_);
1826 
1827  // check this is a valid policy choice
1828  if ((dslash_policy == QudaDslashPolicy::QUDA_GDR_DSLASH
1829  || dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH
1830  || dslash_policy == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH
1831  || dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH)
1832  && !comm_gdr_enabled()) {
1833  errorQuda("Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", static_cast<int>(dslash_policy));
1834  }
1835 
1836  enable_policy(static_cast<QudaDslashPolicy>(policy_));
1837  first_active_policy = policy_ < first_active_policy ? policy_ : first_active_policy;
1838  if (policy_list.peek() == ',') policy_list.ignore();
1839  }
1840  if (first_active_policy == static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED))
1841  errorQuda("No valid policy found in QUDA_ENABLE_DSLASH_POLICY");
1842  } else {
1843  enable_policy(QudaDslashPolicy::QUDA_DSLASH);
1844  first_active_policy = 0;
1845  enable_policy(QudaDslashPolicy::QUDA_FUSED_DSLASH);
1846 
1847  // if we have gdr then enable tuning these policies
1848  if (comm_gdr_enabled()) {
1849  enable_policy(QudaDslashPolicy::QUDA_GDR_DSLASH);
1850  enable_policy(QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH);
1851  enable_policy(QudaDslashPolicy::QUDA_GDR_RECV_DSLASH);
1852  enable_policy(QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH);
1853  }
1854 
1855  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH);
1856  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH);
1857 
1858  if (comm_gdr_enabled()) {
1859  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1860  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1861  }
1862 
1863  // pure zero-copy policies require texture objects
1864  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH);
1865  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH);
1866 
1867  // Async variants are only supported on CUDA 8.0 and up
1868 #if (CUDA_VERSION >= 8000) && 0
1869 #if (CUDA_VERSION >= 9000)
1870  CUdevice device;
1871  cuDeviceGet(&device, comm_gpuid());
1872  int can_use_stream_mem_ops;
1873  cuDeviceGetAttribute(&can_use_stream_mem_ops, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, device);
1874 #else
1875  int can_use_stream_mem_ops = 1;
1876 #endif
1877  if (can_use_stream_mem_ops) {
1878  enable_policy(QudaDslashPolicy::QUDA_DSLASH_ASYNC);
1879  enable_policy(QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC);
1880  }
1881 #endif
1882  }
1883 
1884  // construct string specifying which policies have been enabled
1885  for (int i = 0; i < (int)QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED; i++) {
1886  strcat(policy_string, (int)policies[i] == i ? "1" : "0");
1887  }
1888 
1889  static char *dslash_pack_env = getenv("QUDA_ENABLE_DSLASH_PACK");
1890  if (dslash_pack_env && strcmp(dslash_pack_env, "0") == 0) {
1891  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash halo packing");
1892  dslash_pack_compute = false;
1893  }
1894 
1895  static char *dslash_interior_env = getenv("QUDA_ENABLE_DSLASH_INTERIOR");
1896  if (dslash_interior_env && strcmp(dslash_interior_env, "0") == 0) {
1897  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash interior computation");
1898  dslash_interior_compute = false;
1899  }
1900 
1901  static char *dslash_exterior_env = getenv("QUDA_ENABLE_DSLASH_EXTERIOR");
1902  if (dslash_exterior_env && strcmp(dslash_exterior_env, "0") == 0) {
1903  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash exterior computation");
1904  dslash_exterior_compute = false;
1905  }
1906 
1907  static char *dslash_copy_env = getenv("QUDA_ENABLE_DSLASH_COPY");
1908  if (dslash_copy_env && strcmp(dslash_copy_env, "0") == 0) {
1909  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash host-device copying");
1910  dslash_copy = false;
1911  }
1912 
1913  static char *dslash_comms_env = getenv("QUDA_ENABLE_DSLASH_COMMS");
1914  if (dslash_comms_env && strcmp(dslash_comms_env, "0") == 0) {
1915  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash communication");
1916  dslash_comms = false;
1917  }
1918  }
1919 
1920  // before we do policy tuning we must ensure the kernel
1921  // constituents have been tuned since we can't do nested tuning
1922  if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
1923  disableProfileCount();
1924 
1925  for (auto &p2p : p2p_policies) {
1926 
1927  if (p2p == QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED) continue;
1928 
1929  bool p2p_enabled = comm_peer2peer_enabled_global();
1930  if (p2p == QudaP2PPolicy::QUDA_P2P_DEFAULT) comm_enable_peer2peer(false); // disable p2p if using default policy
1931  dslashParam.remote_write = (p2p == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0);
1932 
1933  for (auto &i : policies) {
1934 
1935  if ( (i == QudaDslashPolicy::QUDA_DSLASH ||
1936  i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
1937  i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
1938  i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) &&
1939  !dslashParam.remote_write) {
1940 
1941  DslashPolicyImp<Dslash> *dslashImp = DslashFactory<Dslash>::create(i);
1942  (*dslashImp)(dslash, in, volume, ghostFace, profile);
1943  delete dslashImp;
1944 
1945  } else if ( (i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
1946  i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
1947  i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
1948  i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH ||
1949  i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
1950  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
1951  i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
1952  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
1953  i == QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH ||
1954  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH) ||
1955  ((i == QudaDslashPolicy::QUDA_DSLASH ||
1956  i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
1957  i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
1958  i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) && dslashParam.remote_write) ) {
1959  // these dslash policies all must have kernel packing enabled
1960 
1961  // clumsy, but we call setKernelPackT a handful of times before
1962  // we restore the current state, so this will "just work"
1963  pushKernelPackT(getKernelPackT());
1964 
1965  // if we are using GDR policies then we must tune the
1966  // non-GDR variants as well with and without kernel
1967  // packing enabled - this ensures that all GPUs will have
1968  // the required tune cache entries prior to potential
1969  // process divergence regardless of which GPUs are
1970  // blacklisted. don't enter if remote writing since
1971  // there we always use kernel packing
1972  if ( (i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
1973  i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
1974  i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
1975  i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH) && !dslashParam.remote_write ) {
1976  QudaDslashPolicy policy = (i == QudaDslashPolicy::QUDA_GDR_DSLASH || i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH) ?
1977  QudaDslashPolicy::QUDA_DSLASH : QudaDslashPolicy::QUDA_FUSED_DSLASH;
1978  DslashPolicyImp<Dslash> *dslashImp = DslashFactory<Dslash>::create(policy);
1979  setKernelPackT(false);
1980  (*dslashImp)(dslash, in, volume, ghostFace, profile);
1981  setKernelPackT(true);
1982  (*dslashImp)(dslash, in, volume, ghostFace, profile);
1983  delete dslashImp;
1984  }
1985 
1986  setKernelPackT(true);
1987 
1988  DslashPolicyImp<Dslash> *dslashImp = DslashFactory<Dslash>::create(i);
1989  (*dslashImp)(dslash, in, volume, ghostFace, profile);
1990  delete dslashImp;
1991 
1992  // restore default kernel packing
1993  popKernelPackT();
1994 
1995  } else {
1996  errorQuda("Unsupported dslash policy %d\n", static_cast<int>(i));
1997  }
1998  }
1999 
2000  comm_enable_peer2peer(p2p_enabled); // restore p2p state
2001  } // p2p policies
2002 
2003  enableProfileCount();
2004  setPolicyTuning(true);
2005  }
2006  dslash_policy_init = true;
2007  }
2008 
2009  virtual ~DslashPolicyTune() { setPolicyTuning(false); }
2010 
2011  void apply(const cudaStream_t &stream) {
2012  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
2013 
2014  if (tp.aux.x >= static_cast<int>(policies.size())) errorQuda("Requested policy that is outside of range");
2015  if (static_cast<QudaDslashPolicy>(tp.aux.x) == QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED) errorQuda("Requested policy is disabled");
2016 
2017  bool p2p_enabled = comm_peer2peer_enabled_global();
2018  if (p2p_policies[tp.aux.y] == QudaP2PPolicy::QUDA_P2P_DEFAULT) comm_enable_peer2peer(false); // disable p2p if using default policy
2019  dslashParam.remote_write = (p2p_policies[tp.aux.y] == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0); // set whether we are using remote packing writes or copy engines
2020 
2021  // switch on kernel packing for the policies that need it, save default kernel packing
2022  pushKernelPackT(getKernelPackT());
2023  auto p = static_cast<QudaDslashPolicy>(tp.aux.x);
2024  if (p == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2025  p == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2026  p == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
2027  p == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH ||
2028  p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
2029  p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
2030  p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2031  p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2032  dslashParam.remote_write // always use kernel packing if remote writing
2033  ) {
2034  setKernelPackT(true);
2035  }
2036 
2037  DslashPolicyImp<Dslash> *dslashImp = DslashFactory<Dslash>::create(static_cast<QudaDslashPolicy>(tp.aux.x));
2038  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2039  delete dslashImp;
2040 
2041  // restore p2p state
2042  comm_enable_peer2peer(p2p_enabled);
2043 
2044  // restore default kernel packing
2045  popKernelPackT();
2046  }
2047 
2048  int tuningIter() const { return 10; }
2049 
2050  // Find the best dslash policy
2051  bool advanceAux(TuneParam &param) const
2052  {
2053  while ((unsigned)param.aux.x < policies.size()-1) {
2054  param.aux.x++;
2055  if (policies[param.aux.x] != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED) return true;
2056  }
2057  param.aux.x = first_active_policy;
2058 
2059  while ((unsigned)param.aux.y < p2p_policies.size()-1) {
2060  param.aux.y++;
2061  if (p2p_policies[param.aux.y] != QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED) return true;
2062  }
2063  param.aux.y = first_active_p2p_policy;
2064 
2065  return false;
2066  }
2067 
2068  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
2069 
2070  void initTuneParam(TuneParam &param) const {
2071  Tunable::initTuneParam(param);
2072  param.aux.x = first_active_policy;
2073  param.aux.y = first_active_p2p_policy;
2074  param.aux.z = 0;
2075  }
2076 
2077  void defaultTuneParam(TuneParam &param) const {
2078  Tunable::defaultTuneParam(param);
2079  param.aux.x = first_active_policy;
2080  param.aux.y = first_active_p2p_policy;
2081  param.aux.z = 0;
2082  }
2083 
2084  TuneKey tuneKey() const {
2085  KernelType kernel_type = dslashParam.kernel_type;
2086  dslashParam.kernel_type = KERNEL_POLICY;
2087  TuneKey key = dslash.tuneKey();
2088  strcat(key.aux, comm_dim_topology_string());
2089  strcat(key.aux, comm_config_string()); // any change in P2P/GDR will be stored as a separate tunecache entry
2090  strcat(key.aux, policy_string); // any change in policies enabled will be stored as a separate entry
2091  dslashParam.kernel_type = kernel_type;
2092  return key;
2093  }
2094 
2095  long long flops() const {
2096  KernelType kernel_type = dslashParam.kernel_type;
2097  dslashParam.kernel_type = KERNEL_POLICY;
2098  long long flops_ = dslash.flops();
2099  dslashParam.kernel_type = kernel_type;
2100  return flops_;
2101  }
2102 
2103  long long bytes() const {
2104  KernelType kernel_type = dslashParam.kernel_type;
2105  dslashParam.kernel_type = KERNEL_POLICY;
2106  long long bytes_ = dslash.bytes();
2107  dslashParam.kernel_type = kernel_type;
2108  return bytes_;
2109  }
2110 
2111  void preTune() { dslash.preTune(); }
2112 
2113  void postTune() { dslash.postTune(); }
2114  };
2115 
2116  } // namespace dslash
2117 
2118 } // namespace quda
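A brief usage sketch (illustrative, not part of dslash_policy.cuh): the tuner defined above is driven by constructing a DslashPolicyTune for an already-configured dslash operator and calling apply(); tuneLaunch() then selects the fastest enabled QudaDslashPolicy / QudaP2PPolicy pair, and the candidate set can be restricted at run time via the QUDA_ENABLE_DSLASH_POLICY environment variable parsed in the constructor (a comma-separated list of integer QudaDslashPolicy values). In the sketch below, dslash, in, volume, ghostFace and profile are assumed to already exist with the types taken by the DslashPolicyTune constructor; only that interface is drawn from this file.

 // Hypothetical caller-side sketch: shows how DslashPolicyTune is typically invoked.
 // "dslash" is the operator instance, "in" the source-field pointer,
 // "volume"/"ghostFace" the local volume and face sizes, "profile" a TimeProfile.
 quda::dslash::DslashPolicyTune<decltype(dslash)> policy_tune(dslash, in, volume, ghostFace, profile);
 policy_tune.apply(0); // 0 = default stream; the policy implementations use the module-level streams array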