QUDA  0.9.0
dslash_policy.cuh
Go to the documentation of this file.
// Source spinor shared with the pthreads helper threads (dereferenced by
// issueMPIReceive below); set elsewhere in this file — TODO confirm writer.
static cudaColorSpinorField *inSpinor;

// hooks into tune.cpp variables for policy tuning
typedef std::map<TuneKey, TuneParam> map;
const map& getTuneCache();

void enableProfileCount();

void setPolicyTuning(bool);

// these variables are used for benchmarking the dslash components in isolation
static bool dslash_pack_compute = true;     // enable the halo packing kernels
static bool dslash_interior_compute = true; // enable the interior dslash kernel
static bool dslash_exterior_compute = true; // enable the exterior (halo-update) kernels
static bool dslash_comms = true;            // enable the MPI sends/receives
static bool dslash_copy = true;             // enable the gather/scatter copies
18 
29 
30  inline DslashCommsPattern(const int commDim[], bool gdr_send=false)
32 
33  for (int i=0; i<Nstream-1; i++) gatherCompleted[i] = gdr_send ? 1 : 0;
34  gatherCompleted[Nstream-1] = 1;
35  commsCompleted[Nstream-1] = 1;
36  dslashCompleted[Nstream-1] = 1;
37 
38  // We need to know which was the previous direction in which
39  // communication was issued, since we only query a given event /
40  // comms call after the previous the one has successfully
41  // completed.
42  for (int i=3; i>=0; i--) {
43  if (commDim[i]) {
44  int prev = Nstream-1;
45  for (int j=3; j>i; j--) if (commDim[j]) prev = 2*j;
46  previousDir[2*i + 1] = prev;
47  previousDir[2*i + 0] = 2*i + 1; // always valid
48  }
49  }
50 
51  // this tells us how many events / comms occurances there are in
52  // total. Used for exiting the while loop
53  commDimTotal = 0;
54  for (int i=3; i>=0; i--) {
56  }
57  commDimTotal *= gdr_send ? 2 : 4; // 2 from pipe length, 2 from direction
58  }
59 };
60 
61 
/**
   Set up the parameters for the fused exterior kernel: each active
   (partitioned) dimension is assigned a contiguous thread range
   [threadDimMapLower[i], threadDimMapUpper[i]), and param.threads ends
   up as the total thread count over all active dimensions.
*/
inline void setFusedParam(DslashParam& param, DslashCuda &dslash, const int* faceVolumeCB){
  int prev = -1; // most recent active dimension, -1 if none yet

  param.threads = 0;
  for (int i=0; i<4; ++i) {
    param.threadDimMapLower[i] = 0;
    param.threadDimMapUpper[i] = 0;
    if (!dslash.dslashParam.commDim[i]) continue; // dimension not partitioned
    // this dimension's range starts where the previous active one ended
    param.threadDimMapLower[i] = (prev >= 0 ? param.threadDimMapUpper[prev] : 0);
    param.threadDimMapUpper[i] = param.threadDimMapLower[i] + dslash.Nface()*faceVolumeCB[i];
    param.threads = param.threadDimMapUpper[i]; // running total
    prev = i;
  }

  param.kernel_type = EXTERIOR_KERNEL_ALL;
}
78 
// When DSLASH_PROFILE is defined, PROFILE(f, profile, idx) brackets the
// call f with timer start/stop on the given TimeProfile index; otherwise
// it just issues the call.  NOTE(review): the #undef immediately above
// the #ifdef means the profiled variant is always compiled out here.
#undef DSLASH_PROFILE
#ifdef DSLASH_PROFILE
#define PROFILE(f, profile, idx)     \
  profile.TPSTART(idx);              \
  f;                                 \
  profile.TPSTOP(idx);
#else
#define PROFILE(f, profile, idx) f;
#endif
88 
89 
90 
91 #ifdef PTHREADS
92 #include <pthread.h>
93 
94 
95 namespace {
96 
// Parameters handed to the MPI-receive helper thread (issueMPIReceive).
struct ReceiveParam
{
  TimeProfile* profile; // profile to which the comms start is attributed
  int nFace;            // number of faces forwarded to recvStart
  int dagger;           // dagger flag forwarded to recvStart
};
103 
// Thread entry point: pre-post the MPI receives on the global inSpinor for
// every partitioned dimension, in both directions.
void *issueMPIReceive(void* receiveParam)
{
  ReceiveParam* param = static_cast<ReceiveParam*>(receiveParam);
  for(int i=3; i>=0; i--){
    if(!dslashParam.commDim[i]) continue; // dimension not partitioned
    for(int dir=1; dir>=0; dir--){
      PROFILE(inSpinor->recvStart(param->nFace, 2*i+dir, param->dagger), (*(param->profile)), QUDA_PROFILE_COMMS_START);
    }
  }
  return nullptr;
}
115 
// Parameters handed to the interior-kernel helper thread.
struct InteriorParam
{
  TimeProfile* profile; // profile to which the kernel launch is attributed
  DslashCuda* dslash;   // operator to launch; NOTE(review): member reinstated —
                        // launchInteriorKernel dereferences param->dslash and the
                        // caller assigns interiorParam.dslash = &dslash
  int current_device;   // CUDA device to bind in the spawned thread
};
122 
123 
// Thread entry point: bind the caller's CUDA device (the device context is
// per-thread), then launch the interior dslash — and any auxiliary work —
// on the compute stream streams[Nstream-1].
void* launchInteriorKernel(void* interiorParam)
{
  InteriorParam* param = static_cast<InteriorParam*>(interiorParam);
  cudaSetDevice(param->current_device); // set device in the new thread
  PROFILE(param->dslash->apply(streams[Nstream-1]), (*(param->profile)), QUDA_PROFILE_DSLASH_KERNEL);
  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
  return nullptr;
}
132 
133 } // anonymous namespace
134 #endif
135 
136 
137 namespace {
138 
/**
   Pre-post the halo receives for all partitioned dimensions, both
   directions (no-op per direction when dslash_comms is disabled).
   @param[in,out] input Field whose receives are initiated
   @param[in] dslash Operator supplying commDim, Nface and dagger
   @param[in] stream Stream pointer forwarded to recvStart (may be 0)
   @param[in] gdr Whether GPU Direct RDMA receive is requested
*/
inline void issueRecv(cudaColorSpinorField &input, const DslashCuda &dslash, cudaStream_t *stream, bool gdr) {
  for(int i=3; i>=0; i--){
    if (!dslash.dslashParam.commDim[i]) continue;
    for(int dir=1; dir>=0; dir--) {
      PROFILE(if (dslash_comms) input.recvStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), stream, gdr), profile, QUDA_PROFILE_COMMS_START);
    }
  }
}
154 
/**
   Pack the halo faces of the source field into the requested
   destination(s), then record packEnd so downstream copies/kernels can
   wait on the packing.
   @param[in,out] in Field being packed
   @param[in] dslash Operator supplying commDim, Nface, dagger and twist terms
   @param[in] parity Parity of the packing kernel
   @param[in] location Requested destination flags (Device/Host/Remote)
   @param[in] packIndex Stream index the pack kernel runs in
*/
inline void issuePack(cudaColorSpinorField &in, const DslashCuda &dslash, int parity, MemoryLocation location, int packIndex) {

  // fix: the original tested ((location & Device) & Host), which is always
  // false for distinct bit flags, so the sanity check could never fire
  if ( (location & Device) && (location & Host) ) errorQuda("MemoryLocation cannot be both Device and Host");

  using namespace dslash;

  // packing is needed if any partitioned dimension requires it (the time
  // dimension only when kernel packing is enabled)
  bool pack = false;
  for (int i=3; i>=0; i--)
    if (dslash.dslashParam.commDim[i] && (i!=3 || getKernelPackT()))
      { pack = true; break; }

  // per dimension/direction destination: p2p remote, host, or local device
  MemoryLocation pack_dest[2*QUDA_MAX_DIM];
  for (int dim=0; dim<4; dim++) {
    for (int dir=0; dir<2; dir++) {
      if ( (location & Remote) && comm_peer2peer_enabled(dir,dim) ) {
        pack_dest[2*dim+dir] = Remote; // pack to p2p remote
      } else if ( location & Host && !comm_peer2peer_enabled(dir,dim) ) {
        pack_dest[2*dim+dir] = Host; // pack to cpu memory
      } else {
        pack_dest[2*dim+dir] = Device; // pack to local gpu memory
      }
    }
  }

  if (pack) {
    PROFILE(if (dslash_pack_compute) in.pack(dslash.Nface()/2, parity, dslash.Dagger(), packIndex,
                                             pack_dest, location, dslash.dslashParam.twist_a, dslash.dslashParam.twist_b),
            profile, QUDA_PROFILE_PACK_KERNEL);

    // Record the end of the packing
    PROFILE(if (location != Host) qudaEventRecord(packEnd[in.bufferIndex], streams[packIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
  }

}
198 
/**
   Initiate the device-to-host gather of each halo face.  Each gather
   stream first waits on the event signalling its payload is ready
   (packEnd when kernel packing produced it, dslashStart otherwise),
   then records gatherEnd for the non-p2p directions so the comms
   progress loop can poll it.
   NOTE(review): the stream-wait and event-record statements were dropped
   by the documentation extraction (the empty if-branch and unused
   `event` betray this); they are reconstructed here from the identical
   sequence in the pthreads code path — confirm against upstream.
*/
inline void issueGather(cudaColorSpinorField &in, const DslashCuda &dslash) {

  using namespace dslash;

  for (int i = 3; i >=0; i--) {
    if (!dslash.dslashParam.commDim[i]) continue;

    for (int dir=1; dir>=0; dir--) { // forwards gather
      cudaEvent_t &event = (i!=3 || getKernelPackT()) ? packEnd[in.bufferIndex] : dslashStart[in.bufferIndex];

      PROFILE(qudaStreamWaitEvent(streams[2*i+dir], event, 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);

      // Initialize host transfer from source spinor
      PROFILE(if (dslash_copy) in.gather(dslash.Nface()/2, dslash.Dagger(), 2*i+dir), profile, QUDA_PROFILE_GATHER);

      // Record the end of the gathering if not peer-to-peer
      if (!comm_peer2peer_enabled(dir,i)) {
        PROFILE(qudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]), profile, QUDA_PROFILE_EVENT_RECORD);
      }
    }
  }

}
228 
/**
   Return a stream index that is not being used for peer-to-peer
   traffic (scanning dimensions high to low, so the lowest active
   dimension wins); falls back to 0 when fully p2p connected.
*/
template <typename T>
inline int getStreamIndex(const T &dslashParam) {
  // set index to a stream index not being used for p2p
  int index = -1;
  for (int i = 3; i >=0; i--) {
    if (!dslashParam.commDim[i]) continue;
    if (!comm_peer2peer_enabled(0,i)) index = 2*i+0;
    else if (!comm_peer2peer_enabled(1,i)) index = 2*i+1;
  }
  // make sure we pick a valid index, in case we are fully p2p connected
  if (index == -1) index = 0;
  return index;
}
252 
/**
   Test whether the halo exchange in the given dimension/direction has
   completed and, if so, enqueue whatever is needed to make the received
   halo visible to the compute stream: a stream-wait on the remote copy
   event for p2p, or (for non-GDR, non-zero-copy receives) the CPU->GPU
   scatter copy — triggered asynchronously via the commsEnd_h flag when
   async is set.
   @param[in] scatterIndex Stream used for the scatter copy; -1 selects
   the per-direction default stream 2*dim+dir
   @return Non-zero when the comms test passed (or comms are disabled)
*/
inline bool commsComplete(cudaColorSpinorField &in, const DslashCuda &dslash, int dim, int dir,
                          bool gdr_send, bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex=-1) {

  using namespace dslash;

  cudaStream_t *stream = nullptr;

  PROFILE(int comms_test = dslash_comms ? in.commsQuery(dslash.Nface()/2, 2*dim+dir, dslash.Dagger(), stream, gdr_send, gdr_recv) : 1, profile, QUDA_PROFILE_COMMS_QUERY);
  if (comms_test) {
    // now we are receive centric
    int dir2 = 1-dir;

    // if peer-2-peer in a given direction then we need to insert a wait on that copy event
    if (comm_peer2peer_enabled(dir2,dim)) {
      PROFILE(qudaStreamWaitEvent(streams[Nstream-1], in.getIPCRemoteCopyEvent(dir2,dim), 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
    } else {

      if (!gdr_recv && !zero_copy_recv) { // Issue CPU->GPU copy if not GDR

        if (async) {
#if (CUDA_VERSION >= 8000)
          // this will trigger the copy asynchronously
          *((volatile cuuint32_t*)(commsEnd_h+2*dim+dir2)) = 1;
#else
          errorQuda("Async dslash policy variants require CUDA 8.0 and above");
#endif
        } else {
          // note the ColorSpinorField::scatter transforms from
          // scatter centric to gather centric (e.g., flips
          // direction) so here just use dir not dir2
          if (scatterIndex == -1) scatterIndex = 2*dim+dir;
          PROFILE(if (dslash_copy) in.scatter(dslash.Nface()/2, dslash.Dagger(), 2*dim+dir, streams+scatterIndex), profile, QUDA_PROFILE_SCATTER);
        }

      }

    }

  }
  return comms_test;
}
320 
/**
   Make the compute stream wait behind any outstanding peer-to-peer
   copy events so subsequent work cannot overtake the halo exchange.
*/
template <typename T>
inline void completeDslash(const ColorSpinorField &in, const T&dslashParam) {
  // this ensures that the p2p sending is completed before any
  // subsequent work is done on the compute stream
  for (int dim=3; dim>=0; dim--) {
    if (!dslashParam.commDim[dim]) continue;
    for (int dir=0; dir<2; dir++) {
      if (comm_peer2peer_enabled(dir,dim)) {
        PROFILE(qudaStreamWaitEvent(streams[Nstream-1], in.getIPCCopyEvent(dir,dim), 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
      }
    }
  }
}
343 
/**
   Toggle between the regular and the zero-copy (mapped) ghost buffers,
   patching the kernel's tune-key aux string so the two variants are
   tuned independently.  Calls must strictly alternate
   to_mapped=true / to_mapped=false; violations abort via errorQuda.
*/
inline void setMappedGhost(DslashCuda &dslash, cudaColorSpinorField &in, bool to_mapped) {

  static char aux_copy[TuneKey::aux_n]; // saved aux string, restored on unmap
  static bool set_mapped = false;       // guards against unbalanced calls

  if (to_mapped) {
    if (set_mapped) errorQuda("set_mapped already set");
    // in the below we switch to the mapped ghost buffer and update the tuneKey to reflect this
    in.bufferIndex += 2;
    strcpy(aux_copy,dslash.getAux(dslash.dslashParam.kernel_type));
    dslash.augmentAux(dslash.dslashParam.kernel_type, ",zero_copy");
    set_mapped = true;
  } else {
    if (!set_mapped) errorQuda("set_mapped not set");
    // reset to default
    dslash.setAux(dslash.dslashParam.kernel_type, aux_copy);
    in.bufferIndex -= 2;
    set_mapped = false;
  }
}
375 
377 
378  virtual void operator()(DslashCuda &dslash, cudaColorSpinorField* in,
379  const int volume, const int *faceVolumeCB, TimeProfile &profile) = 0;
380 
381  virtual ~DslashPolicyImp(){}
382  };
383 
388 
389  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
390 
391  using namespace dslash;
392  profile.TPSTART(QUDA_PROFILE_TOTAL);
393 
394  auto &dslashParam = dslash.dslashParam;
395  dslashParam.kernel_type = INTERIOR_KERNEL;
396  dslashParam.threads = volume;
397 
398  // Record the start of the dslash if doing communication in T and not kernel packing
399  if (dslashParam.commDim[3] && !getKernelPackT()) {
401  }
402 
403  issueRecv(*in, dslash, 0, false); // Prepost receives
404 
405  const int packIndex = Nstream-1;
406  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
407 
408  issueGather(*in, dslash);
409 
411  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
412 
413  DslashCommsPattern pattern(dslashParam.commDim);
414  while (pattern.completeSum < pattern.commDimTotal) {
415  for (int i=3; i>=0; i--) {
416  if (!dslashParam.commDim[i]) continue;
417 
418  for (int dir=1; dir>=0; dir--) {
419  // Query if gather has completed
420  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
421 
422  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
423  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
424 
425  if (cudaSuccess == event_test) {
426  pattern.gatherCompleted[2*i+dir] = 1;
427  pattern.completeSum++;
428  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
429  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
430  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
431  }
432  }
433 
434  // Query if comms has finished
435  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
436  if ( commsComplete(*in, dslash, i, dir, false, false, false, false) ) {
437  pattern.commsCompleted[2*i+dir] = 1;
438  pattern.completeSum++;
439  }
440  }
441 
442  } // dir=0,1
443 
444  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
445 
446  for (int dir=1; dir>=0; dir--) {
447  if (!comm_peer2peer_enabled(1-dir,i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
448  // Record the end of the scattering
450  // wait for scattering to finish and then launch dslash
452  }
453  }
454 
455  dslashParam.kernel_type = static_cast<KernelType>(i);
456  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
457 
458  // all faces use this stream
460 
461  pattern.dslashCompleted[2*i] = 1;
462  }
463  }
464  }
465 
466  completeDslash(*in,dslashParam);
467  in->bufferIndex = (1 - in->bufferIndex);
468  profile.TPSTOP(QUDA_PROFILE_TOTAL);
469  }
470 
471 };
472 
474 
475  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
476 #ifdef PTHREADS
477  using namespace dslash;
478  profile.TPSTART(QUDA_PROFILE_TOTAL);
479 
480  auto &dslashParam = dslash.dslashParam;
481  dslashParam.kernel_type = INTERIOR_KERNEL;
482  dslashParam.threads = volume;
483 
484 #ifdef MULTI_GPU
485  // Record the start of the dslash if doing communication in T and not kernel packing
486  {
488  profile, QUDA_PROFILE_EVENT_RECORD);
489  }
490 
491  // and launch the interior dslash kernel
492 
493  const int packIndex = Nstream-2;
494  //const int packIndex = Nstream-1;
495  pthread_t receiveThread, interiorThread;
496  ReceiveParam receiveParam;
497  receiveParam.profile = &profile;
498  receiveParam.nFace = (dslash.Nface() >> 1);
499  receiveParam.dagger = dslash.Dagger();
500 
501  if(pthread_create(&receiveThread, NULL, issueMPIReceive, &receiveParam)){
502  errorQuda("pthread_create failed");
503  }
504 
505  InteriorParam interiorParam;
506  interiorParam.dslash = &dslash;
507  interiorParam.profile = &profile;
508 
509  cudaGetDevice(&(interiorParam.current_device)); // get the current device number
510  if(pthread_create(&interiorThread, NULL, launchInteriorKernel, &interiorParam)){
511  errorQuda("pthread_create failed");
512  }
513 
514  bool pack = false;
515  for (int i=3; i>=0; i--)
516  if (dslashParam.commDim[i] && (i!=3 || getKernelPackT()))
517  { pack = true; break; }
518 
519  if (pack){
520  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0),
522  }
523 
524  // Initialize pack from source spinor
525  MemoryLocation pack_dest[2*QUDA_MAX_DIM];
526  for (int i=0; i<2*QUDA_MAX_DIM; i++) pack_dest[i] = Device;
527  PROFILE(if (dslash_pack_compute) in->pack(dslash.Nface()/2, 1-dslashParam.parity, dslash.Dagger(), packIndex, pack_dest, twist_a, twist_b),
528  profile, QUDA_PROFILE_PACK_KERNEL);
529 
530  if (pack) {
531  // Record the end of the packing
532  PROFILE(qudaEventRecord(packEnd[in->bufferIndex], streams[packIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
533  }
534  for(int i = 3; i >=0; i--){
535  if (!dslashParam.commDim[i]) continue;
536 
537  for (int dir=1; dir>=0; dir--) {
538  cudaEvent_t &event = (i!=3 || getKernelPackT()) ? packEnd[in->bufferIndex] : dslashStart[in->bufferIndex];
539 
542 
543  // Initialize host transfer from source spinor
544  PROFILE(if (dslash_copy) in->gather(dslash.Nface()/2, dslash.Dagger(), 2*i+dir), profile, QUDA_PROFILE_GATHER);
545 
546  // Record the end of the gathering
547  PROFILE(qudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]),
548  profile, QUDA_PROFILE_EVENT_RECORD);
549  }
550  }
551 
552 #endif // MULTI_GPU
553 
554 #if (!defined MULTI_GPU)
556  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
557 #endif
558 
559 #ifdef MULTI_GPU
560  if(pthread_join(receiveThread, NULL)) errorQuda("pthread_join failed");
561  bool interiorLaunched = false;
562  DslashCommsPattern pattern(dslashParam.commDim);
563  while (pattern.completeSum < pattern.commDimTotal) {
564  for (int i=3; i>=0; i--) {
565  if (!dslashParam.commDim[i]) continue;
566 
567  for (int dir=1; dir>=0; dir--) {
568 
569  // Query if gather has completed
570  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
571  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
572  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
573 
574  if (cudaSuccess == event_test) {
575  pattern.gatherCompleted[2*i+dir] = 1;
576  pattern.completeSum++;
577  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
578  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
579  if (dslash_comms) ? in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
580  }
581  }
582 
583  // Query if comms has finished
584  if(!pattern.commsCompleted[2*i+dir] && pattern.commsCompleted[pattern.previousDir[2*i+dir]] &&
585  pattern.gatherCompleted[2*i+dir]) {
586  PROFILE(int comms_test = dslash_comms ? in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()) : 1,
587  profile, QUDA_PROFILE_COMMS_QUERY);
588  if (comms_test) {
589  pattern.commsCompleted[2*i+dir] = 1;
590  pattern.completeSum++;
591 
592  // Scatter into the end zone
593  PROFILE(if (dslash_copy) in->scatter(dslash.Nface()/2, dslash.Dagger(), 2*i+dir), profile, QUDA_PROFILE_SCATTER);
594  }
595  }
596 
597  } // dir=0,1
598 
599  // enqueue the boundary dslash kernel as soon as the scatters have been enqueued
600  if (!pattern.dslashCompleted[2*i] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
601  // Record the end of the scattering
603  profile, QUDA_PROFILE_EVENT_RECORD);
604 
605  if(!interiorLaunched){
606  if(pthread_join(interiorThread, NULL)) errorQuda("pthread_join failed");
607  interiorLaunched = true;
608  }
609 
610  // wait for scattering to finish and then launch dslash
613 
614  dslashParam.kernel_type = static_cast<KernelType>(i);
615  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
616  // all faces use this stream
618 
619  pattern.dslashCompleted[2*i] = 1;
620  }
621 
622  }
623 
624  }
625 
626  completeDslash(*in,dslashParam);
627  in->bufferIndex = (1 - in->bufferIndex);
628 #endif // MULTI_GPU
629  profile.TPSTOP(QUDA_PROFILE_TOTAL);
630 #else // !PTHREADS
631  errorQuda("Pthreads has not been built\n");
632 #endif
633  }
634 };
635 
640 
641  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
642 
643  using namespace dslash;
644  profile.TPSTART(QUDA_PROFILE_TOTAL);
645 
646  auto &dslashParam = dslash.dslashParam;
647  dslashParam.kernel_type = INTERIOR_KERNEL;
648  dslashParam.threads = volume;
649 
650  // Record the start of the dslash if doing communication in T and not kernel packing
651  if (dslashParam.commDim[3] && !getKernelPackT()) {
653  }
654 
655  issueRecv(*in, dslash, 0, false); // Prepost receives
656 
657  const int packIndex = Nstream-1;
658  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
659 
660  issueGather(*in, dslash);
661 
663  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
664 
665  const int scatterIndex = getStreamIndex(dslashParam);
666  DslashCommsPattern pattern(dslashParam.commDim);
667  while (pattern.completeSum < pattern.commDimTotal) {
668  for (int i=3; i>=0; i--) {
669  if (!dslashParam.commDim[i]) continue;
670 
671  for (int dir=1; dir>=0; dir--) {
672  // Query if gather has completed
673  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
674  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
675  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
676 
677  if (cudaSuccess == event_test) {
678  pattern.gatherCompleted[2*i+dir] = 1;
679  pattern.completeSum++;
680  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
681  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
682  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
683  }
684  }
685 
686  // Query if comms has finished
687  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
688  if ( commsComplete(*in, dslash, i, dir, false, false, false, false, scatterIndex) ) {
689  pattern.commsCompleted[2*i+dir] = 1;
690  pattern.completeSum++;
691  }
692  }
693  } // dir=0,1
694  } // i
695  } // while(pattern.completeSum < commDimTotal)
696 
697  for (int i=3; i>=0; i--) {
698  if (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) { // if not peer-to-peer we post an event in the scatter stream and wait on that
701  break;
702  }
703  }
704 
705  // Launch exterior kernel
706  if (pattern.commDimTotal) {
707  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
709  }
710 
711  completeDslash(*in,dslashParam);
712  in->bufferIndex = (1 - in->bufferIndex);
713  profile.TPSTOP(QUDA_PROFILE_TOTAL);
714  }
715 
716 };
717 
722 
723  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
724 
725  using namespace dslash;
726  profile.TPSTART(QUDA_PROFILE_TOTAL);
727 
728  auto &dslashParam = dslash.dslashParam;
729  dslashParam.kernel_type = INTERIOR_KERNEL;
730  dslashParam.threads = volume;
731 
732  issueRecv(*in, dslash, 0, true); // Prepost receives
733 
734  const int packIndex = Nstream-1;
735  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
736 
738  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
739 
740  bool pack_event = false;
741  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
742  for (int i=3; i>=0; i--) {
743  if (!dslashParam.commDim[i]) continue;
744 
745  if (!pack_event) {
746  cudaEventSynchronize(packEnd[in->bufferIndex]);
747  pack_event = true;
748  }
749 
750  for (int dir=1; dir>=0; dir--) {
751  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
752  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
753  true, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
754  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, true, true); // do a comms query to ensure MPI has begun
755  } // is p2p?
756  } // dir
757  } // i
758  } // p2p
759 
760  DslashCommsPattern pattern(dslashParam.commDim, true);
761  while (pattern.completeSum < pattern.commDimTotal) {
762  for (int i=3; i>=0; i--) {
763  if (!dslashParam.commDim[i]) continue;
764 
765  for (int dir=1; dir>=0; dir--) {
766 
767  // Query if comms has finished
768  if (!pattern.commsCompleted[2*i+dir]) {
769  if ( commsComplete(*in, dslash, i, dir, true, true, false, false) ) {;
770  pattern.commsCompleted[2*i+dir] = 1;
771  pattern.completeSum++;
772  }
773  }
774 
775  } // dir=0,1
776 
777  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
778  dslashParam.kernel_type = static_cast<KernelType>(i);
779  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
780 
781  // all faces use this stream
783 
784  pattern.dslashCompleted[2*i] = 1;
785  }
786  }
787  }
788 
789  completeDslash(*in,dslashParam);
790  in->bufferIndex = (1 - in->bufferIndex);
791  profile.TPSTOP(QUDA_PROFILE_TOTAL);
792  }
793 };
794 
795 
800 
801  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
802 
803  using namespace dslash;
804  profile.TPSTART(QUDA_PROFILE_TOTAL);
805 
806  auto &dslashParam = dslash.dslashParam;
807  dslashParam.kernel_type = INTERIOR_KERNEL;
808  dslashParam.threads = volume;
809 
810  issueRecv(*in, dslash, 0, true); // Prepost receives
811 
812  const int packIndex = Nstream-1;
813  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
814 
816  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
817 
818  bool pack_event = false;
819  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
820  for (int i=3; i>=0; i--) {
821  if (!dslashParam.commDim[i]) continue;
822 
823  if (!pack_event) {
824  cudaEventSynchronize(packEnd[in->bufferIndex]);
825  pack_event = true;
826  }
827 
828  for (int dir=1; dir>=0; dir--) {
829  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
830  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
831  true, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
832  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, true, true); // do a comms query to ensure MPI has begun
833  } // is p2p?
834  }
835  }
836  } // p2p
837 
838  DslashCommsPattern pattern(dslashParam.commDim, true);
839  while (pattern.completeSum < pattern.commDimTotal) {
840  for (int i=3; i>=0; i--) {
841  if (!dslashParam.commDim[i]) continue;
842 
843  for (int dir=1; dir>=0; dir--) {
844 
845  // Query if comms has finished
846  if (!pattern.commsCompleted[2*i+dir]) {
847  if ( commsComplete(*in, dslash, i, dir, true, true, false, false) ) {
848  pattern.commsCompleted[2*i+dir] = 1;
849  pattern.completeSum++;
850  }
851  }
852  } // dir=0,1
853  } // i
854  } // pattern.completeSum < pattern.CommDimTotal
855 
856  // Launch exterior kernel
857  if (pattern.commDimTotal) {
858  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
860  }
861 
862  completeDslash(*in,dslashParam);
863  in->bufferIndex = (1 - in->bufferIndex);
864  profile.TPSTOP(QUDA_PROFILE_TOTAL);
865  }
866 };
867 
872 
873  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
874 
875  using namespace dslash;
876  profile.TPSTART(QUDA_PROFILE_TOTAL);
877 
878  auto &dslashParam = dslash.dslashParam;
879  dslashParam.kernel_type = INTERIOR_KERNEL;
880  dslashParam.threads = volume;
881 
882  // Record the start of the dslash if doing communication in T and not kernel packing
883  if (dslashParam.commDim[3] && !getKernelPackT()) {
885  }
886 
887  issueRecv(*in, dslash, 0, true); // Prepost receives
888 
889  const int packIndex = Nstream-1;
890  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
891 
892  issueGather(*in, dslash);
893 
895  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
896 
897  DslashCommsPattern pattern(dslashParam.commDim);
898  while (pattern.completeSum < pattern.commDimTotal) {
899  for (int i=3; i>=0; i--) {
900  if (!dslashParam.commDim[i]) continue;
901 
902  for (int dir=1; dir>=0; dir--) {
903  // Query if gather has completed
904  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
905  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
906  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
907 
908  if (cudaSuccess == event_test) {
909  pattern.gatherCompleted[2*i+dir] = 1;
910  pattern.completeSum++;
911  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
912  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
913  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, false, true); // do a comms query to ensure MPI has begun
914  }
915  }
916 
917  // Query if comms has finished
918  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
919  if ( commsComplete(*in, dslash, i, dir, false, true, false, false) ) {
920  pattern.commsCompleted[2*i+dir] = 1;
921  pattern.completeSum++;
922  }
923  }
924 
925  } // dir=0,1
926 
927  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
928  dslashParam.kernel_type = static_cast<KernelType>(i);
929  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
930 
931  // all faces use this stream
933 
934  pattern.dslashCompleted[2*i] = 1;
935  }
936  }
937  }
938 
939  completeDslash(*in,dslashParam);
940  in->bufferIndex = (1 - in->bufferIndex);
941  profile.TPSTOP(QUDA_PROFILE_TOTAL);
942  }
943 
944 };
945 
950 
951  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
952 
953  using namespace dslash;
954  profile.TPSTART(QUDA_PROFILE_TOTAL);
955 
956  auto &dslashParam = dslash.dslashParam;
957  dslashParam.kernel_type = INTERIOR_KERNEL;
958  dslashParam.threads = volume;
959 
960  // Record the start of the dslash if doing communication in T and not kernel packing
961  if (dslashParam.commDim[3] && !getKernelPackT()) {
963  }
964 
965  issueRecv(*in, dslash, 0, true); // Prepost receives
966 
967  const int packIndex = Nstream-1;
968  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
969 
970  issueGather(*in, dslash);
971 
973  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
974 
975  DslashCommsPattern pattern(dslashParam.commDim);
976  while (pattern.completeSum < pattern.commDimTotal) {
977  for (int i=3; i>=0; i--) {
978  if (!dslashParam.commDim[i]) continue;
979 
980  for (int dir=1; dir>=0; dir--) {
981  // Query if gather has completed
982  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
983  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
984  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
985 
986  if (cudaSuccess == event_test) {
987  pattern.gatherCompleted[2*i+dir] = 1;
988  pattern.completeSum++;
989  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
990  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
991  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, false, true); // do a comms query to ensure MPI has begun
992  }
993  }
994 
995  // Query if comms has finished
996  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
997  if ( commsComplete(*in, dslash, i, dir, false, true, false, false) ) {
998  pattern.commsCompleted[2*i+dir] = 1;
999  pattern.completeSum++;
1000  }
1001  }
1002  } // dir=0,1
1003  } // i
1004  } // while(pattern.completeSum < commDimTotal)
1005 
1006  // Launch exterior kernel
1007  if (pattern.commDimTotal) {
1008  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
1010  }
1011 
1012  completeDslash(*in,dslashParam);
1013  in->bufferIndex = (1 - in->bufferIndex);
1014  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1015  }
1016 
1017 };
1018 
// In HOST_DEBUG builds, wrap a CUDA driver-API call (returning CUresult)
// and report any failure with the stringized call site, line and file.
// Errors are reported to stderr but are not fatal.  In non-debug builds
// the call is issued unchecked.
#ifdef HOST_DEBUG
#define CUDA_CALL( call )                                               \
  {                                                                     \
    CUresult cudaStatus = call;                                         \
    if ( CUDA_SUCCESS != cudaStatus ) {                                 \
      const char *err_str = nullptr;                                    \
      cuGetErrorString(cudaStatus, &err_str);                           \
      fprintf(stderr, "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", #call, __LINE__, __FILE__, err_str, cudaStatus); \
    }                                                                   \
  }
#else
#define CUDA_CALL( call ) call
#endif
1032 
1037 
1038 #if (CUDA_VERSION >= 8000)
1039 
1040  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1041 
1042  using namespace dslash;
1043  profile.TPSTART(QUDA_PROFILE_TOTAL);
1044 
1045  auto &dslashParam = dslash.dslashParam;
1046  dslashParam.kernel_type = INTERIOR_KERNEL;
1047  dslashParam.threads = volume;
1048 
1049  // Record the start of the dslash if doing communication in T and not kernel packing
1050  if (dslashParam.commDim[3] && !getKernelPackT()) {
1052  }
1053 
1054  issueRecv(*in, dslash, 0, false); // Prepost receives
1055 
1056  const int packIndex = Nstream-1;
1057  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
1058 
1059  issueGather(*in, dslash);
1060 
1062  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1063 
1064  DslashCommsPattern pattern(dslashParam.commDim);
1065  while (pattern.completeSum < pattern.commDimTotal) {
1066  for (int i=3; i>=0; i--) {
1067  if (!dslashParam.commDim[i]) continue;
1068 
1069  for (int dir=1; dir>=0; dir--) {
1070  // Query if gather has completed
1071  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
1072  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
1073  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
1074 
1075  if (cudaSuccess == event_test) {
1076  pattern.gatherCompleted[2*i+dir] = 1;
1077  pattern.completeSum++;
1078  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1079  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1080  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1081 
1082  // schedule post comms work (scatter into the end zone)
1083  if (!comm_peer2peer_enabled(1-dir,i)) {
1084  *((volatile cuuint32_t*)(commsEnd_h+2*i+1-dir)) = 0;
1085  CUDA_CALL(cuStreamWaitValue32( streams[2*i+dir], commsEnd_d[2*i+1-dir], 1, CU_STREAM_WAIT_VALUE_EQ ));
1086  PROFILE(if (dslash_copy) in->scatter(dslash.Nface()/2, dslash.Dagger(), 2*i+dir, &streams[2*i+dir]), profile, QUDA_PROFILE_SCATTER);
1087  }
1088  }
1089  }
1090 
1091  // Query if comms has finished
1092  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
1093  if ( commsComplete(*in, dslash, i, dir, false, false, false, true) ) {
1094  pattern.commsCompleted[2*i+dir] = 1;
1095  pattern.completeSum++;
1096  }
1097  }
1098 
1099  } // dir=0,1
1100 
1101  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
1102 
1103  for (int dir=1; dir>=0; dir--) {
1104  if (!comm_peer2peer_enabled(1-dir,i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
1105  // Record the end of the scattering
1107  // wait for scattering to finish and then launch dslash
1109  }
1110  }
1111 
1112  dslashParam.kernel_type = static_cast<KernelType>(i);
1113  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
1114 
1115  // all faces use this stream
1117 
1118  pattern.dslashCompleted[2*i] = 1;
1119  }
1120 
1121  }
1122 
1123  }
1124 
1125  completeDslash(*in,dslashParam);
1126  in->bufferIndex = (1 - in->bufferIndex);
1127  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1128  }
1129 #else
1130 
  // Fallback stub compiled when CUDA_VERSION < 8000: the async policy's real
  // implementation uses driver-level stream memory operations (see the
  // cuStreamWaitValue32 usage in the CUDA>=8.0 branch), so on older toolkits
  // selecting this policy simply aborts with an error.
1131  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1132  errorQuda("Async dslash policy variants require CUDA 8.0 and above");
1133  }
1134 
1135 #endif // CUDA_VERSION >= 8000
1136 
1137 };
1138 
1139 
1145 
1146 #if (CUDA_VERSION >= 8000)
1147 
1148  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1149 
1150  using namespace dslash;
1151  profile.TPSTART(QUDA_PROFILE_TOTAL);
1152 
1153  auto &dslashParam = dslash.dslashParam;
1154  dslashParam.kernel_type = INTERIOR_KERNEL;
1155  dslashParam.threads = volume;
1156 
1157  // Record the start of the dslash if doing communication in T and not kernel packing
1158  if (dslashParam.commDim[3] && !getKernelPackT()) {
1160  }
1161 
1162  issueRecv(*in, dslash, 0, false); // Prepost receives
1163 
1164  const int packIndex = Nstream-1;
1165  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Device | (Remote*dslashParam.remote_write) ), packIndex);
1166 
1167  issueGather(*in, dslash);
1168 
1170  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1171 
1172  const int scatterIndex = getStreamIndex(dslashParam);
1173  DslashCommsPattern pattern(dslashParam.commDim);
1174  while (pattern.completeSum < pattern.commDimTotal) {
1175  for (int i=3; i>=0; i--) {
1176  if (!dslashParam.commDim[i]) continue;
1177 
1178  for (int dir=1; dir>=0; dir--) {
1179 
1180  // Query if gather has completed
1181  if (!pattern.gatherCompleted[2*i+dir] && pattern.gatherCompleted[pattern.previousDir[2*i+dir]]) {
1182  cudaError_t event_test = comm_peer2peer_enabled(dir,i) ? cudaSuccess : cudaErrorNotReady;
1183  if (event_test != cudaSuccess) PROFILE(event_test = qudaEventQuery(gatherEnd[2*i+dir]), profile, QUDA_PROFILE_EVENT_QUERY);
1184 
1185  if (cudaSuccess == event_test) {
1186  pattern.gatherCompleted[2*i+dir] = 1;
1187  pattern.completeSum++;
1188  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1189  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1190  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1191 
1192  // schedule post comms work (scatter into the end zone)
1193  if (!comm_peer2peer_enabled(1-dir,i)) { // gather centric
1194  *((volatile cuuint32_t*)(commsEnd_h+2*i+1-dir)) = 0;
1195  CUDA_CALL(cuStreamWaitValue32( streams[scatterIndex], commsEnd_d[2*i+1-dir], 1, CU_STREAM_WAIT_VALUE_EQ ));
1196  PROFILE(if (dslash_copy) in->scatter(dslash.Nface()/2, dslash.Dagger(), 2*i+dir, streams+scatterIndex), profile, QUDA_PROFILE_SCATTER);
1197  }
1198  }
1199 
1200  }
1201 
1202  // Query if comms has finished
1203  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
1204  if ( commsComplete(*in, dslash, i, dir, false, false, false, true, scatterIndex) ) {
1205  pattern.commsCompleted[2*i+dir] = 1;
1206  pattern.completeSum++;
1207  }
1208  }
1209 
1210  } // dir=0,1
1211  } // i
1212  } // while(pattern.completeSum < commDimTotal)
1213 
1214  for (int i=3; i>=0; i--) {
1215  if (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) {
1216  // if not peer-to-peer we post an event in the scatter stream and wait on that
1217  PROFILE(qudaEventRecord(scatterEnd[0], streams[scatterIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
1219  break;
1220  }
1221  }
1222 
1223  if (pattern.commDimTotal) {
1224  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
1226  }
1227 
1228  completeDslash(*in,dslashParam);
1229  in->bufferIndex = (1 - in->bufferIndex);
1230  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1231  }
1232 
1233 #else
1234 
  // Fallback stub compiled when CUDA_VERSION < 8000: the fused-exterior async
  // policy depends on driver stream memory operations (cuStreamWaitValue32 in
  // the CUDA>=8.0 branch), so on older toolkits it aborts with an error.
1235  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1236  errorQuda("Async dslash policy variants require CUDA 8.0 and above");
1237  }
1238 
1239 #endif // CUDA_VERSION >= 8000
1240 
1241 };
1242 
1243 
1249 
1250  void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1251 
1252  using namespace dslash;
1253  profile.TPSTART(QUDA_PROFILE_TOTAL);
1254 
1255  auto &dslashParam = dslash.dslashParam;
1256  dslashParam.kernel_type = INTERIOR_KERNEL;
1257  dslashParam.threads = volume;
1258 
1259  // record start of the dslash
1261 
1262  issueRecv(*in, dslash, 0, false); // Prepost receives
1263 
1264  const int packIndex = getStreamIndex(dslashParam);
1265  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1266  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packIndex);
1267 
1269  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1270 
1271  for (int i=3; i>=0; i--) { // only synchronize if we need to
1272  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1273  qudaStreamSynchronize(streams[packIndex]);
1274  break;
1275  }
1276  }
1277 
1278  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1279  for (int i=3; i>=0; i--) {
1280  if (!dslashParam.commDim[i]) continue;
1281 
1282  for (int dir=1; dir>=0; dir--) {
1283  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1284  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1285  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1286  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1287  } // is p2p?
1288  } // dir
1289  } // i
1290  } // p2p
1291 
1292  DslashCommsPattern pattern(dslashParam.commDim, true);
1293  while (pattern.completeSum < pattern.commDimTotal) {
1294 
1295  for (int i=3; i>=0; i--) {
1296  if (!dslashParam.commDim[i]) continue;
1297 
1298  for (int dir=1; dir>=0; dir--) {
1299 
1300  // Query if comms have finished
1301  if (!pattern.commsCompleted[2*i+dir]) {
1302  if ( commsComplete(*in, dslash, i, dir, false, false, false, false) ) {
1303  pattern.commsCompleted[2*i+dir] = 1;
1304  pattern.completeSum++;
1305  }
1306  }
1307 
1308  }
1309 
1310  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
1311  for (int dir=1; dir>=0; dir--) {
1312  if (!comm_peer2peer_enabled(1-dir,i)) { // if not peer-to-peer we post an event in the scatter stream and wait on that
1313  // Record the end of the scattering
1315  // wait for scattering to finish and then launch dslash
1317  }
1318  }
1319 
1320  dslashParam.kernel_type = static_cast<KernelType>(i);
1321  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
1322 
1323  // all faces use this stream
1325 
1326  pattern.dslashCompleted[2*i] = 1;
1327  }
1328  }
1329  }
1330 
1331  completeDslash(*in,dslashParam);
1332  in->bufferIndex = (1 - in->bufferIndex);
1333  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1334  }
1335 };
1336 
1337 
1343 
1344  void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1345 
1346  using namespace dslash;
1347  profile.TPSTART(QUDA_PROFILE_TOTAL);
1348 
1349  auto &dslashParam = dslash.dslashParam;
1350  dslashParam.kernel_type = INTERIOR_KERNEL;
1351  dslashParam.threads = volume;
1352 
1353  // record start of the dslash
1355 
1356  const int packScatterIndex = getStreamIndex(dslashParam);
1357  PROFILE(qudaStreamWaitEvent(streams[packScatterIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1358  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packScatterIndex);
1359 
1360  issueRecv(*in, dslash, 0, false); // Prepost receives
1361 
1363  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1364 
1365  for (int i=3; i>=0; i--) { // only synchronize if we need to
1366  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1367  qudaStreamSynchronize(streams[packScatterIndex]);
1368  break;
1369  }
1370  }
1371 
1372  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1373  for (int i=3; i>=0; i--) {
1374  if (!dslashParam.commDim[i]) continue;
1375 
1376  for (int dir=1; dir>=0; dir--) {
1377  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1378  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packScatterIndex : nullptr,
1379  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1380  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1381  } // is p2p?
1382  } // dir
1383  } // i
1384  } // p2p
1385 
1386  DslashCommsPattern pattern(dslashParam.commDim, true);
1387  while (pattern.completeSum < pattern.commDimTotal) {
1388 
1389  for (int i=3; i>=0; i--) {
1390  if (!dslashParam.commDim[i]) continue;
1391 
1392  for (int dir=1; dir>=0; dir--) {
1393 
1394  // Query if comms has finished
1395  if (!pattern.commsCompleted[2*i+dir]) {
1396  if ( commsComplete(*in, dslash, i, dir, false, false, false, false, packScatterIndex) ) {
1397  pattern.commsCompleted[2*i+dir] = 1;
1398  pattern.completeSum++;
1399  }
1400  }
1401 
1402  } // dir=0,1
1403  } // i
1404  } // pattern.completeSum
1405 
1406  for (int i=3; i>=0; i--) {
1407  if (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) {
1408  // if not peer-to-peer we post an event in the scatter stream and wait on that
1409  PROFILE(qudaEventRecord(scatterEnd[0], streams[packScatterIndex]), profile, QUDA_PROFILE_EVENT_RECORD);
1411  break;
1412  }
1413  }
1414 
1415  // Launch exterior kernel
1416  if (pattern.commDimTotal) {
1417  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
1419  }
1420 
1421  completeDslash(*in,dslashParam);
1422  in->bufferIndex = (1 - in->bufferIndex);
1423  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1424  }
1425 };
1426 
1431 
1432  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1433 
1434  using namespace dslash;
1435  profile.TPSTART(QUDA_PROFILE_TOTAL);
1436 
1437  auto &dslashParam = dslash.dslashParam;
1438  dslashParam.kernel_type = INTERIOR_KERNEL;
1439  dslashParam.threads = volume;
1440 
1441  // record start of the dslash
1443 
1444  issueRecv(*in, dslash, 0, true); // Prepost receives
1445 
1446  const int packIndex = getStreamIndex(dslashParam);
1447  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1448  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packIndex);
1449 
1451  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1452 
1453  for (int i=3; i>=0; i--) { // only synchronize if we need to
1454  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1455  qudaStreamSynchronize(streams[packIndex]);
1456  break;
1457  }
1458  }
1459 
1460  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1461  for (int i=3; i>=0; i--) {
1462  if (!dslashParam.commDim[i]) continue;
1463 
1464  for (int dir=1; dir>=0; dir--) {
1465  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1466  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1467  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1468  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, false, true); // do a comms query to ensure MPI has begun
1469  } // is p2p?
1470  } // dir
1471  } // i
1472  } // p2p
1473 
1474  DslashCommsPattern pattern(dslashParam.commDim, true);
1475  while (pattern.completeSum < pattern.commDimTotal) {
1476 
1477  for (int i=3; i>=0; i--) {
1478  if (!dslashParam.commDim[i]) continue;
1479 
1480  for (int dir=1; dir>=0; dir--) {
1481 
1482  // Query if comms has finished
1483  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
1484  if ( commsComplete(*in, dslash, i, dir, false, true, false, false) ) {
1485  pattern.commsCompleted[2*i+dir] = 1;
1486  pattern.completeSum++;
1487  }
1488  }
1489 
1490  } // dir=0,1
1491 
1492  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] && pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
1493  dslashParam.kernel_type = static_cast<KernelType>(i);
1494  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
1495 
1496  // all faces use this stream
1498 
1499  pattern.dslashCompleted[2*i] = 1;
1500  }
1501  }
1502  }
1503 
1504  completeDslash(*in,dslashParam);
1505  in->bufferIndex = (1 - in->bufferIndex);
1506  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1507  }
1508 
1509 };
1510 
1516 
1517  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1518 
1519  using namespace dslash;
1520  profile.TPSTART(QUDA_PROFILE_TOTAL);
1521 
1522  auto &dslashParam = dslash.dslashParam;
1523  dslashParam.kernel_type = INTERIOR_KERNEL;
1524  dslashParam.threads = volume;
1525 
1526  // record start of the dslash
1528 
1529  const int packIndex = getStreamIndex(dslashParam);
1530  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1531  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packIndex);
1532 
1533  issueRecv(*in, dslash, 0, true); // Prepost receives
1534 
1536  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1537 
1538  for (int i=3; i>=0; i--) { // only synchronize if we need to
1539  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1540  qudaStreamSynchronize(streams[packIndex]);
1541  break;
1542  }
1543  }
1544 
1545  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1546  for (int i=3; i>=0; i--) {
1547  if (!dslashParam.commDim[i]) continue;
1548 
1549  for (int dir=1; dir>=0; dir--) {
1550  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1551  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1552  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1553  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), 0, false, true); // do a comms query to ensure MPI has begun
1554  } // is p2p?
1555  } // dir
1556  } // i
1557  } // p2p
1558 
1559  DslashCommsPattern pattern(dslashParam.commDim, true);
1560  while (pattern.completeSum < pattern.commDimTotal) {
1561 
1562  for (int i=3; i>=0; i--) {
1563  if (!dslashParam.commDim[i]) continue;
1564 
1565  for (int dir=1; dir>=0; dir--) {
1566 
1567  // Query if comms has finished
1568  if (!pattern.commsCompleted[2*i+dir] && pattern.gatherCompleted[2*i+dir]) {
1569  if ( commsComplete(*in, dslash, i, dir, false, true, false, false) ) {
1570  pattern.commsCompleted[2*i+dir] = 1;
1571  pattern.completeSum++;
1572  }
1573  }
1574  } // dir=0,1
1575  } // i
1576  } // while(pattern.completeSum < commDimTotal)
1577 
1578  // Launch exterior kernel
1579  if (pattern.commDimTotal) {
1580  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
1582  }
1583 
1584  completeDslash(*in,dslashParam);
1585  in->bufferIndex = (1 - in->bufferIndex);
1586  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1587  }
1588 
1589 };
1590 
1596 
1597  void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1598 
1599  using namespace dslash;
1600  profile.TPSTART(QUDA_PROFILE_TOTAL);
1601 
1602  auto &dslashParam = dslash.dslashParam;
1603  dslashParam.kernel_type = INTERIOR_KERNEL;
1604  dslashParam.threads = volume;
1605 
1606  // record start of the dslash
1608 
1609  issueRecv(*in, dslash, 0, false); // Prepost receives
1610 
1611  const int packIndex = getStreamIndex(dslashParam);
1612  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1613  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packIndex);
1614 
1616  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1617 
1618  for (int i=3; i>=0; i--) { // only synchronize if we need to
1619  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1620  qudaStreamSynchronize(streams[packIndex]);
1621  break;
1622  }
1623  }
1624 
1625  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1626  for (int i=3; i>=0; i--) {
1627  if (!dslashParam.commDim[i]) continue;
1628 
1629  for (int dir=1; dir>=0; dir--) {
1630  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1631  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1632  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1633  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1634  } // is p2p?
1635  } // dir
1636  } // i
1637  } // p2p
1638 
1639  DslashCommsPattern pattern(dslashParam.commDim, true);
1640  while (pattern.completeSum < pattern.commDimTotal) {
1641 
1642  for (int i=3; i>=0; i--) {
1643  if (!dslashParam.commDim[i]) continue;
1644 
1645  for (int dir=1; dir>=0; dir--) {
1646 
1647  // Query if comms have finished
1648  if (!pattern.commsCompleted[2*i+dir]) {
1649  if ( commsComplete(*in, dslash, i, dir, false, false, true, false) ) {
1650  pattern.commsCompleted[2*i+dir] = 1;
1651  pattern.completeSum++;
1652  }
1653  }
1654 
1655  }
1656 
1657  // enqueue the boundary dslash kernel as soon as the scatters have been enqueued
1658  if ( !pattern.dslashCompleted[2*i] && pattern.dslashCompleted[pattern.previousDir[2*i+1]] &&
1659  pattern.commsCompleted[2*i] && pattern.commsCompleted[2*i+1] ) {
1660  dslashParam.kernel_type = static_cast<KernelType>(i);
1661  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
1662 
1663  setMappedGhost(dslash, *in, true);
1665  setMappedGhost(dslash, *in, false);
1666 
1667  pattern.dslashCompleted[2*i] = 1;
1668  }
1669  }
1670  }
1671 
1672  in->bufferIndex = (1 - in->bufferIndex);
1673  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1674  }
1675 };
1676 
1677 
1683 
1684  void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1685 
1686  using namespace dslash;
1687  profile.TPSTART(QUDA_PROFILE_TOTAL);
1688 
1689  auto &dslashParam = dslash.dslashParam;
1690  dslashParam.kernel_type = INTERIOR_KERNEL;
1691  dslashParam.threads = volume;
1692 
1693  // record start of the dslash
1695 
1696  issueRecv(*in, dslash, 0, false); // Prepost receives
1697 
1698  const int packIndex = getStreamIndex(dslashParam);
1699  PROFILE(qudaStreamWaitEvent(streams[packIndex], dslashStart[in->bufferIndex], 0), profile, QUDA_PROFILE_STREAM_WAIT_EVENT);
1700  issuePack(*in, dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(Host | (Remote*dslashParam.remote_write) ), packIndex);
1701 
1703  if (aux_worker) aux_worker->apply(streams[Nstream-1]);
1704 
1705  for (int i=3; i>=0; i--) { // only synchronize if we need to
1706  if ( !dslashParam.remote_write || (dslashParam.commDim[i] && (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i))) ) {
1707  qudaStreamSynchronize(streams[packIndex]);
1708  break;
1709  }
1710  }
1711 
1712  for (int p2p=0; p2p<2; p2p++) { // schedule non-p2p traffic first, then do p2p
1713  for (int i=3; i>=0; i--) {
1714  if (!dslashParam.commDim[i]) continue;
1715 
1716  for (int dir=1; dir>=0; dir--) {
1717  if ( (comm_peer2peer_enabled(dir,i) + p2p) % 2 == 0 ) {
1718  PROFILE(if (dslash_comms) in->sendStart(dslash.Nface()/2, 2*i+dir, dslash.Dagger(), dslashParam.remote_write ? streams+packIndex : nullptr,
1719  false, dslashParam.remote_write), profile, QUDA_PROFILE_COMMS_START);
1720  if (dslash_comms) in->commsQuery(dslash.Nface()/2, 2*i+dir, dslash.Dagger()); // do a comms query to ensure MPI has begun
1721  } // is p2p?
1722  } // dir
1723  } // i
1724  } // p2p
1725 
1726  DslashCommsPattern pattern(dslashParam.commDim, true);
1727  while (pattern.completeSum < pattern.commDimTotal) {
1728 
1729  for (int i=3; i>=0; i--) {
1730  if (!dslashParam.commDim[i]) continue;
1731 
1732  for (int dir=1; dir>=0; dir--) {
1733 
1734  // Query if comms have finished
1735  if (!pattern.commsCompleted[2*i+dir]) {
1736  if ( commsComplete(*in, dslash, i, dir, false, false, true, false) ) {
1737  pattern.commsCompleted[2*i+dir] = 1;
1738  pattern.completeSum++;
1739  }
1740  }
1741 
1742  }
1743 
1744  }
1745 
1746  }
1747 
1748  if (pattern.commDimTotal) {
1749  setFusedParam(dslashParam,dslash,faceVolumeCB); // setup for exterior kernel
1750  setMappedGhost(dslash, *in, true);
1752  setMappedGhost(dslash, *in, false);
1753  }
1754 
1755  completeDslash(*in,dslashParam);
1756  in->bufferIndex = (1 - in->bufferIndex);
1757  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1758  }
1759 };
1760 
1761 
1763 
1764  void operator()(DslashCuda &dslash, cudaColorSpinorField* in, const int volume, const int *faceVolumeCB, TimeProfile &profile) {
1765 
1766  profile.TPSTART(QUDA_PROFILE_TOTAL);
1767 
1768  auto &dslashParam = dslash.dslashParam;
1769  dslashParam.kernel_type = INTERIOR_KERNEL;
1770  dslashParam.threads = volume;
1771 
1773 
1774  profile.TPSTOP(QUDA_PROFILE_TOTAL);
1775  }
1776 
1777 };
1778 
1779  enum class QudaDslashPolicy {
1780  QUDA_DSLASH,
1796  QUDA_DSLASH_POLICY_DISABLED // this MUST be the last element
1797  };
1798 
 // Table of candidate dslash policies, indexed by QudaDslashPolicy value.
 // Fill-constructed with every slot set to QUDA_DSLASH_POLICY_DISABLED; slots
 // are overwritten with their own enum value when a policy is enabled.
1799  static std::vector<QudaDslashPolicy> policies(static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED), QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED);
1800 
 // Peer-to-peer transport strategies enumerated by the dslash policy tuner.
1801  enum class QudaP2PPolicy {
1802  QUDA_P2P_DEFAULT, // no special handling for p2p
1803  QUDA_P2P_COPY_ENGINE, // use copy engine for p2p traffic
1804  QUDA_P2P_REMOTE_WRITE, // write packed halos directly to peers
1805  QUDA_P2P_POLICY_DISABLED, // this must be the last element
1806  };
1807 
 // Table of candidate p2p policies, indexed by QudaP2PPolicy value.
 // Fill-constructed with every slot set to QUDA_P2P_POLICY_DISABLED; slots are
 // overwritten with their own enum value when the machine supports them.
1808  static std::vector<QudaP2PPolicy> p2p_policies(static_cast<int>(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED), QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED);
1809 
1811 
1812  static DslashPolicyImp* create(const QudaDslashPolicy &dslashPolicy)
1813  {
1814  DslashPolicyImp* result = nullptr;
1815 
1816  switch(dslashPolicy){
1817  case QudaDslashPolicy::QUDA_DSLASH:
1818  result = new DslashBasic;
1819  break;
1820  case QudaDslashPolicy::QUDA_DSLASH_ASYNC:
1821  result = new DslashAsync;
1822  break;
1823  case QudaDslashPolicy::QUDA_PTHREADS_DSLASH:
1824  result = new DslashPthreads;
1825  break;
1826  case QudaDslashPolicy::QUDA_FUSED_DSLASH:
1827  result = new DslashFusedExterior;
1828  break;
1829  case QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC:
1830  result = new DslashFusedExteriorAsync;
1831  break;
1832  case QudaDslashPolicy::QUDA_GDR_DSLASH:
1833  if (!comm_gdr_blacklist()) result = new DslashGDR;
1834  else result = new DslashBasic;
1835  break;
1836  case QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH:
1837  if (!comm_gdr_blacklist()) result = new DslashFusedGDR;
1838  else result = new DslashFusedExterior;
1839  break;
1840  case QudaDslashPolicy::QUDA_GDR_RECV_DSLASH:
1841  if (!comm_gdr_blacklist()) result = new DslashGDRRecv;
1842  else result = new DslashBasic;
1843  break;
1844  case QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH:
1845  if (!comm_gdr_blacklist()) result = new DslashFusedGDRRecv;
1846  else result = new DslashFusedExterior;
1847  break;
1848  case QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH:
1849  result = new DslashZeroCopyPack;
1850  break;
1851  case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH:
1852  result = new DslashFusedZeroCopyPack;
1853  break;
1854  case QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1855  if (!comm_gdr_blacklist()) result = new DslashZeroCopyPackGDRRecv;
1856  else result = new DslashZeroCopyPack;
1857  break;
1858  case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1860  else result = new DslashFusedZeroCopyPack;
1861  break;
1862  case QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH:
1863  result = new DslashZeroCopy;
1864  break;
1865  case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH:
1866  result = new DslashFusedZeroCopy;
1867  break;
1868  case QudaDslashPolicy::QUDA_DSLASH_NC:
1869  result = new DslashNC;
1870  break;
1871  default:
1872  errorQuda("Dslash policy %d not recognized",static_cast<int>(dslashPolicy));
1873  break;
1874  }
1875  return result; // default
1876  }
1877 };
1878 
 // One-shot guard: the policy/p2p tables below are populated only on the
 // first DslashPolicyTune construction (checked via !dslash_init).
1879  static bool dslash_init = false;
1880 
1881  static int config = 0; // 3-bit number used to record the machine config (first bit for gdr / two bits for p2p) and if this changes we will force a retune
1882 
 // Index of the first enabled dslash policy; stays at DISABLED (the sentinel
 // last enum value) until initialization enables at least one policy.
1883  static int first_active_policy=static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED);
1884 
 // Index of the first enabled p2p policy; DISABLED until initialization runs
 // (it is then set to QUDA_P2P_DEFAULT, which is always enabled).
1885  static int first_active_p2p_policy=static_cast<int>(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED);
1886 
1888  policies[static_cast<std::size_t>(p)] = p;
1889  }
1890 
1892  policies[static_cast<std::size_t>(p)] = QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED;
1893  }
1894 
1895  class DslashPolicyTune : public Tunable {
1896 
1899  cudaColorSpinorField *in;
1900  const int volume;
1901  const int *ghostFace;
1902  TimeProfile &profile;
1903 
 // Tunable overrides: policy tuning sweeps over whole dslash variants via the
 // aux dimension, so grid-dimension tuning is disabled and no shared memory
 // is requested.
1904  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
1905  bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
1906  unsigned int sharedBytesPerThread() const { return 0; } // no per-thread shared memory requested
1907  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; } // no per-block shared memory requested
1908 
1909  public:
1910  DslashPolicyTune(DslashCuda &dslash, cudaColorSpinorField *in,
1911  const int volume, const int *ghostFace, TimeProfile &profile)
1912  : dslash(dslash), dslashParam(dslash.dslashParam), in(in), volume(volume), ghostFace(ghostFace), profile(profile)
1913  {
1914  in->streamInit(streams);
1915 
1916  if (!dslash_init) {
1917 
1918  config += comm_gdr_enabled();
1920 
1921  if (comm_peer2peer_enabled_global() & 2) {
1922  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE)] = QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE;
1923  }
1924  if (comm_peer2peer_enabled_global() & 1) {
1925  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_COPY_ENGINE)] = QudaP2PPolicy::QUDA_P2P_COPY_ENGINE;
1926  }
1927  p2p_policies[static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_DEFAULT)] = QudaP2PPolicy::QUDA_P2P_DEFAULT;
1928  first_active_p2p_policy = static_cast<int>(QudaP2PPolicy::QUDA_P2P_DEFAULT); // first active policy is presently always the default
1929 
1930  static char *dslash_policy_env = getenv("QUDA_ENABLE_DSLASH_POLICY");
1931  if (dslash_policy_env) { // set the policies to tune for explicitly
1932  std::stringstream policy_list(dslash_policy_env);
1933 
1934  int policy_;
1935  while (policy_list >> policy_) {
1936  QudaDslashPolicy dslash_policy = static_cast<QudaDslashPolicy>(policy_);
1937 
1938  // check this is a valid policy choice
1939  if ( ( dslash_policy == QudaDslashPolicy::QUDA_GDR_DSLASH ||
1940  dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
1941  dslash_policy == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
1942  dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH)
1943  && !comm_gdr_enabled() ) {
1944  errorQuda("Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", static_cast<int>(dslash_policy));
1945  }
1946 
1947  enable_policy(static_cast<QudaDslashPolicy>(policy_));
1949  if (policy_list.peek() == ',') policy_list.ignore();
1950  }
1951  if (first_active_policy == static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED)) errorQuda("No valid policy found in QUDA_ENABLE_DSLASH_POLICY");
1952  }
1953  else {
1954  enable_policy(QudaDslashPolicy::QUDA_DSLASH);
1955  first_active_policy = 0;
1956  enable_policy(QudaDslashPolicy::QUDA_FUSED_DSLASH);
1957 
1958  // if we have gdr then enable tuning these policies
1959  if (comm_gdr_enabled()) {
1960  enable_policy(QudaDslashPolicy::QUDA_GDR_DSLASH);
1961  enable_policy(QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH);
1962  enable_policy(QudaDslashPolicy::QUDA_GDR_RECV_DSLASH);
1963  enable_policy(QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH);
1964  }
1965 
1966  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH);
1967  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH);
1968 
1969  if (comm_gdr_enabled()) {
1970  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1971  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1972  }
1973 
1974 #ifdef USE_TEXTURE_OBJECTS
1975  // pure zero-copy policies require texture objects
1976  enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH);
1977  enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH);
1978 #endif
1979 
1980  // Async variants are only supported on CUDA 8.0 and up
1981 #if (CUDA_VERSION >= 8000) && 0
1982 #if (CUDA_VERSION >= 9000)
1983  CUdevice device;
1984  cuDeviceGet(&device, comm_gpuid());
1985  int can_use_stream_mem_ops;
1986  cuDeviceGetAttribute(&can_use_stream_mem_ops, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, device);
1987 #else
1988  int can_use_stream_mem_ops = 1;
1989 #endif
1990  if (can_use_stream_mem_ops) {
1991  enable_policy(QudaDslashPolicy::QUDA_DSLASH_ASYNC);
1992  enable_policy(QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC);
1993  }
1994 #endif
1995  }
1996 
1997  static char *dslash_pack_env = getenv("QUDA_ENABLE_DSLASH_PACK");
1998  if (dslash_pack_env && strcmp(dslash_pack_env, "0") == 0) {
1999  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash halo packing");
2000  dslash_pack_compute = false;
2001  }
2002 
2003  static char *dslash_interior_env = getenv("QUDA_ENABLE_DSLASH_INTERIOR");
2004  if (dslash_interior_env && strcmp(dslash_interior_env, "0") == 0) {
2005  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash interior computation");
2006  dslash_interior_compute = false;
2007  }
2008 
2009  static char *dslash_exterior_env = getenv("QUDA_ENABLE_DSLASH_EXTERIOR");
2010  if (dslash_exterior_env && strcmp(dslash_exterior_env, "0") == 0) {
2011  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash exterior computation");
2012  dslash_exterior_compute = false;
2013  }
2014 
2015  static char *dslash_copy_env = getenv("QUDA_ENABLE_DSLASH_COPY");
2016  if (dslash_copy_env && strcmp(dslash_copy_env, "0") == 0) {
2017  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash host-device copying");
2018  dslash_copy = false;
2019  }
2020 
2021  static char *dslash_comms_env = getenv("QUDA_ENABLE_DSLASH_COMMS");
2022  if (dslash_comms_env && strcmp(dslash_comms_env, "0") == 0) {
2023  if (getVerbosity() > QUDA_SILENT) warningQuda("Disabling Dslash communication");
2024  dslash_comms = false;
2025  }
2026  }
2027 
2028  // before we do policy tuning we must ensure the kernel
2029  // constituents have been tuned since we can't do nested tuning
2030  if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
2032 
2033  for (auto &p2p : p2p_policies) {
2034 
2035  if (p2p == QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED) continue;
2036 
2037  bool p2p_enabled = comm_peer2peer_enabled_global();
2038  if (p2p == QudaP2PPolicy::QUDA_P2P_DEFAULT) comm_enable_peer2peer(false); // disable p2p if using default policy
2039  dslashParam.remote_write = (p2p == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0);
2040 
2041  for (auto &i : policies) {
2042 
2043  if ( (i == QudaDslashPolicy::QUDA_DSLASH ||
2044  i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
2045  i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
2046  i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) &&
2047  !dslashParam.remote_write) {
2048 
2049  DslashPolicyImp* dslashImp = DslashFactory::create(i);
2050  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2051  delete dslashImp;
2052 
2053  } else if ( (i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2054  i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2055  i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
2056  i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH ||
2057  i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
2058  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
2059  i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2060  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2061  i == QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH ||
2062  i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH) ||
2063  ((i == QudaDslashPolicy::QUDA_DSLASH ||
2064  i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
2065  i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
2066  i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) && dslashParam.remote_write) ) {
2067  // these dslash policies all must have kernel packing enabled
2068 
2069  bool kernel_pack_old = getKernelPackT();
2070 
2071  // if we are using GDR policies then we must tune the
2072  // non-GDR variants as well with and without kernel packing
2073  // enabled - this ensures that all GPUs will have the
2074  // required tune cache entries prior to potential process
2075  // divergence regardless of which GPUs are blacklisted
2076  // don't enter if remote writing since there we always use kernel packing
2077  if ( (i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2078  i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2079  i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
2080  i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH) && !dslashParam.remote_write ) {
2081  QudaDslashPolicy policy = (i==QudaDslashPolicy::QUDA_GDR_DSLASH || i==QudaDslashPolicy::QUDA_GDR_RECV_DSLASH) ?
2082  QudaDslashPolicy::QUDA_DSLASH : QudaDslashPolicy::QUDA_FUSED_DSLASH;
2083  DslashPolicyImp* dslashImp = DslashFactory::create(policy);
2084  setKernelPackT(false);
2085  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2086  setKernelPackT(true);
2087  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2088  delete dslashImp;
2089  }
2090 
2091  setKernelPackT(true);
2092 
2093  DslashPolicyImp* dslashImp = DslashFactory::create(i);
2094  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2095  delete dslashImp;
2096 
2097  // restore default kernel packing
2098  setKernelPackT(kernel_pack_old);
2099 
2100  } else if (i != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED){
2101  errorQuda("Unsupported dslash policy %d\n", static_cast<int>(i));
2102  }
2103  }
2104 
2105  comm_enable_peer2peer(p2p_enabled); // restore p2p state
2106  } // p2p policies
2107 
2109  setPolicyTuning(true);
2110  }
2111  dslash_init = true;
2112  }
2113 
2114  virtual ~DslashPolicyTune() { setPolicyTuning(false); }
2115 
2116  void apply(const cudaStream_t &stream) {
2117  TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_DEBUG_VERBOSE /*getVerbosity()*/);
2118 
2119  if (config != tp.aux.w) {
2120  errorQuda("Machine configuration (P2P/GDR=%d) changed since tunecache was created (P2P/GDR=%d). Please delete "
2121  "this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.",
2122  config, tp.aux.w);
2123  }
2124 
2125  if (tp.aux.x >= static_cast<int>(policies.size())) errorQuda("Requested policy that is outside of range");
2126  if (static_cast<QudaDslashPolicy>(tp.aux.x) == QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED) errorQuda("Requested policy is disabled");
2127 
2128  bool p2p_enabled = comm_peer2peer_enabled_global();
2129  if (p2p_policies[tp.aux.y] == QudaP2PPolicy::QUDA_P2P_DEFAULT) comm_enable_peer2peer(false); // disable p2p if using default policy
2130  dslashParam.remote_write = (p2p_policies[tp.aux.y] == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0); // set whether we are using remote packing writes or copy engines
2131 
2132  // switch on kernel packing for the policies that need it
2133  bool kernel_pack_old = getKernelPackT();
2134  auto p = static_cast<QudaDslashPolicy>(tp.aux.x);
2135  if ( p == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2136  p == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2137  p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
2138  p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
2139  p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2140  p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2141  p == QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH ||
2142  p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH ||
2143  dslashParam.remote_write // always use kernel packing if remote writing
2144  ) {
2145  setKernelPackT(true);
2146  }
2147 
2148  DslashPolicyImp* dslashImp = DslashFactory::create(static_cast<QudaDslashPolicy>(tp.aux.x));
2149  (*dslashImp)(dslash, in, volume, ghostFace, profile);
2150  delete dslashImp;
2151 
2152  // restore p2p state
2153  comm_enable_peer2peer(p2p_enabled);
2154 
2155  // restore default kernel packing
2156  setKernelPackT(kernel_pack_old);
2157  }
2158 
2159  int tuningIter() const { return 10; }
2160 
2161  // Find the best dslash policy
2162  bool advanceAux(TuneParam &param) const
2163  {
2164  while ((unsigned)param.aux.x < policies.size()-1) {
2165  param.aux.x++;
2166  if (policies[param.aux.x] != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED) return true;
2167  }
2168  param.aux.x = first_active_policy;
2169 
2170  while ((unsigned)param.aux.y < p2p_policies.size()-1) {
2171  param.aux.y++;
2172  if (p2p_policies[param.aux.y] != QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED) return true;
2173  }
2175 
2176  return false;
2177  }
2178 
2179  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
2180 
2181  void initTuneParam(TuneParam &param) const {
2182  Tunable::initTuneParam(param);
2183  param.aux.x = first_active_policy; param.aux.y = first_active_p2p_policy; param.aux.z = 0; param.aux.w = config;
2184  }
2185 
2186  void defaultTuneParam(TuneParam &param) const {
2187  Tunable::defaultTuneParam(param);
2188  param.aux.x = first_active_policy; param.aux.y = first_active_p2p_policy; param.aux.z = 0; param.aux.w = config;
2189  }
2190 
2191  TuneKey tuneKey() const {
2192  KernelType kernel_type = dslashParam.kernel_type;
2193  dslashParam.kernel_type = KERNEL_POLICY;
2194  TuneKey key = dslash.tuneKey();
2195  strcat(key.aux,comm_dim_topology_string());
2196  dslashParam.kernel_type = kernel_type;
2197  return key;
2198  }
2199 
2200  long long flops() const {
2201  KernelType kernel_type = dslashParam.kernel_type;
2202  dslashParam.kernel_type = KERNEL_POLICY;
2203  long long flops_ = dslash.flops();
2204  dslashParam.kernel_type = kernel_type;
2205  return flops_;
2206  }
2207 
2208  long long bytes() const {
2209  KernelType kernel_type = dslashParam.kernel_type;
2210  dslashParam.kernel_type = KERNEL_POLICY;
2211  long long bytes_ = dslash.bytes();
2212  dslashParam.kernel_type = kernel_type;
2213  return bytes_;
2214  }
2215 
2216  void preTune() { dslash.preTune(); }
2217 
2218  void postTune() { dslash.postTune(); }
2219 
2220  };
2221 
2222 } // anonymous namespace
cudaEvent_t event
void disable_policy(QudaDslashPolicy p)
cudaStream_t stream
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
std::map< TuneKey, TuneParam > map
bool getKernelPackT()
Definition: dslash_quda.cu:61
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:73
int dslashCompleted[Nstream]
#define errorQuda(...)
Definition: util_quda.h:90
#define PROFILE(f, profile, idx)
void setFusedParam(DslashParam &param, DslashCuda &dslash, const int *faceVolumeCB)
cudaStream_t * streams
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
int commsCompleted[Nstream]
bool commsComplete(cudaColorSpinorField &in, const DslashCuda &dslash, int dim, int dir, bool gdr_send, bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex=-1)
Wrapper for querying if communication is finished in the dslash, and if it is take the appropriate ac...
const int Nstream
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
void disableProfileCount()
Definition: tune.cpp:107
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
int comm_gpuid(void)
Definition: comm_mpi.cpp:132
int * commsEnd_h
KernelType
int getStreamIndex(const T &dslashParam)
Returns a stream index for posting the pack/scatters to. We desire a stream index that is not being u...
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
int strcmp(const char *__s1, const char *__s2)
static bool dslash_pack_compute
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
Worker * aux_worker
Definition: dslash_quda.cu:78
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
const map & getTuneCache()
Definition: tune.cpp:110
KernelType kernel_type
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
Definition: comm_mpi.cpp:346
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
cpuColorSpinorField * in
static __inline__ size_t p
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:71
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
static std::vector< DslashCoarsePolicy > policy
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void setPolicyTuning(bool)
Definition: tune.cpp:457
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
long unsigned int size_t
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
#define CUDA_CALL(call)
void setMappedGhost(DslashCuda &dslash, cudaColorSpinorField &in, bool to_mapped)
Set the ghosts to the mapped CPU ghost buffer, or unsets if already set. Note this must not be called...
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
unsigned int sharedBytesPerBlock(const TuneParam &param) const
static std::vector< QudaP2PPolicy > p2p_policies(static_cast< int >(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED), QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED)
bool comm_peer2peer_enabled(int dir, int dim)
DslashPolicyTune(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *ghostFace, TimeProfile &profile)
cudaEvent_t dslashStart[2]
Definition: dslash_quda.cu:74
static cudaColorSpinorField * inSpinor
void enableProfileCount()
Definition: tune.cpp:108
static bool dslash_copy
static bool dslash_exterior_compute
static bool dslash_comms
int previousDir[Nstream]
DslashCommsPattern(const int commDim[], bool gdr_send=false)
static bool dslash_interior_compute
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
void issueRecv(cudaColorSpinorField &input, const DslashCuda &dslash, cudaStream_t *stream, bool gdr)
This helper function simply posts all receives in all directions.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
void enable_policy(QudaDslashPolicy p)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
cudaEvent_t packEnd[2]
Definition: dslash_quda.cu:69
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
static std::vector< QudaDslashPolicy > policies(static_cast< int >(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED), QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
void issueGather(cudaColorSpinorField &in, const DslashCuda &dslash)
This helper function simply posts the device-host memory copies of all halos in all dimensions and di...
void completeDslash(const ColorSpinorField &in, const T &dslashParam)
Ensure that the dslash is complete. By construction, the dslash will have completed (or is in flight)...
int comm_peer2peer_enabled_global()
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
char * getenv(const char *)
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
static DslashPolicyImp * create(const QudaDslashPolicy &dslashPolicy)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
int gatherCompleted[Nstream]
cudaEvent_t cudaEvent_t end
void issuePack(cudaColorSpinorField &in, const DslashCuda &dslash, int parity, MemoryLocation location, int packIndex)
This helper function simply posts the packing kernel needed for halo exchange.
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
CUdeviceptr commsEnd_d[Nstream]
enum cudaDeviceAttr attr int device