QUDA  0.9.0
comm_common.cpp
Go to the documentation of this file.
1 #include <unistd.h> // for gethostname()
2 #include <assert.h>
3 
4 #include <quda_internal.h>
5 #include <comm_quda.h>
6 
7 
#ifndef QUDA_MAX_DIM
#define QUDA_MAX_DIM 6 // NOTE(review): fallback only -- normally supplied by quda_internal.h; confirm value
#endif

/**
 * Description of the process grid: dimensionality, grid extent per
 * dimension, and bidirectional rank <-> coordinate lookup tables.
 *
 * NOTE(review): the dims/coords/my_coords members were lost in extraction
 * and have been restored from this file's own declaration index.
 */
struct Topology_s {
  int ndim;                     // number of grid dimensions
  int dims[QUDA_MAX_DIM];       // extent of the process grid in each dimension
  int *ranks;                   // lexicographic coordinate index -> rank (filled by comm_create_topology)
  int (*coords)[QUDA_MAX_DIM];  // rank -> grid coordinates
  int my_rank;                  // rank of this process
  int my_coords[QUDA_MAX_DIM];  // grid coordinates of this process
  // It might be worth adding communicators to allow for efficient reductions:
  // #if defined(MPI_COMMS)
  // MPI_Comm comm;
  // #elif defined(QMP_COMMS)
  // QMP_communicator_t comm; // currently only supported by qmp-2.4.0-alpha
  // #endif
};
22 
23 
/**
 * Convert a multi-dimensional grid coordinate x[] into its lexicographic
 * (row-major) linear index, with the last dimension running fastest.
 */
static inline int index(int ndim, const int *dims, const int *x)
{
  int linear = x[0];
  for (int d = 1; d < ndim; d++) linear = linear * dims[d] + x[d];
  return linear;
}
40 
41 
/**
 * Odometer-style advance of the grid coordinate x[] (last dimension runs
 * fastest).  Returns true while a new coordinate was produced; returns
 * false once every digit has wrapped back to zero, i.e. the whole grid has
 * been visited.
 */
static inline bool advance_coords(int ndim, const int *dims, int *x)
{
  for (int d = ndim - 1; d >= 0; d--) {
    if (x[d] + 1 < dims[d]) {
      x[d]++;        // this digit did not overflow: done
      return true;
    }
    x[d] = 0;        // overflow: reset and carry into the next-slower digit
  }
  return false;      // wrapped around completely
}
56 
57 
/**
 * Return the hostname of this node, cached after the first call.
 * The returned pointer refers to a static buffer; callers must not free it.
 */
char *comm_hostname(void)
{
  static char hostname[128];
  static bool have_hostname = false;

  if (!have_hostname) {
    gethostname(hostname, sizeof(hostname));
    hostname[sizeof(hostname) - 1] = '\0'; // gethostname need not null-terminate on truncation
    have_hostname = true;
  }

  return hostname;
}
71 
72 
// state of the internal PRNG; re-seeded per rank in comm_create_topology()
static unsigned long int rand_seed = 137;

/**
 * Pseudo-random double in [0,1): a 48-bit linear congruential generator
 * (drand48-style multiplier/increment constants), scaled by 2^-48.
 */
double comm_drand(void)
{
  const double scale = 0.35527136788005009e-14;            // 2^-48
  const unsigned long int mult   = 25214903917;
  const unsigned long int inc    = 11;
  const unsigned long int mask48 = 281474976710655;        // 2^48 - 1
  rand_seed = (mult * rand_seed + inc) & mask48;
  return scale * rand_seed;
}
89 
90 
91 // QudaCommsMap is declared in quda.h:
92 // typedef int (*QudaCommsMap)(const int *coords, void *fdata);
93 
94 Topology *comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
95 {
96  if (ndim > QUDA_MAX_DIM) {
97  errorQuda("ndim exceeds QUDA_MAX_DIM");
98  }
99 
100  Topology *topo = (Topology *) safe_malloc(sizeof(Topology));
101 
102  topo->ndim = ndim;
103 
104  int nodes = 1;
105  for (int i=0; i<ndim; i++) {
106  topo->dims[i] = dims[i];
107  nodes *= dims[i];
108  }
109 
110  topo->ranks = (int *) safe_malloc(nodes*sizeof(int));
111  topo->coords = (int (*)[QUDA_MAX_DIM]) safe_malloc(nodes*sizeof(int[QUDA_MAX_DIM]));
112 
113  int x[QUDA_MAX_DIM];
114  for (int i = 0; i < QUDA_MAX_DIM; i++) x[i] = 0;
115 
116  do {
117  int rank = rank_from_coords(x, map_data);
118  topo->ranks[index(ndim, dims, x)] = rank;
119  for (int i=0; i<ndim; i++) {
120  topo->coords[rank][i] = x[i];
121  }
122  } while (advance_coords(ndim, dims, x));
123 
124  int my_rank = comm_rank();
125  topo->my_rank = my_rank;
126  for (int i = 0; i < ndim; i++) {
127  topo->my_coords[i] = topo->coords[my_rank][i];
128  }
129 
130  // initialize the random number generator with a rank-dependent seed
131  rand_seed = 17*my_rank + 137;
132 
133  return topo;
134 }
135 
136 
138 {
139  host_free(topo->ranks);
140  host_free(topo->coords);
141  host_free(topo);
142 }
143 
144 
// [dir][dim] flags: true if CUDA peer-to-peer access is usable with the
// neighbor in that direction and dimension (set by comm_peer2peer_init)
static bool peer2peer_enabled[2][4] = { {false,false,false,false},
                                        {false,false,false,false} };
// guards the one-time initialization in comm_peer2peer_init()
static bool peer2peer_init = false;

// [dir][dim] flags: neighbor shares this node but p2p is not available
static bool intranode_enabled[2][4] = { {false,false,false,false},
                                        {false,false,false,false} };

static int enable_peer_to_peer = 3; // by default enable both copy engines and load/store access
153 
154 void comm_peer2peer_init(const char* hostname_recv_buf)
155 {
156  if (peer2peer_init) return;
157 
158  char *enable_peer_to_peer_env = getenv("QUDA_ENABLE_P2P");
159 
160  // disable peer-to-peer comms in one direction if QUDA_ENABLE_P2P=-1
161  // and comm_dim(dim) == 2 (used for perf benchmarking)
162  bool disable_peer_to_peer_bidir = false;
163 
164  if (enable_peer_to_peer_env) {
165  enable_peer_to_peer = atoi(enable_peer_to_peer_env);
166 
167  switch ( std::abs(enable_peer_to_peer) ) {
168  case 0: if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling peer-to-peer access\n"); break;
169  case 1: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine access (disabling direct load/store)\n"); break;
170  case 2: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer direct load/store access (disabling copy engines)\n"); break;
171  case 3: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine and direct load/store access\n"); break;
172  default: errorQuda("Unexpected value QUDA_ENABLE_P2P=%d\n", enable_peer_to_peer);
173  }
174 
175  if (enable_peer_to_peer < 0) { // only values -1, -2, -3 can make it here
176  if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling bi-directional peer-to-peer access\n");
177  disable_peer_to_peer_bidir = true;
178  }
179 
181 
182  } else { // !enable_peer_to_peer_env
183  if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine and direct load/store access\n");
184  }
185 
187 
188  // first check that the local GPU supports UVA
189  const int gpuid = comm_gpuid();
190  cudaDeviceProp prop;
191  cudaGetDeviceProperties(&prop, gpuid);
192  if(!prop.unifiedAddressing) return;
193 
195 
196  char *hostname = comm_hostname();
197  int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int)*comm_size());
198 
199  comm_gather_gpuid(gpuid_recv_buf);
200 
201  for(int dir=0; dir<2; ++dir){ // forward/backward directions
202  for(int dim=0; dim<4; ++dim){
204  if(neighbor_rank == comm_rank()) continue;
205 
206  // disable peer-to-peer comms in one direction
207  if ( ((comm_rank() > neighbor_rank && dir == 0) || (comm_rank() < neighbor_rank && dir == 1)) &&
208  disable_peer_to_peer_bidir && comm_dim(dim) == 2 ) continue;
209 
210  // if the neighbors are on the same
211  if (!strncmp(hostname, &hostname_recv_buf[128*neighbor_rank], 128)) {
212  int neighbor_gpuid = gpuid_recv_buf[neighbor_rank];
213  int canAccessPeer[2];
214  cudaDeviceCanAccessPeer(&canAccessPeer[0], gpuid, neighbor_gpuid);
215  cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid, gpuid);
216 
217  int accessRank[2] = { };
218 #if CUDA_VERSION >= 8000 // this was introduced with CUDA 8
219  if (canAccessPeer[0]*canAccessPeer[1]) {
220  cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank, gpuid, neighbor_gpuid);
221  cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid, gpuid);
222  }
223 #endif
224 
225  // enable P2P if we can access the peer or if peer is self
226  if (canAccessPeer[0]*canAccessPeer[1] || gpuid == neighbor_gpuid) {
227  peer2peer_enabled[dir][dim] = true;
228  if (getVerbosity() > QUDA_SILENT) {
229  printf("Peer-to-peer enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d, performance rank = (%d, %d)\n",
230  comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, accessRank[0], accessRank[1]);
231  }
232  } else {
233  intranode_enabled[dir][dim] = true;
234  if (getVerbosity() > QUDA_SILENT) {
235  printf("Intra-node (non peer-to-peer) enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d\n",
236  comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim);
237  }
238  }
239 
240  } // on the same node
241  } // different dimensions - x, y, z, t
242  } // different directions - forward/backward
243 
244  host_free(gpuid_recv_buf);
245  }
246 
247  peer2peer_init = true;
248 
249  comm_barrier();
250 
251  // set gdr enablement
252  if (comm_gdr_enabled()) {
253  if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling GPU-Direct RDMA access\n");
254  comm_gdr_blacklist(); // set GDR blacklist
255  } else {
256  if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling GPU-Direct RDMA access\n");
257  }
258 
260  return;
261 }
262 
263 static bool enable_p2p = true;
264 
265 bool comm_peer2peer_enabled(int dir, int dim){
266  return enable_p2p ? peer2peer_enabled[dir][dim] : false;
267 }
268 
270  if (!enable_p2p) return false;
271 
272  static bool init = false;
273  static bool p2p_global = false;
274 
275  if (!init) {
276  int p2p = 0;
277  for (int dim=0; dim<4; dim++)
278  for (int dir=0; dir<2; dir++)
279  p2p += (int)comm_peer2peer_enabled(dir,dim);
280 
281  comm_allreduce_int(&p2p);
282  init = true;
283  p2p_global = p2p > 0 ? true : false;
284  }
285  return p2p_global * enable_peer_to_peer;
286 }
287 
288 void comm_enable_peer2peer(bool enable) {
289  enable_p2p = enable;
290 }
291 
292 static bool enable_intranode = true;
293 
294 bool comm_intranode_enabled(int dir, int dim){
295  return enable_intranode ? intranode_enabled[dir][dim] : false;
296 }
297 
298 void comm_enable_intranode(bool enable) {
299  enable_intranode = enable;
300 }
301 
302 int comm_ndim(const Topology *topo)
303 {
304  return topo->ndim;
305 }
306 
307 
308 const int *comm_dims(const Topology *topo)
309 {
310  return topo->dims;
311 }
312 
313 
314 const int *comm_coords(const Topology *topo)
315 {
316  return topo->my_coords;
317 }
318 
319 
320 const int *comm_coords_from_rank(const Topology *topo, int rank)
321 {
322  return topo->coords[rank];
323 }
324 
325 
326 int comm_rank_from_coords(const Topology *topo, const int *coords)
327 {
328  return topo->ranks[index(topo->ndim, topo->dims, coords)];
329 }
330 
331 
/** Mathematical modulus: for b > 0 the result is always in [0, b), unlike C's %. */
static inline int mod(int a, int b)
{
  int wrapped = (a % b) + b;
  return wrapped % b;
}
336 
337 int comm_rank_displaced(const Topology *topo, const int displacement[])
338 {
339  int coords[QUDA_MAX_DIM];
340 
341  for (int i = 0; i < QUDA_MAX_DIM; i++) {
342  coords[i] = (i < topo->ndim) ?
343  mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0;
344  }
345 
346  return comm_rank_from_coords(topo, coords);
347 }
348 
349 
350 // FIXME: The following routines rely on a "default" topology.
351 // They should probably be reworked or eliminated eventually.
352 
354 
356 {
357  default_topo = topo;
358 }
359 
360 
362 {
363  if (!default_topo) {
364  errorQuda("Default topology has not been declared");
365  }
366  return default_topo;
367 }
368 
// cached neighbor ranks, indexed [direction][dimension]; direction 0 holds
// the -1 displacement neighbor, direction 1 the +1 displacement neighbor
static int neighbor_rank[2][4] = { {-1,-1,-1,-1},
                                   {-1,-1,-1,-1} };

// true once neighbor_rank[][] has been filled by comm_set_neighbor_ranks()
static bool neighbors_cached = false;
373 
375 
376  if(neighbors_cached) return;
377 
378  Topology *topology = topo ? topo : default_topo; // use default topology if topo is NULL
379  if(!topology){
380  errorQuda("Topology not specified");
381  return;
382  }
383 
384  for(int d=0; d<4; ++d){
385  int pos_displacement[4] = {0,0,0,0};
386  int neg_displacement[4] = {0,0,0,0};
387  pos_displacement[d] = +1;
388  neg_displacement[d] = -1;
389  neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement);
390  neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement);
391  }
392  neighbors_cached = true;
393  return;
394 }
395 
396 int comm_neighbor_rank(int dir, int dim){
397  if(!neighbors_cached){
399  }
400  return neighbor_rank[dir][dim];
401 }
402 
403 
404 int comm_dim(int dim)
405 {
407  return comm_dims(topo)[dim];
408 }
409 
410 
411 int comm_coord(int dim)
412 {
414  return comm_coords(topo)[dim];
415 }
416 
417 
/**
 * Create a message handle for sending nbytes to the neighbor displaced by
 * dir along dimension dim (dir is used directly as the displacement,
 * typically +/-1).  func/file/line identify the caller for diagnostics.
 * In HOST_DEBUG builds the buffer is first validated by reading nbytes out
 * of it (std::copy for host memory, cudaMemcpy for device memory).
 */
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
                                       void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first
  cudaPointerAttributes attributes;
  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = safe_malloc(nbytes);
    try {
      std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting");
    }
    if (err != cudaSuccess) cudaGetLastError(); // clear the sticky attribute-query error
    host_free(tmp);
  } else {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = device_malloc(nbytes);
    cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
    device_free(tmp);
  }
#endif

  // displacement of dir along dimension dim selects the neighbor
  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_send_displaced(buffer, disp, nbytes);
}
456 
/**
 * Create a message handle for receiving nbytes from the neighbor displaced
 * by dir along dimension dim.  func/file/line identify the caller for
 * diagnostics.  In HOST_DEBUG builds the buffer is first validated by
 * zero-filling it (std::fill for host memory, cudaMemset for device
 * memory) -- note this clobbers the buffer's contents.
 */
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
                                          void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first
  cudaPointerAttributes attributes;
  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
    // test this memory allocation is ok by filling it
    try {
      std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting");
    }
    if (err != cudaSuccess) cudaGetLastError(); // clear the sticky attribute-query error
  } else {
    // test this memory allocation is ok by doing a memset
    cudaError_t err = cudaMemset(buffer, 0, nbytes);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
  }
#endif

  // displacement of dir along dimension dim selects the neighbor
  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_receive_displaced(buffer, disp, nbytes);
}
491 
/**
 * Create a message handle for a strided send (nblocks blocks of blksize
 * bytes, stride bytes apart) to the neighbor displaced by dir along
 * dimension dim.  In HOST_DEBUG builds each block is first read to verify
 * the allocation.
 */
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
                                               void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first
  cudaPointerAttributes attributes;
  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = safe_malloc(blksize*nblocks);
    try {
      // note: every block is copied to the start of tmp -- this only probes
      // readability of buffer, the copied data is discarded
      for (int i=0; i<nblocks; i++)
        std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting");
    }
    host_free(tmp);
    if (err != cudaSuccess) cudaGetLastError(); // clear the sticky attribute-query error
  } else {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = device_malloc(blksize*nblocks);
    cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
    device_free(tmp);
  }
#endif

  // displacement of dir along dimension dim selects the neighbor
  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_strided_send_displaced(buffer, disp, blksize, nblocks, stride);
}
533 
534 
/**
 * Create a message handle for a strided receive (nblocks blocks of blksize
 * bytes, stride bytes apart) from the neighbor displaced by dir along
 * dimension dim.  In HOST_DEBUG builds each block is first zero-filled to
 * verify the allocation -- note this clobbers the buffer's contents.
 */
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
                                                  void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first
  cudaPointerAttributes attributes;
  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
    // test this memory allocation is ok by filling it
    try {
      for (int i=0; i<nblocks; i++)
        std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting");
    }
    if (err != cudaSuccess) cudaGetLastError(); // clear the sticky attribute-query error
  } else {
    // test this memory allocation is ok by doing a memset
    cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
  }
#endif

  // displacement of dir along dimension dim selects the neighbor
  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_strided_receive_displaced(buffer, disp, blksize, nblocks, stride);
}
572 
573 void comm_finalize(void)
574 {
576  comm_destroy_topology(topo);
578 }
579 
580 
582 
584 {
585 #ifdef MULTI_GPU
587 #endif
588 }
589 
591  for (int i=0; i<QUDA_MAX_DIM; i++)
592  manual_set_partition[i] = 0;
593 
594 }
595 
596 
598 {
599  return (manual_set_partition[dim] || (comm_dim(dim) > 1));
600 }
601 
603 {
604  int partitioned = 0;
605  for (int i=0; i<4; i++) {
606  partitioned = partitioned || comm_dim_partitioned(i);
607  }
608  return partitioned;
609 }
610 
/**
 * Query if GPU Direct RDMA communication is enabled (global setting).
 * Controlled by QUDA_ENABLE_GDR=1; always false in non-MULTI_GPU builds.
 * The environment is read once and the result cached.
 * NOTE(review): the signature line was lost in extraction; restored from
 * this file's declaration index.
 */
bool comm_gdr_enabled() {
  static bool gdr_enabled = false;
#ifdef MULTI_GPU
  static bool gdr_init = false;

  if (!gdr_init) {
    char *enable_gdr_env = getenv("QUDA_ENABLE_GDR");
    if (enable_gdr_env && strcmp(enable_gdr_env, "1") == 0) {
      gdr_enabled = true;
    }
    gdr_init = true;
  }
#endif
  return gdr_enabled;
}
626 
628  static bool blacklist = false;
629  static bool blacklist_init = false;
630 
631  if (!blacklist_init) {
632  char *blacklist_env = getenv("QUDA_ENABLE_GDR_BLACKLIST");
633 
634  if (blacklist_env) { // set the policies to tune for explicitly
635  std::stringstream blacklist_list(blacklist_env);
636 
637  int device_count;
638  cudaGetDeviceCount(&device_count);
639 
640  int excluded_device;
641  while (blacklist_list >> excluded_device) {
642  // check this is a valid device
643  if ( excluded_device < 0 || excluded_device >= device_count ) {
644  errorQuda("Cannot blacklist invalid GPU device ordinal %d", excluded_device);
645  }
646 
647  if (blacklist_list.peek() == ',') blacklist_list.ignore();
648  if (excluded_device == comm_gpuid()) blacklist = true;
649  }
650  comm_barrier();
651  if (getVerbosity() > QUDA_SILENT && blacklist) printf("Blacklisting GPU-Direct RDMA for rank %d (GPU %d)\n", comm_rank(), comm_gpuid());
652  }
653  blacklist_init = true;
654 
655  }
656 
657  return blacklist;
658 }
659 
// when false, reduceDouble/reduceDoubleArray skip the global reduction
static bool globalReduce = true;
// flag queried via commAsyncReduction(); only stored here -- presumably the
// async-reduction semantics are applied by callers (verify)
static bool asyncReduce = false;

// global max-reduce of a scalar (always performed, ignores globalReduce)
void reduceMaxDouble(double &max) { comm_allreduce_max(&max); }

// global sum-reduce of a scalar, honoring the globalReduce switch
void reduceDouble(double &sum) { if (globalReduce) comm_allreduce(&sum); }
666 
667 void reduceDoubleArray(double *sum, const int len)
669 
// thin wrappers mapping the legacy commX() API onto the comm_* functions

int commDim(int dir) { return comm_dim(dir); }

int commCoords(int dir) { return comm_coord(dir); }

int commDimPartitioned(int dir){ return comm_dim_partitioned(dir);}
675 
677 
679 
681 
// setter for the global-reduction switch used by reduceDouble/reduceDoubleArray
void commGlobalReductionSet(bool global_reduction) { globalReduce = global_reduction; }

// query the asynchronous-reduction flag (stored only; applied by callers)
bool commAsyncReduction() { return asyncReduce; }

// setter for the asynchronous-reduction flag
void commAsyncReductionSet(bool async_reduction) { asyncReduce = async_reduction; }
static int neighbor_rank[2][4]
double comm_drand(void)
Definition: comm_common.cpp:82
int comm_rank(void)
Definition: comm_mpi.cpp:120
int comm_peer2peer_enabled_global()
const int * comm_dims(const Topology *topo)
int commDimPartitioned(int dir)
static bool enable_intranode
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_mpi.cpp:174
void reduceDoubleArray(double *sum, const int len)
int comm_dim_partitioned(int dim)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * func
static bool globalReduce
void comm_set_default_topology(Topology *topo)
MsgHandle * comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
Definition: comm_mpi.cpp:226
#define errorQuda(...)
Definition: util_quda.h:90
void init()
Definition: blas_quda.cu:64
#define host_free(ptr)
Definition: malloc_quda.h:59
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
static int rank
Definition: comm_mpi.cpp:42
static bool neighbors_cached
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296
static int enable_peer_to_peer
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:263
cudaGraphNode_t * nodes
int comm_gpuid(void)
Definition: comm_mpi.cpp:132
void reduceDouble(double &sum)
char * comm_hostname(void)
Definition: comm_common.cpp:58
int comm_ndim(const Topology *topo)
bool comm_peer2peer_enabled(int dir, int dim)
void comm_destroy_topology(Topology *topo)
void comm_enable_intranode(bool enable)
Enable / disable intra-node (non-peer-to-peer) communication.
int comm_rank_from_coords(const Topology *topo, const int *coords)
#define b
static int ndim
Definition: layout_hyper.c:53
void comm_dim_partitioned_set(int dim)
MsgHandle * comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
Definition: comm_mpi.cpp:197
int strcmp(const char *__s1, const char *__s2)
const struct cudaDeviceProp * prop
static unsigned long int rand_seed
Definition: comm_common.cpp:73
int comm_size(void)
Definition: comm_mpi.cpp:126
const int * comm_coords_from_rank(const Topology *topo, int rank)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_mpi.cpp:151
int printf(const char *,...) __attribute__((__format__(__printf__
static bool enable_p2p
int my_coords[QUDA_MAX_DIM]
Definition: comm_common.cpp:14
MsgHandle * comm_declare_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
int commDim(int dir)
void commDimPartitionedSet(int dir)
Topology * default_topo
void comm_peer2peer_init(const char *hostname_recv_buf)
bool commGlobalReduction()
__host__ __device__ void sum(double &a, double &b)
void reduceMaxDouble(double &max)
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
int dims[QUDA_MAX_DIM]
Definition: comm_common.cpp:10
size_t enum cudaMemRangeAttribute * attributes
static int gpuid
Definition: comm_mpi.cpp:44
int(* coords)[QUDA_MAX_DIM]
Definition: comm_common.cpp:12
cudaError_t err
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
static bool advance_coords(int ndim, const int *dims, int *x)
Definition: comm_common.cpp:42
int comm_dim(int dim)
#define safe_malloc(size)
Definition: malloc_quda.h:54
void comm_dim_partitioned_reset()
static bool peer2peer_init
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
int commCoords(int dir)
#define checkCudaErrorNoSync()
Definition: util_quda.h:113
int comm_coord(int dim)
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
void commGlobalReductionSet(bool global_reduction)
int comm_rank_displaced(const Topology *topo, const int displacement[])
int abs(int) __attribute__((const))
const int * comm_coords(const Topology *topo)
void comm_gather_gpuid(int *gpuid_recv_buf)
Gather all GPU ids.
Definition: comm_mpi.cpp:56
int strncmp(const char *__s1, const char *__s2, size_t __n)
#define printfQuda(...)
Definition: util_quda.h:84
void commAsyncReductionSet(bool async_reduction)
int atoi(const char *)
void comm_allreduce_int(int *data)
Definition: comm_mpi.cpp:305
int comm_neighbor_rank(int dir, int dim)
#define device_malloc(size)
Definition: malloc_quda.h:52
MsgHandle * comm_declare_strided_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
Topology * comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_common.cpp:94
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
MsgHandle * comm_declare_strided_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
#define checkCudaError()
Definition: util_quda.h:129
static int mod(int a, int b)
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:281
static bool intranode_enabled[2][4]
void comm_allreduce_max(double *data)
Definition: comm_mpi.cpp:289
bool comm_intranode_enabled(int dir, int dim)
static bool peer2peer_enabled[2][4]
void comm_finalize(void)
static __inline__ size_t size_t d
bool commAsyncReduction()
void comm_set_neighbor_ranks(Topology *topo)
char * getenv(const char *)
#define a
Topology * comm_default_topology(void)
MsgHandle * comm_declare_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
static bool asyncReduce
static int manual_set_partition[QUDA_MAX_DIM]
#define device_free(ptr)
Definition: malloc_quda.h:57
void comm_barrier(void)
Definition: comm_mpi.cpp:328