QUDA v1.1.0
A library for QCD on GPUs
communicator_quda.h
1 #pragma once
2 
3 #include <unistd.h> // for gethostname()
4 #include <assert.h>
5 #include <limits>
6 
7 #include <quda_internal.h>
8 #include <comm_quda.h>
9 #include <csignal>
10 
11 #include <comm_key.h>
12 
13 #include <algorithm>
14 #include <numeric>
15 
16 #if defined(MPI_COMMS) || defined(QMP_COMMS)
17 #include <mpi.h>
18 #endif
19 
20 #if defined(QMP_COMMS)
21 #include <qmp.h>
22 #endif
23 
24 #ifdef QUDA_BACKWARDSCPP
25 #include "backward.hpp"
26 namespace backward
27 {
28  static backward::SignalHandling sh;
29 } // namespace backward
30 #endif
31 
32 struct Topology_s {
33  int ndim;
34  int dims[QUDA_MAX_DIM];
35  int *ranks;
36  int (*coords)[QUDA_MAX_DIM];
37  int my_rank;
38  int my_coords[QUDA_MAX_DIM];
39  // It might be worth adding communicators to allow for efficient reductions:
40  // #if defined(MPI_COMMS)
41  // MPI_Comm comm;
42  // #elif defined(QMP_COMMS)
43  // QMP_communicator_t comm; // currently only supported by qmp-2.4.0-alpha
44  // #endif
45 };
46 
47 static const int max_displacement = 4;
48 
49 inline int lex_rank_from_coords_dim_t(const int *coords, void *fdata)
50 {
51  int *dims = reinterpret_cast<int *>(fdata);
52  int rank = coords[0];
53  for (int i = 1; i < 4; i++) { rank = dims[i] * rank + coords[i]; }
54  return rank;
55 }
56 
57 inline int lex_rank_from_coords_dim_x(const int *coords, void *fdata)
58 {
59  int *dims = reinterpret_cast<int *>(fdata);
60  int rank = coords[3];
61  for (int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; }
62  return rank;
63 }
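A quick worked example of the two orderings (illustrative, not part of the header): with dims = {2, 2, 2, 2} and coords = {1, 0, 0, 0}, lex_rank_from_coords_dim_t gives ((1·2 + 0)·2 + 0)·2 + 0 = 8, because the last (t) index varies fastest, whereas lex_rank_from_coords_dim_x gives ((0·2 + 0)·2 + 0)·2 + 1 = 1, because the first (x) index varies fastest.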
64 
73 static inline int index(int ndim, const int *dims, const int *x)
74 {
75  int idx = x[0];
76  for (int i = 1; i < ndim; i++) { idx = dims[i] * idx + x[i]; }
77  return idx;
78 }
79 
80 static inline bool advance_coords(int ndim, const int *dims, int *x)
81 {
82  bool valid = false;
83  for (int i = ndim - 1; i >= 0; i--) {
84  if (x[i] < dims[i] - 1) {
85  x[i]++;
86  valid = true;
87  break;
88  } else {
89  x[i] = 0;
90  }
91  }
92  return valid;
93 }
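advance_coords behaves like an odometer over the process grid. For example (illustrative only), with ndim = 2 and dims = {2, 3}, repeated calls step x through {0,0} → {0,1} → {0,2} → {1,0} → {1,1} → {1,2}; the next call wraps back to {0,0} and returns false, signalling that every coordinate has been visited.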
94 
95 // QudaCommsMap is declared in quda.h:
96 // typedef int (*QudaCommsMap)(const int *coords, void *fdata);
97 Topology *comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, int my_rank);
98 
99 inline void comm_destroy_topology(Topology *topo)
100 {
101  delete [] topo->ranks;
102  delete [] topo->coords;
103  delete topo;
104 }
105 
106 inline int comm_ndim(const Topology *topo) { return topo->ndim; }
107 
108 inline const int *comm_dims(const Topology *topo) { return topo->dims; }
109 
110 inline const int *comm_coords(const Topology *topo) { return topo->my_coords; }
111 
112 inline const int *comm_coords_from_rank(const Topology *topo, int rank) { return topo->coords[rank]; }
113 
114 inline int comm_rank_from_coords(const Topology *topo, const int *coords)
115 {
116  return topo->ranks[index(topo->ndim, topo->dims, coords)];
117 }
118 
119 static inline int mod(int a, int b) { return ((a % b) + b) % b; }
120 
121 inline int comm_rank_displaced(const Topology *topo, const int displacement[])
122 {
123  int coords[QUDA_MAX_DIM];
124 
125  for (int i = 0; i < QUDA_MAX_DIM; i++) {
126  coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0;
127  }
128 
129  return comm_rank_from_coords(topo, coords);
130 }
131 
132 inline bool isHost(const void *buffer)
133 {
134  CUmemorytype memType;
135  void *attrdata[] = {(void *)&memType};
136  CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
137  CUresult err = cuPointerGetAttributes(1, attributes, attrdata, (CUdeviceptr)buffer);
138  if (err != CUDA_SUCCESS) {
139  const char *str;
140  cuGetErrorName(err, &str);
141  errorQuda("cuPointerGetAttributes returned error %s", str);
142  }
143 
144  switch (memType) {
145  case CU_MEMORYTYPE_DEVICE: return false;
146  case CU_MEMORYTYPE_ARRAY: errorQuda("Using array memory for communications buffer is not supported");
147  case CU_MEMORYTYPE_UNIFIED: errorQuda("Using unified memory for communications buffer is not supported");
148  case CU_MEMORYTYPE_HOST:
149  default: // memory not allocated by CUDA allocators defaults to host memory
150  return true;
151  }
152 }
153 
154 inline void check_displacement(const int displacement[], int ndim)
155 {
156  for (int i = 0; i < ndim; i++) {
157  if (abs(displacement[i]) > max_displacement) {
158  errorQuda("Requested displacement[%d] = %d is greater than maximum allowed", i, displacement[i]);
159  }
160  }
161 }
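The free functions above form a small, self-contained topology API. The sketch below is illustrative only and not part of the header; it assumes a 4-rank job arranged as a 1×1×1×4 grid and that QUDA's allocation and printing macros are available. It shows how a topology is created, queried for a displaced neighbour, and destroyed:

  // illustrative sketch: 4 ranks arranged as a 1x1x1x4 process grid (assumed setup)
  int dims[4] = {1, 1, 1, 4};
  int my_rank = 2; // example rank
  Topology *topo = comm_create_topology(4, dims, lex_rank_from_coords_dim_t, dims, my_rank);

  // displacement of +1 in the t direction; comm_rank_displaced wraps
  // periodically via mod(), so rank 3's forward-t neighbour is rank 0
  int disp[QUDA_MAX_DIM] = {0, 0, 0, +1};
  check_displacement(disp, comm_ndim(topo));
  int fwd_t = comm_rank_displaced(topo, disp);
  printfQuda("t coordinate = %d, forward-t neighbour = %d\n", comm_coords(topo)[3], fwd_t);

  comm_destroy_topology(topo);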
162 
163 struct Communicator {
164 
168  static int gpuid;
169  static int comm_gpuid() { return gpuid; }
170 
175 
176  bool peer2peer_enabled[2][4] = {{false, false, false, false}, {false, false, false, false}};
177  bool peer2peer_init = false;
178 
179  bool intranode_enabled[2][4] = {{false, false, false, false}, {false, false, false, false}};
180 
183  bool peer2peer_present = false;
184 
 187 
 188  int enable_peer_to_peer = 3; // by default enable both copy-engine and direct load/store peer-to-peer access
189  int enable_p2p_max_access_rank = std::numeric_limits<int>::max();
190 
191  void comm_peer2peer_init(const char *hostname_recv_buf)
192  {
193  if (peer2peer_init) return;
194 
195  // set gdr enablement
196  if (comm_gdr_enabled()) {
197  if (getVerbosity() > QUDA_SILENT && rank == 0) printf("Enabling GPU-Direct RDMA access\n");
198  comm_gdr_blacklist(); // set GDR blacklist
199  // by default, if GDR is enabled we disable non-p2p policies to
200  // prevent possible conflict between MPI and QUDA opening the same
201  // IPC memory handles when using CUDA-aware MPI
202  enable_peer_to_peer += 4;
203  } else {
204  if (getVerbosity() > QUDA_SILENT && rank == 0) printf("Disabling GPU-Direct RDMA access\n");
205  }
206 
207  char *enable_peer_to_peer_env = getenv("QUDA_ENABLE_P2P");
208 
209  // disable peer-to-peer comms in one direction if QUDA_ENABLE_P2P=-1
210  // and comm_dim(dim) == 2 (used for perf benchmarking)
211  bool disable_peer_to_peer_bidir = false;
212 
213  if (enable_peer_to_peer_env) {
214  enable_peer_to_peer = atoi(enable_peer_to_peer_env);
215 
216  switch (std::abs(enable_peer_to_peer)) {
217  case 0:
218  if (getVerbosity() > QUDA_SILENT && rank == 0) printf("Disabling peer-to-peer access\n");
219  break;
220  case 1:
221  if (getVerbosity() > QUDA_SILENT && rank == 0)
222  printf("Enabling peer-to-peer copy engine access (disabling direct load/store)\n");
223  break;
224  case 2:
225  if (getVerbosity() > QUDA_SILENT && rank == 0)
226  printf("Enabling peer-to-peer direct load/store access (disabling copy engines)\n");
227  break;
228  case 3:
229  if (getVerbosity() > QUDA_SILENT && rank == 0)
230  printf("Enabling peer-to-peer copy engine and direct load/store access\n");
231  break;
232  case 5:
233  if (getVerbosity() > QUDA_SILENT && rank == 0)
234  printf("Enabling peer-to-peer copy engine access (disabling direct load/store and non-p2p policies)\n");
235  break;
236  case 6:
237  if (getVerbosity() > QUDA_SILENT && rank == 0)
238  printf("Enabling peer-to-peer direct load/store access (disabling copy engines and non-p2p policies)\n");
239  break;
240  case 7:
241  if (getVerbosity() > QUDA_SILENT && rank == 0)
242  printf("Enabling peer-to-peer copy engine and direct load/store access (disabling non-p2p policies)\n");
243  break;
244  default: errorQuda("Unexpected value QUDA_ENABLE_P2P=%d\n", enable_peer_to_peer);
245  }
246 
247  if (enable_peer_to_peer < 0) { // only values -1, -2, -3 can make it here
248  if (getVerbosity() > QUDA_SILENT && rank == 0) printf("Disabling bi-directional peer-to-peer access\n");
249  disable_peer_to_peer_bidir = true;
250  }
251 
 252  enable_peer_to_peer = abs(enable_peer_to_peer);
 253 
254  } else { // !enable_peer_to_peer_env
255  if (getVerbosity() > QUDA_SILENT && rank == 0)
256  printf("Enabling peer-to-peer copy engine and direct load/store access\n");
257  }
258 
260 
261  // set whether we are limiting p2p enablement
262  char *enable_p2p_max_access_rank_env = getenv("QUDA_ENABLE_P2P_MAX_ACCESS_RANK");
263  if (enable_p2p_max_access_rank_env) {
264  enable_p2p_max_access_rank = atoi(enable_p2p_max_access_rank_env);
 265  if (enable_p2p_max_access_rank < 0)
 266  errorQuda("Invalid QUDA_ENABLE_P2P_MAX_ACCESS_RANK=%d\n", enable_p2p_max_access_rank);
 267  if (getVerbosity() > QUDA_SILENT)
 268  printfQuda(
 269  "Limiting peer-to-peer communication to a maximum access rank of %d (lower ranks have higher bandwidth)\n",
 270  enable_p2p_max_access_rank);
 271  }
272 
273  // first check that the local GPU supports UVA
274  const int gpuid = comm_gpuid();
275  cudaDeviceProp prop;
276  cudaGetDeviceProperties(&prop, gpuid);
277  if (!prop.unifiedAddressing) return;
278 
280 
281  char *hostname = comm_hostname();
282  int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int) * comm_size());
283 
284  comm_gather_gpuid(gpuid_recv_buf);
285 
286  for (int dir = 0; dir < 2; ++dir) { // forward/backward directions
287  for (int dim = 0; dim < 4; ++dim) {
 288  int neighbor_rank = comm_neighbor_rank(dir, dim);
 289  if (neighbor_rank == comm_rank()) continue;
290 
291  // disable peer-to-peer comms in one direction
292  if (((comm_rank() > neighbor_rank && dir == 0) || (comm_rank() < neighbor_rank && dir == 1))
293  && disable_peer_to_peer_bidir && comm_dim(dim) == 2)
294  continue;
295 
 296  // if the neighbors are on the same node
297  if (!strncmp(hostname, &hostname_recv_buf[128 * neighbor_rank], 128)) {
298  int neighbor_gpuid = gpuid_recv_buf[neighbor_rank];
299  int canAccessPeer[2];
300  cudaDeviceCanAccessPeer(&canAccessPeer[0], gpuid, neighbor_gpuid);
301  cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid, gpuid);
302 
303  int accessRank[2] = {};
304  if (canAccessPeer[0] * canAccessPeer[1] != 0) {
305  cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank, gpuid, neighbor_gpuid);
306  cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid, gpuid);
307  }
308 
309  // enable P2P if we can access the peer or if peer is self
310  // if (canAccessPeer[0] * canAccessPeer[1] != 0 || gpuid == neighbor_gpuid) {
311  if ((canAccessPeer[0] * canAccessPeer[1] != 0 && accessRank[0] <= enable_p2p_max_access_rank
312  && accessRank[1] <= enable_p2p_max_access_rank)
313  || gpuid == neighbor_gpuid) {
314  peer2peer_enabled[dir][dim] = true;
315  if (getVerbosity() > QUDA_SILENT) {
316  printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, "
317  "access rank = (%3d, %3d)\n",
318  comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, accessRank[0], accessRank[1]);
319  }
320  } else {
321  intranode_enabled[dir][dim] = true;
322  if (getVerbosity() > QUDA_SILENT) {
323  printf("Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, "
324  "dim=%d\n",
325  comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim);
326  }
327  }
328 
329  } // on the same node
330  } // different dimensions - x, y, z, t
331  } // different directions - forward/backward
332 
333  host_free(gpuid_recv_buf);
334  }
335 
336  peer2peer_init = true;
337 
338  comm_barrier();
339 
341 
343  }
344 
 345  bool comm_peer2peer_present() { return peer2peer_present; }
 346 
347  bool enable_p2p = true;
348 
349  bool comm_peer2peer_enabled(int dir, int dim) { return enable_p2p ? peer2peer_enabled[dir][dim] : false; }
350 
351  bool init = false;
352  bool p2p_global = false;
353 
 354  int comm_peer2peer_enabled_global()
 355  {
356 
357  if (!enable_p2p) return false;
358 
359  if (!init) {
360  int p2p = 0;
361  for (int dim = 0; dim < 4; dim++)
362  for (int dir = 0; dir < 2; dir++) p2p += (int)comm_peer2peer_enabled(dir, dim);
363 
364  comm_allreduce_int(&p2p);
365  init = true;
366  p2p_global = p2p > 0 ? true : false;
367  }
 368  return p2p_global * enable_peer_to_peer;
 369  }
370 
371  void comm_enable_peer2peer(bool enable) { enable_p2p = enable; }
372 
373  bool enable_intranode = true;
374 
375  bool comm_intranode_enabled(int dir, int dim) { return enable_intranode ? intranode_enabled[dir][dim] : false; }
376 
377  void comm_enable_intranode(bool enable) { enable_intranode = enable; }
378 
379  Topology *default_topo = nullptr;
380 
 381  void comm_set_default_topology(Topology *topo) { default_topo = topo; }
 382 
 383  Topology *comm_default_topology(void)
 384  {
385  if (!default_topo) { errorQuda("Default topology has not been declared"); }
386  return default_topo;
387  }
388 
389  int neighbor_rank[2][4] = {{-1, -1, -1, -1}, {-1, -1, -1, -1}};
390 
391  bool neighbors_cached = false;
392 
393  void comm_set_neighbor_ranks(Topology *topo = nullptr)
394  {
395 
396  if (neighbors_cached) return;
397 
398  Topology *topology = topo ? topo : default_topo; // use default topology if topo is NULL
399  if (!topology) { errorQuda("Topology not specified"); }
400 
401  for (int d = 0; d < 4; ++d) {
402  int pos_displacement[QUDA_MAX_DIM] = {};
403  int neg_displacement[QUDA_MAX_DIM] = {};
404  pos_displacement[d] = +1;
405  neg_displacement[d] = -1;
406  neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement);
407  neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement);
408  }
409  neighbors_cached = true;
410  }
411 
412  int comm_neighbor_rank(int dir, int dim)
413  {
 414  if (!neighbors_cached) { comm_set_neighbor_ranks(); }
 415  return neighbor_rank[dir][dim];
416  }
417 
418  int comm_dim(int dim)
419  {
 420  Topology *topo = comm_default_topology();
 421  return comm_dims(topo)[dim];
422  }
423 
424  int comm_coord(int dim)
425  {
 426  Topology *topo = comm_default_topology();
 427  return comm_coords(topo)[dim];
428  }
429 
430  void comm_finalize(void)
431  {
 432  Topology *topo = comm_default_topology();
 433  comm_destroy_topology(topo);
 434  comm_set_default_topology(nullptr);
 435  }
436 
437  char partition_string[16];
438  char topology_string[128];
 439  char partition_override_string[16];
 440 
 441  int manual_set_partition[QUDA_MAX_DIM] = {0};
 442 
 443  void comm_dim_partitioned_set(int dim)
 444  {
 445 #ifdef MULTI_GPU
 446  manual_set_partition[dim] = 1;
 447 #endif
 448 
 449  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0), comm_dim_partitioned(1),
 450  comm_dim_partitioned(2), comm_dim_partitioned(3));
 451  }
452 
 453  void comm_dim_partitioned_reset()
 454  {
 455  for (int i = 0; i < QUDA_MAX_DIM; i++) manual_set_partition[i] = 0;
 456 
 457  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0), comm_dim_partitioned(1),
 458  comm_dim_partitioned(2), comm_dim_partitioned(3));
 459  }
460 
461 #ifdef MULTI_GPU
462  int comm_dim_partitioned(int dim) { return (manual_set_partition[dim] || (default_topo && comm_dim(dim) > 1)); }
463 #else
464  int comm_dim_partitioned(int) { return 0; }
465 #endif
466 
468  {
469  int partitioned = 0;
470  for (int i = 0; i < 4; i++) { partitioned = partitioned || comm_dim_partitioned(i); }
471  return partitioned;
472  }
473 
474  bool gdr_enabled = false;
475 
476 #ifdef MULTI_GPU
477  bool gdr_init = false;
478 #endif
479 
 480  bool comm_gdr_enabled()
 481  {
482 #ifdef MULTI_GPU
483 
484  if (!gdr_init) {
485  char *enable_gdr_env = getenv("QUDA_ENABLE_GDR");
486  if (enable_gdr_env && strcmp(enable_gdr_env, "1") == 0) { gdr_enabled = true; }
487  gdr_init = true;
488  }
489 #endif
490  return gdr_enabled;
491  }
492 
493  bool blacklist = false;
494  bool blacklist_init = false;
495 
 496  bool comm_gdr_blacklist()
 497  {
498  if (!blacklist_init) {
499  char *blacklist_env = getenv("QUDA_ENABLE_GDR_BLACKLIST");
500 
 501  if (blacklist_env) { // parse the comma-separated list of device ordinals to exclude from GDR
502  std::stringstream blacklist_list(blacklist_env);
503 
504  int device_count;
505  cudaGetDeviceCount(&device_count);
506 
507  int excluded_device;
508  while (blacklist_list >> excluded_device) {
509  // check this is a valid device
510  if (excluded_device < 0 || excluded_device >= device_count) {
511  errorQuda("Cannot blacklist invalid GPU device ordinal %d", excluded_device);
512  }
513 
514  if (blacklist_list.peek() == ',') blacklist_list.ignore();
515  if (excluded_device == comm_gpuid()) blacklist = true;
516  }
517  comm_barrier();
519  printf("Blacklisting GPU-Direct RDMA for rank %d (GPU %d)\n", comm_rank(), comm_gpuid());
520  }
521  blacklist_init = true;
522  }
523 
524  return blacklist;
525  }
526 
 527  bool comm_nvshmem_enabled()
 528  {
529 #if (defined MULTI_GPU) && (defined NVSHMEM_COMMS)
530  static bool nvshmem_enabled = true;
531  static bool nvshmem_init = false;
532  if (!nvshmem_init) {
533  char *enable_nvshmem_env = getenv("QUDA_ENABLE_NVSHMEM");
534  if (enable_nvshmem_env && strcmp(enable_nvshmem_env, "1") == 0) { nvshmem_enabled = true; }
535  if (enable_nvshmem_env && strcmp(enable_nvshmem_env, "0") == 0) { nvshmem_enabled = false; }
536  nvshmem_init = true;
537  }
538 #else
539  static bool nvshmem_enabled = false;
540 #endif
541  return nvshmem_enabled;
542  }
543 
 544  bool use_deterministic_reduce = false;
 545 
546  void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
547  {
548  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data, comm_rank());
 549  comm_set_default_topology(topo);
 550 
551  // determine which GPU this rank will use
552  char *hostname_recv_buf = (char *)safe_malloc(128 * comm_size());
553  comm_gather_hostname(hostname_recv_buf);
554 
555  if (gpuid < 0) {
556 
557  int device_count;
558  cudaGetDeviceCount(&device_count);
559  if (device_count == 0) { errorQuda("No CUDA devices found"); }
560 
561  // We initialize gpuid if it's still negative.
562  gpuid = 0;
563  for (int i = 0; i < comm_rank(); i++) {
564  if (!strncmp(comm_hostname(), &hostname_recv_buf[128 * i], 128)) { gpuid++; }
565  }
566 
567  // At this point we had either pulled a gpuid from an env var or from the old way.
568  if (gpuid >= device_count) {
569  char *enable_mps_env = getenv("QUDA_ENABLE_MPS");
570  if (enable_mps_env && strcmp(enable_mps_env, "1") == 0) {
571  gpuid = gpuid % device_count;
572  printf("MPS enabled, rank=%3d -> gpu=%d\n", comm_rank(), gpuid);
573  } else {
574  errorQuda("Too few GPUs available on %s", comm_hostname());
575  }
576  }
577  } // -ve gpuid
578 
579  comm_peer2peer_init(hostname_recv_buf);
580 
581  host_free(hostname_recv_buf);
582 
583  char *enable_reduce_env = getenv("QUDA_DETERMINISTIC_REDUCE");
584  if (enable_reduce_env && strcmp(enable_reduce_env, "1") == 0) { use_deterministic_reduce = true; }
585 
586  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0), comm_dim_partitioned(1),
 587  comm_dim_partitioned(2), comm_dim_partitioned(3));
 588 
589  // if CUDA_VISIBLE_DEVICES is set, we include this information in the topology_string
590  char *device_order_env = getenv("CUDA_VISIBLE_DEVICES");
591  if (device_order_env) {
592 
593  // to ensure we have process consistency define using rank 0
594  if (comm_rank() == 0) {
595  std::stringstream device_list_raw(device_order_env); // raw input
596  std::stringstream device_list; // formatted (no commas)
597 
598  int device;
599  int deviceCount;
600  cudaGetDeviceCount(&deviceCount);
601  while (device_list_raw >> device) {
 602  // check this is a valid device ordinal
603  if (device < 0) { errorQuda("Invalid CUDA_VISIBLE_DEVICE ordinal %d", device); }
604 
605  device_list << device;
606  if (device_list_raw.peek() == ',') device_list_raw.ignore();
607  }
608  snprintf(topology_string, 128, ",topo=%d%d%d%d,order=%s", comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3),
609  device_list.str().c_str());
610  }
611 
 612  comm_broadcast(topology_string, 128);
 613  } else {
614  snprintf(topology_string, 128, ",topo=%d%d%d%d", comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3));
615  }
616  }
617 
618  char config_string[64];
619  bool config_init = false;
620 
621  const char *comm_config_string()
622  {
623  if (!config_init) {
624  strcpy(config_string, ",p2p=");
625  strcat(config_string, std::to_string(comm_peer2peer_enabled_global()).c_str());
626  if (enable_p2p_max_access_rank != std::numeric_limits<int>::max()) {
627  strcat(config_string, ",p2p_max_access_rank=");
628  strcat(config_string, std::to_string(enable_p2p_max_access_rank).c_str());
629  }
630  strcat(config_string, ",gdr=");
631  strcat(config_string, std::to_string(comm_gdr_enabled()).c_str());
632  strcat(config_string, ",nvshmem=");
633  strcat(config_string, std::to_string(comm_nvshmem_enabled()).c_str());
634  config_init = true;
635  }
636 
637  return config_string;
638  }
639 
640  const char *comm_dim_partitioned_string(const int *comm_dim_override)
641  {
642  if (comm_dim_override) {
643  char comm[5] = {(!comm_dim_partitioned(0) ? '0' : comm_dim_override[0] ? '1' : '0'),
644  (!comm_dim_partitioned(1) ? '0' : comm_dim_override[1] ? '1' : '0'),
645  (!comm_dim_partitioned(2) ? '0' : comm_dim_override[2] ? '1' : '0'),
646  (!comm_dim_partitioned(3) ? '0' : comm_dim_override[3] ? '1' : '0'), '\0'};
647  strcpy(partition_override_string, ",comm=");
648  strcat(partition_override_string, comm);
 649  return partition_override_string;
 650  } else {
651  return partition_string;
652  }
653  }
654 
655  const char *comm_dim_topology_string() { return topology_string; }
656 
658 
659  bool globalReduce = true;
660  bool asyncReduce = false;
661 
662  void reduceMaxDouble(double &max) { comm_allreduce_max(&max); }
663 
664  void reduceDouble(double &sum)
665  {
 666  if (globalReduce) comm_allreduce(&sum);
 667  }
668 
669  void reduceDoubleArray(double *sum, const int len)
670  {
 671  if (globalReduce) comm_allreduce_array(sum, len);
 672  }
673 
674  int commDim(int dir) { return comm_dim(dir); }
675 
676  int commCoords(int dir) { return comm_coord(dir); }
677 
678  int commDimPartitioned(int dir) { return comm_dim_partitioned(dir); }
679 
 680  void commDimPartitionedSet(int dir) { comm_dim_partitioned_set(dir); }
 681 
 682  void commDimPartitionedReset() { comm_dim_partitioned_reset(); }
 683 
 684  bool commGlobalReduction() { return globalReduce; }
 685 
686  void commGlobalReductionSet(bool global_reduction) { globalReduce = global_reduction; }
687 
688  bool commAsyncReduction() { return asyncReduce; }
689 
690  void commAsyncReductionSet(bool async_reduction) { asyncReduce = async_reduction; }
691 
692 #if defined(QMP_COMMS) || defined(MPI_COMMS)
693  MPI_Comm MPI_COMM_HANDLE;
694 #endif
695 
696 #if defined(QMP_COMMS)
697  QMP_comm_t QMP_COMM_HANDLE;
698 
703  bool is_qmp_handle_default;
704 #endif
705 
706  int rank = -1;
707  int size = -1;
708 
710 
711  Communicator(Communicator &other, const int *comm_split);
712 
713  Communicator(int nDim, const int *commDims, QudaCommsMap rank_from_coords, void *map_data,
714  bool user_set_comm_handle = false, void *user_comm = nullptr);
715 
716  ~Communicator();
717 
718  void comm_gather_hostname(char *hostname_recv_buf);
719 
720  void comm_gather_gpuid(int *gpuid_recv_buf);
721 
722  void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data);
723 
724  int comm_rank(void);
725 
726  int comm_size(void);
727 
728  int comm_rank_from_coords(const int *coords)
729  {
 730  Topology *topo = comm_default_topology();
 731  return ::comm_rank_from_coords(topo, coords);
732  }
733 
737  MsgHandle *comm_declare_send_rank(void *buffer, int rank, int tag, size_t nbytes);
738 
742  MsgHandle *comm_declare_recv_rank(void *buffer, int rank, int tag, size_t nbytes);
743 
747  MsgHandle *comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes);
748 
752  MsgHandle *comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes);
753 
757  MsgHandle *comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks,
758  size_t stride);
759 
763  MsgHandle *comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks,
764  size_t stride);
765 
766  void comm_free(MsgHandle *&mh);
767 
768  void comm_start(MsgHandle *mh);
769 
770  void comm_wait(MsgHandle *mh);
771 
772  int comm_query(MsgHandle *mh);
773 
774  template <typename T> T deterministic_reduce(T *array, int n)
775  {
776  std::sort(array, array + n); // sort reduction into ascending order for deterministic reduction
777  return std::accumulate(array, array + n, 0.0);
778  }
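deterministic_reduce trades a little speed for reproducibility: sorting the partial results into ascending order fixes the summation order, so floating-point rounding is identical from run to run regardless of how the partials were produced. A minimal usage sketch (illustrative; it assumes comm is a Communicator instance, and note that the array is sorted in place):

  double partials[4] = {1.0e-16, 1.0, -1.0, 1.0e-16};
  // the same result is obtained on every run, independent of the original ordering
  double local_sum = comm.deterministic_reduce(partials, 4);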
779 
780  void comm_allreduce(double *data);
781 
782  void comm_allreduce_max(double *data);
783 
784  void comm_allreduce_min(double *data);
785 
786  void comm_allreduce_array(double *data, size_t size);
787 
788  void comm_allreduce_max_array(double *data, size_t size);
789 
790  void comm_allreduce_int(int *data);
791 
792  void comm_allreduce_xor(uint64_t *data);
793 
795  void comm_broadcast(void *data, size_t nbytes);
796 
797  void comm_barrier(void);
798 
799  static void comm_abort_(int status);
800 
801  static int comm_rank_global();
802 };
803 
804 constexpr quda::CommKey default_comm_key = {1, 1, 1, 1};
805 
806 void push_communicator(const quda::CommKey &split_key);
807 
809 void comm_broadcast_global(void *data, size_t nbytes);
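The MsgHandle members of Communicator follow a declare / start / wait / free pattern. The sketch below is illustrative only; the Communicator instance comm, the buffer size, and the use of host buffers are assumptions. It outlines a forward halo exchange in the t direction using the displaced-declaration variants declared above:

  size_t nbytes = 1024; // assumed message size
  void *send_buf = safe_malloc(nbytes);
  void *recv_buf = safe_malloc(nbytes);

  int fwd[QUDA_MAX_DIM] = {0, 0, 0, +1};  // send to the forward-t neighbour
  int back[QUDA_MAX_DIM] = {0, 0, 0, -1}; // receive from the backward-t neighbour

  MsgHandle *mh_send = comm.comm_declare_send_displaced(send_buf, fwd, nbytes);
  MsgHandle *mh_recv = comm.comm_declare_receive_displaced(recv_buf, back, nbytes);

  comm.comm_start(mh_recv);
  comm.comm_start(mh_send);
  comm.comm_wait(mh_send);
  comm.comm_wait(mh_recv);

  comm.comm_free(mh_send);
  comm.comm_free(mh_recv);
  host_free(send_buf);
  host_free(recv_buf);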