QUDA  1.0.0
comm_common.cpp
#include <unistd.h> // for gethostname()
#include <assert.h>

#include <quda_internal.h>
#include <comm_quda.h>


struct Topology_s {
  int ndim;
  int dims[QUDA_MAX_DIM];
  int *ranks;
  int (*coords)[QUDA_MAX_DIM];
  int my_rank;
  int my_coords[QUDA_MAX_DIM];
  // It might be worth adding communicators to allow for efficient reductions:
  // #if defined(MPI_COMMS)
  //   MPI_Comm comm;
  // #elif defined(QMP_COMMS)
  //   QMP_communicator_t comm; // currently only supported by qmp-2.4.0-alpha
  // #endif
};

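// index() maps a grid coordinate x[0..ndim-1] to its lexicographic (row-major)
// offset within the dims[] grid; x[0] is the slowest-varying coordinate.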
static inline int index(int ndim, const int *dims, const int *x)
{
  int idx = x[0];
  for (int i = 1; i < ndim; i++) {
    idx = dims[i]*idx + x[i];
  }
  return idx;
}

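// advance_coords() steps x[] through the grid like an odometer (the last
// dimension varies fastest); it returns false exactly once, after the last
// coordinate wraps back to the origin, which terminates the do/while loop
// in comm_create_topology() below.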
static inline bool advance_coords(int ndim, const int *dims, int *x)
{
  bool valid = false;
  for (int i = ndim-1; i >= 0; i--) {
    if (x[i] < dims[i]-1) {
      x[i]++;
      valid = true;
      break;
    } else {
      x[i] = 0;
    }
  }
  return valid;
}


char *comm_hostname(void)
{
  static bool cached = false;
  static char hostname[128];

  if (!cached) {
    gethostname(hostname, 128);
    hostname[127] = '\0';
    cached = true;
  }

  return hostname;
}


static unsigned long int rand_seed = 137;

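// comm_drand() is a 48-bit linear congruential generator using drand48-style
// constants (multiplier 25214903917 = 0x5DEECE66D, increment 11, state masked
// to 48 bits); the state is scaled by 2^-48 to give a uniform double in [0,1).
// The seed is made rank-dependent in comm_create_topology() below.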
double comm_drand(void)
{
  const double twoneg48 = 0.35527136788005009e-14;
  const unsigned long int m = 25214903917, a = 11, mask = 281474976710655;
  rand_seed = (m * rand_seed + a) & mask;
  return (twoneg48 * rand_seed);
}


// QudaCommsMap is declared in quda.h:
//   typedef int (*QudaCommsMap)(const int *coords, void *fdata);
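//
// As an illustration (not part of this file), a caller typically supplies a
// lexicographic mapping such as the following, with map_data pointing at the
// array of grid dimensions:
//
//   static int lex_rank_from_coords(const int *coords, void *fdata)
//   {
//     const int *dims = (const int *)fdata;
//     int rank = coords[0];
//     for (int i = 1; i < 4; i++) rank = dims[i] * rank + coords[i];
//     return rank;
//   }
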
Topology *comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  if (ndim > QUDA_MAX_DIM) {
    errorQuda("ndim exceeds QUDA_MAX_DIM");
  }

  Topology *topo = (Topology *) safe_malloc(sizeof(Topology));

  topo->ndim = ndim;

  int nodes = 1;
  for (int i=0; i<ndim; i++) {
    topo->dims[i] = dims[i];
    nodes *= dims[i];
  }

  topo->ranks = (int *) safe_malloc(nodes*sizeof(int));
  topo->coords = (int (*)[QUDA_MAX_DIM]) safe_malloc(nodes*sizeof(int[QUDA_MAX_DIM]));

  int x[QUDA_MAX_DIM];
  for (int i = 0; i < QUDA_MAX_DIM; i++) x[i] = 0;

  do {
    int rank = rank_from_coords(x, map_data);
    topo->ranks[index(ndim, dims, x)] = rank;
    for (int i=0; i<ndim; i++) {
      topo->coords[rank][i] = x[i];
    }
  } while (advance_coords(ndim, dims, x));

  int my_rank = comm_rank();
  topo->my_rank = my_rank;
  for (int i = 0; i < ndim; i++) {
    topo->my_coords[i] = topo->coords[my_rank][i];
  }

  // initialize the random number generator with a rank-dependent seed
  rand_seed = 17*my_rank + 137;

  return topo;
}


void comm_destroy_topology(Topology *topo)
{
  host_free(topo->ranks);
  host_free(topo->coords);
  host_free(topo);
}

static int gpuid = -1;

int comm_gpuid(void) { return gpuid; }

static bool peer2peer_enabled[2][4] = { {false,false,false,false},
                                        {false,false,false,false} };
static bool peer2peer_init = false;

static bool intranode_enabled[2][4] = { {false,false,false,false},
                                        {false,false,false,false} };

static bool peer2peer_present = false;

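// Peer-to-peer policy, settable via QUDA_ENABLE_P2P (see comm_peer2peer_init()
// below): 0 disables peer-to-peer, 1 enables copy-engine access only, 2 enables
// direct load/store only, 3 (the default) enables both; adding 4 additionally
// disables non-p2p communication policies, and a negative value disables
// bi-directional peer-to-peer when comm_dim(dim)==2 (used for benchmarking).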
static int enable_peer_to_peer = 3;


void comm_peer2peer_init(const char* hostname_recv_buf)
{
  if (peer2peer_init) return;

  // set gdr enablement
  if (comm_gdr_enabled()) {
    if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling GPU-Direct RDMA access\n");
    comm_gdr_blacklist(); // set GDR blacklist
    // by default, if GDR is enabled we disable non-p2p policies to
    // prevent possible conflict between MPI and QUDA opening the same
    // IPC memory handles when using CUDA-aware MPI
    enable_peer_to_peer += 4;
  } else {
    if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling GPU-Direct RDMA access\n");
  }

  char *enable_peer_to_peer_env = getenv("QUDA_ENABLE_P2P");

  // disable peer-to-peer comms in one direction if QUDA_ENABLE_P2P=-1
  // and comm_dim(dim) == 2 (used for perf benchmarking)
  bool disable_peer_to_peer_bidir = false;

  if (enable_peer_to_peer_env) {
    enable_peer_to_peer = atoi(enable_peer_to_peer_env);

    switch ( std::abs(enable_peer_to_peer) ) {
    case 0: if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling peer-to-peer access\n"); break;
    case 1: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine access (disabling direct load/store)\n"); break;
    case 2: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer direct load/store access (disabling copy engines)\n"); break;
    case 3: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine and direct load/store access\n"); break;
    case 5: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine access (disabling direct load/store and non-p2p policies)\n"); break;
    case 6: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer direct load/store access (disabling copy engines and non-p2p policies)\n"); break;
    case 7: if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine and direct load/store access (disabling non-p2p policies)\n"); break;
    default: errorQuda("Unexpected value QUDA_ENABLE_P2P=%d\n", enable_peer_to_peer);
    }

    if (enable_peer_to_peer < 0) { // only values -1, -2, -3 can make it here
      if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling bi-directional peer-to-peer access\n");
      disable_peer_to_peer_bidir = true;
    }

    enable_peer_to_peer = abs(enable_peer_to_peer);

  } else { // !enable_peer_to_peer_env
    if (getVerbosity() > QUDA_SILENT) printfQuda("Enabling peer-to-peer copy engine and direct load/store access\n");
  }

  if (enable_peer_to_peer) {

    // first check that the local GPU supports UVA
    const int gpuid = comm_gpuid();
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, gpuid);
    if(!prop.unifiedAddressing) return;

    comm_set_neighbor_ranks();

    char *hostname = comm_hostname();
    int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int)*comm_size());

    comm_gather_gpuid(gpuid_recv_buf);

    for(int dir=0; dir<2; ++dir){ // forward/backward directions
      for(int dim=0; dim<4; ++dim){
        int neighbor_rank = comm_neighbor_rank(dir,dim);
        if(neighbor_rank == comm_rank()) continue;

        // disable peer-to-peer comms in one direction
        if ( ((comm_rank() > neighbor_rank && dir == 0) || (comm_rank() < neighbor_rank && dir == 1)) &&
             disable_peer_to_peer_bidir && comm_dim(dim) == 2 ) continue;

        // if the neighbors are on the same node
        if (!strncmp(hostname, &hostname_recv_buf[128*neighbor_rank], 128)) {
          int neighbor_gpuid = gpuid_recv_buf[neighbor_rank];
          int canAccessPeer[2];
          cudaDeviceCanAccessPeer(&canAccessPeer[0], gpuid, neighbor_gpuid);
          cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid, gpuid);

          int accessRank[2] = { };
#if CUDA_VERSION >= 8000 // this was introduced with CUDA 8
          if (canAccessPeer[0]*canAccessPeer[1] != 0) {
            cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank, gpuid, neighbor_gpuid);
            cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid, gpuid);
          }
#endif

          // enable P2P if we can access the peer or if peer is self
          if (canAccessPeer[0]*canAccessPeer[1] != 0 || gpuid == neighbor_gpuid) {
            peer2peer_enabled[dir][dim] = true;
            if (getVerbosity() > QUDA_SILENT) {
              printf("Peer-to-peer enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d, performance rank = (%d, %d)\n",
                     comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, accessRank[0], accessRank[1]);
            }
          } else {
            intranode_enabled[dir][dim] = true;
            if (getVerbosity() > QUDA_SILENT) {
              printf("Intra-node (non peer-to-peer) enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d\n",
                     comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim);
            }
          }

        } // on the same node
      } // different dimensions - x, y, z, t
    } // different directions - forward/backward

    host_free(gpuid_recv_buf);
  }

  peer2peer_init = true;

  comm_barrier();

  peer2peer_present = comm_peer2peer_enabled_global();

  checkCudaErrorNoSync();
  return;
}

bool comm_peer2peer_present() { return peer2peer_present; }

static bool enable_p2p = true;

bool comm_peer2peer_enabled(int dir, int dim){
  return enable_p2p ? peer2peer_enabled[dir][dim] : false;
}

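// Global peer-to-peer query: returns 0 if no pair of neighboring processes has
// peer-to-peer enabled, otherwise the enable_peer_to_peer policy mask; the
// result is reduced over all ranks once and then cached.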
int comm_peer2peer_enabled_global() {
  if (!enable_p2p) return false;

  static bool init = false;
  static bool p2p_global = false;

  if (!init) {
    int p2p = 0;
    for (int dim=0; dim<4; dim++)
      for (int dir=0; dir<2; dir++)
        p2p += (int)comm_peer2peer_enabled(dir,dim);

    comm_allreduce_int(&p2p);
    init = true;
    p2p_global = p2p > 0 ? true : false;
  }
  return p2p_global * enable_peer_to_peer;
}

void comm_enable_peer2peer(bool enable) {
  enable_p2p = enable;
}

static bool enable_intranode = true;

bool comm_intranode_enabled(int dir, int dim){
  return enable_intranode ? intranode_enabled[dir][dim] : false;
}

void comm_enable_intranode(bool enable) {
  enable_intranode = enable;
}

int comm_ndim(const Topology *topo)
{
  return topo->ndim;
}


const int *comm_dims(const Topology *topo)
{
  return topo->dims;
}


const int *comm_coords(const Topology *topo)
{
  return topo->my_coords;
}


const int *comm_coords_from_rank(const Topology *topo, int rank)
{
  return topo->coords[rank];
}


int comm_rank_from_coords(const Topology *topo, const int *coords)
{
  return topo->ranks[index(topo->ndim, topo->dims, coords)];
}

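// mod() is a true mathematical modulus (always non-negative), used below to
// wrap displaced coordinates periodically around the process grid.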
static inline int mod(int a, int b)
{
  return ((a % b) + b) % b;
}

int comm_rank_displaced(const Topology *topo, const int displacement[])
{
  int coords[QUDA_MAX_DIM];

  for (int i = 0; i < QUDA_MAX_DIM; i++) {
    coords[i] = (i < topo->ndim) ?
      mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0;
  }

  return comm_rank_from_coords(topo, coords);
}


// FIXME: The following routines rely on a "default" topology.
// They should probably be reworked or eliminated eventually.

static Topology *default_topo = NULL;

void comm_set_default_topology(Topology *topo)
{
  default_topo = topo;
}


Topology *comm_default_topology(void)
{
  if (!default_topo) {
    errorQuda("Default topology has not been declared");
  }
  return default_topo;
}

static int neighbor_rank[2][4] = { {-1,-1,-1,-1},
                                   {-1,-1,-1,-1} };

static bool neighbors_cached = false;

void comm_set_neighbor_ranks(Topology *topo){

  if(neighbors_cached) return;

  Topology *topology = topo ? topo : default_topo; // use default topology if topo is NULL
  if(!topology){
    errorQuda("Topology not specified");
    return;
  }

  for(int d=0; d<4; ++d){
    int pos_displacement[QUDA_MAX_DIM] = { };
    int neg_displacement[QUDA_MAX_DIM] = { };
    pos_displacement[d] = +1;
    neg_displacement[d] = -1;
    neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement);
    neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement);
  }
  neighbors_cached = true;
  return;
}

int comm_neighbor_rank(int dir, int dim){
  if(!neighbors_cached){
    comm_set_neighbor_ranks();
  }
  return neighbor_rank[dir][dim];
}


int comm_dim(int dim)
{
  Topology *topo = comm_default_topology();
  return comm_dims(topo)[dim];
}


int comm_coord(int dim)
{
  Topology *topo = comm_default_topology();
  return comm_coords(topo)[dim];
}

inline bool isHost(const void *buffer)
{
  CUmemorytype memType;
  void *attrdata[] = {(void *)&memType};
  CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
  CUresult err = cuPointerGetAttributes(1, attributes, attrdata, (CUdeviceptr)buffer);
  if (err != CUDA_SUCCESS) {
    const char *str;
    cuGetErrorName(err, &str);
    errorQuda("cuPointerGetAttributes returned error %s", str);
  }

  switch (memType) {
  case CU_MEMORYTYPE_DEVICE: return false;
  case CU_MEMORYTYPE_ARRAY: errorQuda("Using array memory for communications buffer is not supported");
  case CU_MEMORYTYPE_UNIFIED: errorQuda("Using unified memory for communications buffer is not supported");
  case CU_MEMORYTYPE_HOST:
  default: // memory not allocated by CUDA allocators will default to being host memory
    return true;
  }
}

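// The comm_declare_*_relative_ wrappers below convert a (dim, dir) pair, where
// dir is +1 (forward neighbor) or -1 (backward neighbor), into a displacement
// vector and defer to the corresponding comm_declare_*_displaced() routine.  In
// HOST_DEBUG builds they first sanity-check the buffer: host memory is verified
// with a trial copy/fill, device memory with a trial cudaMemcpy/cudaMemset.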
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
                                       void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first

  if (isHost(buffer)) {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = safe_malloc(nbytes);
    try {
      std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting");
    }
    host_free(tmp);
  } else {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = device_malloc(nbytes);
    cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
    device_free(tmp);
  }
#endif

  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_send_displaced(buffer, disp, nbytes);
}


MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
                                          void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first

  if (isHost(buffer)) {
    // test this memory allocation is ok by filling it
    try {
      std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting");
    }
  } else {
    // test this memory allocation is ok by doing a memset
    cudaError_t err = cudaMemset(buffer, 0, nbytes);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
  }
#endif

  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_receive_displaced(buffer, disp, nbytes);
}


MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
                                               void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first

  if (isHost(buffer)) {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = safe_malloc(blksize*nblocks);
    try {
      for (int i=0; i<nblocks; i++)
        std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting");
    }
    host_free(tmp);
  } else {
    // test this memory allocation is ok by doing a memcpy from it
    void *tmp = device_malloc(blksize*nblocks);
    cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
    device_free(tmp);
  }
#endif

  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_strided_send_displaced(buffer, disp, blksize, nblocks, stride);
}


MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
                                                  void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
  checkCudaError(); // check and clear error state first

  if (isHost(buffer)) {
    // test this memory allocation is ok by filling it
    try {
      for (int i=0; i<nblocks; i++)
        std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
    } catch(std::exception &e) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting");
    }
  } else {
    // test this memory allocation is ok by doing a memset
    cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
    if (err != cudaSuccess) {
      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
                 file, line, func, dim, dir, blksize, nblocks, stride);
      errorQuda("aborting with error %s", cudaGetErrorString(err));
    }
  }
#endif

  int disp[QUDA_MAX_DIM] = {0};
  disp[dim] = dir;

  return comm_declare_strided_receive_displaced(buffer, disp, blksize, nblocks, stride);
}

void comm_finalize(void)
{
  Topology *topo = comm_default_topology();
  comm_destroy_topology(topo);
  comm_set_default_topology(NULL);
}

static char partition_string[16];
static char topology_string[128];
static char partition_override_string[16];
static int manual_set_partition[QUDA_MAX_DIM] = {0};

void comm_dim_partitioned_set(int dim)
{
#ifdef MULTI_GPU
  manual_set_partition[dim] = 1;
#endif

  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0),
           comm_dim_partitioned(1), comm_dim_partitioned(2), comm_dim_partitioned(3));
}

void comm_dim_partitioned_reset(){
  for (int i = 0; i < QUDA_MAX_DIM; i++) manual_set_partition[i] = 0;

  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0),
           comm_dim_partitioned(1), comm_dim_partitioned(2), comm_dim_partitioned(3));
}

int comm_dim_partitioned(int dim)
{
  return (manual_set_partition[dim] || (default_topo && comm_dim(dim) > 1));
}

int comm_partitioned()
{
  int partitioned = 0;
  for (int i=0; i<4; i++) {
    partitioned = partitioned || comm_dim_partitioned(i);
  }
  return partitioned;
}

bool comm_gdr_enabled() {
  static bool gdr_enabled = false;
#ifdef MULTI_GPU
  static bool gdr_init = false;

  if (!gdr_init) {
    char *enable_gdr_env = getenv("QUDA_ENABLE_GDR");
    if (enable_gdr_env && strcmp(enable_gdr_env, "1") == 0) {
      gdr_enabled = true;
    }
    gdr_init = true;
  }
#endif
  return gdr_enabled;
}

bool comm_gdr_blacklist() {
  static bool blacklist = false;
  static bool blacklist_init = false;

  if (!blacklist_init) {
    char *blacklist_env = getenv("QUDA_ENABLE_GDR_BLACKLIST");

    if (blacklist_env) { // set the policies to tune for explicitly
      std::stringstream blacklist_list(blacklist_env);

      int device_count;
      cudaGetDeviceCount(&device_count);

      int excluded_device;
      while (blacklist_list >> excluded_device) {
        // check this is a valid device
        if ( excluded_device < 0 || excluded_device >= device_count ) {
          errorQuda("Cannot blacklist invalid GPU device ordinal %d", excluded_device);
        }

        if (blacklist_list.peek() == ',') blacklist_list.ignore();
        if (excluded_device == comm_gpuid()) blacklist = true;
      }
      comm_barrier();
      if (getVerbosity() > QUDA_SILENT && blacklist) printf("Blacklisting GPU-Direct RDMA for rank %d (GPU %d)\n", comm_rank(), comm_gpuid());
    }
    blacklist_init = true;

  }

  return blacklist;
}

static bool deterministic_reduce = false;

void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
  comm_set_default_topology(topo);

  // determine which GPU this rank will use
  char *hostname_recv_buf = (char *)safe_malloc(128 * comm_size());
  comm_gather_hostname(hostname_recv_buf);

  gpuid = 0;
  for (int i = 0; i < comm_rank(); i++) {
    if (!strncmp(comm_hostname(), &hostname_recv_buf[128 * i], 128)) { gpuid++; }
  }

  int device_count;
  cudaGetDeviceCount(&device_count);
  if (device_count == 0) { errorQuda("No CUDA devices found"); }
  if (gpuid >= device_count) {
    char *enable_mps_env = getenv("QUDA_ENABLE_MPS");
    if (enable_mps_env && strcmp(enable_mps_env, "1") == 0) {
      gpuid = gpuid % device_count;
      printf("MPS enabled, rank=%d -> gpu=%d\n", comm_rank(), gpuid);
    } else {
      errorQuda("Too few GPUs available on %s", comm_hostname());
    }
  }

  comm_peer2peer_init(hostname_recv_buf);

  host_free(hostname_recv_buf);

  char *enable_reduce_env = getenv("QUDA_DETERMINISTIC_REDUCE");
  if (enable_reduce_env && strcmp(enable_reduce_env, "1") == 0) { deterministic_reduce = true; }

  snprintf(partition_string, 16, ",comm=%d%d%d%d", comm_dim_partitioned(0), comm_dim_partitioned(1),
           comm_dim_partitioned(2), comm_dim_partitioned(3));

  // if CUDA_VISIBLE_DEVICES is set, we include this information in the topology_string
  char *device_order_env = getenv("CUDA_VISIBLE_DEVICES");
  if (device_order_env) {

    // to ensure we have process consistency define using rank 0
    if (comm_rank() == 0) {
      std::stringstream device_list_raw(device_order_env); // raw input
      std::stringstream device_list; // formatted (no commas)

      int device;
      int deviceCount;
      cudaGetDeviceCount(&deviceCount);
      while (device_list_raw >> device) {
        // check this is a valid policy choice
        if (device < 0) { errorQuda("Invalid CUDA_VISIBLE_DEVICE ordinal %d", device); }

        device_list << device;
        if (device_list_raw.peek() == ',') device_list_raw.ignore();
      }
      snprintf(topology_string, 128, ",topo=%d%d%d%d,order=%s", comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3),
               device_list.str().c_str());
    }

    comm_broadcast(topology_string, 128);
  } else {
    snprintf(topology_string, 128, ",topo=%d%d%d%d", comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3));
  }
}

const char *comm_config_string()
{
  static char config_string[16];
  static bool config_init = false;

  if (!config_init) {
    strcpy(config_string, ",p2p=");
    strcat(config_string, std::to_string(comm_peer2peer_enabled_global()).c_str());
    strcat(config_string, ",gdr=");
    strcat(config_string, std::to_string(comm_gdr_enabled()).c_str());
    config_init = true;
  }

  return config_string;
}

const char *comm_dim_partitioned_string(const int *comm_dim_override)
{
  if (comm_dim_override) {
    char comm[5] = {(!comm_dim_partitioned(0) ? '0' : comm_dim_override[0] ? '1' : '0'),
                    (!comm_dim_partitioned(1) ? '0' : comm_dim_override[1] ? '1' : '0'),
                    (!comm_dim_partitioned(2) ? '0' : comm_dim_override[2] ? '1' : '0'),
                    (!comm_dim_partitioned(3) ? '0' : comm_dim_override[3] ? '1' : '0'), '\0'};
    strcpy(partition_override_string, ",comm=");
    strcat(partition_override_string, comm);
    return partition_override_string;
  } else {
    return partition_string;
  }
}

const char *comm_dim_topology_string() { return topology_string; }

bool comm_deterministic_reduce() { return deterministic_reduce; }

static bool globalReduce = true;
static bool asyncReduce = false;

void reduceMaxDouble(double &max) { comm_allreduce_max(&max); }

void reduceDouble(double &sum) { if (globalReduce) comm_allreduce(&sum); }

void reduceDoubleArray(double *sum, const int len)
{ if (globalReduce) comm_allreduce_array(sum, len); }

int commDim(int dir) { return comm_dim(dir); }

int commCoords(int dir) { return comm_coord(dir); }

int commDimPartitioned(int dir){ return comm_dim_partitioned(dir);}

void commDimPartitionedSet(int dir) { comm_dim_partitioned_set(dir); }

void commDimPartitionedReset(){ comm_dim_partitioned_reset(); }

bool commGlobalReduction() { return globalReduce; }

void commGlobalReductionSet(bool global_reduction) { globalReduce = global_reduction; }

bool commAsyncReduction() { return asyncReduce; }

void commAsyncReductionSet(bool async_reduction) { asyncReduce = async_reduction; }