QUDA  0.9.0
comm_qmp.cpp
Go to the documentation of this file.
1 #include <qmp.h>
2 #include <csignal>
3 #include <quda_internal.h>
4 #include <comm_quda.h>
5 
// Wrap a QMP call: if it does not return QMP_SUCCESS, abort via
// errorQuda(), reporting the QMP error string for the failed status.
#define QMP_CHECK(qmp_call) do { \
  QMP_status_t status = qmp_call; \
  if (status != QMP_SUCCESS) \
    errorQuda("(QMP) %s", QMP_error_string(status)); \
} while (0)
11 
// Opaque message handle returned by the comm_declare_* functions:
// pairs the registered QMP message memory with the QMP communication
// handle that operates on it.  Freed as a unit by comm_free().
struct MsgHandle_s {
  QMP_msgmem_t mem;       // registered memory region for the message
  QMP_msghandle_t handle; // send/receive handle bound to 'mem'
};
16 
// CUDA device id assigned to this rank (-1 until comm_init() runs)
static int gpuid = -1;

// tuneKey strings describing the comm partitioning and grid topology
// (filled in by comm_init(), returned by the comm_dim_*_string() accessors)
static char partition_string[16];
static char topology_string[16];
21 
22 // While we can emulate an all-gather using QMP reductions, this
23 // scales horribly as the number of nodes increases, so for
24 // performance we just call MPI directly
25 #define USE_MPI_GATHER
26 
27 #ifdef USE_MPI_GATHER
28 #include <mpi.h>
29 #endif
30 
31 // There are more efficient ways to do the following,
32 // but it doesn't really matter since this function should be
33 // called just once.
34 void comm_gather_hostname(char *hostname_recv_buf) {
35  // determine which GPU this rank will use
36  char *hostname = comm_hostname();
37 
38 #ifdef USE_MPI_GATHER
39  MPI_Allgather(hostname, 128, MPI_CHAR, hostname_recv_buf, 128, MPI_CHAR, MPI_COMM_WORLD);
40 #else
41  // Abuse reductions to emulate all-gather. We need to copy the
42  // local hostname to all other nodes
43  // this isn't very scalable though
44  for (int i=0; i<comm_size(); i++) {
45  int data[128];
46  for (int j=0; j<128; j++) {
47  data[j] = (i == comm_rank()) ? hostname[j] : 0;
48  QMP_sum_int(data+j);
49  hostname_recv_buf[i*128 + j] = data[j];
50  }
51  }
52 #endif
53 
54 }
55 
56 
57 // There are more efficient ways to do the following,
58 // but it doesn't really matter since this function should be
59 // called just once.
60 void comm_gather_gpuid(int *gpuid_recv_buf) {
61 
62 #ifdef USE_MPI_GATHER
63  MPI_Allgather(&gpuid, 1, MPI_INT, gpuid_recv_buf, 1, MPI_INT, MPI_COMM_WORLD);
64 #else
65  // Abuse reductions to emulate all-gather. We need to copy the
66  // local gpu to all other nodes
67  for (int i=0; i<comm_size(); i++) {
68  int data = (i == comm_rank()) ? gpuid : 0;
69  QMP_sum_int(&data);
70  gpuid_recv_buf[i] = data;
71  }
72 #endif
73 }
74 
75 
76 void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
77 {
78  if ( QMP_is_initialized() != QMP_TRUE ) {
79  errorQuda("QMP has not been initialized");
80  }
81 
82  int grid_size = 1;
83  for (int i = 0; i < ndim; i++) {
84  grid_size *= dims[i];
85  }
86  if (grid_size != QMP_get_number_of_nodes()) {
87  errorQuda("Communication grid size declared via initCommsGridQuda() does not match"
88  " total number of QMP nodes (%d != %d)", grid_size, QMP_get_number_of_nodes());
89  }
90 
91  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
93 
94  // determine which GPU this rank will use
95  char *hostname_recv_buf = (char *)safe_malloc(128*comm_size());
96  comm_gather_hostname(hostname_recv_buf);
97 
98  gpuid = 0;
99  for (int i = 0; i < comm_rank(); i++) {
100  if (!strncmp(comm_hostname(), &hostname_recv_buf[128*i], 128)) {
101  gpuid++;
102  }
103  }
104 
105  int device_count;
106  cudaGetDeviceCount(&device_count);
107  if (device_count == 0) {
108  errorQuda("No CUDA devices found");
109  }
110  if (gpuid >= device_count) {
111  char *enable_mps_env = getenv("QUDA_ENABLE_MPS");
112  if (enable_mps_env && strcmp(enable_mps_env,"1") == 0) {
113  gpuid = gpuid%device_count;
114  printf("MPS enabled, rank=%d -> gpu=%d\n", comm_rank(), gpuid);
115  } else {
116  errorQuda("Too few GPUs available on %s", comm_hostname());
117  }
118  }
119 
120  comm_peer2peer_init(hostname_recv_buf);
121 
122  host_free(hostname_recv_buf);
123 
125  snprintf(topology_string, 16, ",topo=%d%d%d%d", comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3));
126 }
127 
128 int comm_rank(void)
129 {
130  return QMP_get_node_number();
131 }
132 
133 
134 int comm_size(void)
135 {
136  return QMP_get_number_of_nodes();
137 }
138 
139 
// CUDA device id assigned to this rank by comm_init() (-1 beforehand).
int comm_gpuid(void)
{
  return gpuid;
}
144 
145 
149 MsgHandle *comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
150 {
152 
153  int rank = comm_rank_displaced(topo, displacement);
154  MsgHandle *mh = (MsgHandle *)safe_malloc(sizeof(MsgHandle));
155 
156  mh->mem = QMP_declare_msgmem(buffer, nbytes);
157  if (mh->mem == NULL) errorQuda("Unable to allocate QMP message memory");
158 
159  mh->handle = QMP_declare_send_to(mh->mem, rank, 0);
160  if (mh->handle == NULL) errorQuda("Unable to allocate QMP message handle");
161 
162  return mh;
163 }
164 
168 MsgHandle *comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
169 {
171 
172  int rank = comm_rank_displaced(topo, displacement);
173  MsgHandle *mh = (MsgHandle *)safe_malloc(sizeof(MsgHandle));
174 
175  mh->mem = QMP_declare_msgmem(buffer, nbytes);
176  if (mh->mem == NULL) errorQuda("Unable to allocate QMP message memory");
177 
178  mh->handle = QMP_declare_receive_from(mh->mem, rank, 0);
179  if (mh->handle == NULL) errorQuda("Unable to allocate QMP message handle");
180 
181  return mh;
182 }
183 
184 
189 MsgHandle *comm_declare_strided_send_displaced(void *buffer, const int displacement[],
190  size_t blksize, int nblocks, size_t stride)
191 {
193 
194  int rank = comm_rank_displaced(topo, displacement);
195  MsgHandle *mh = (MsgHandle *)safe_malloc(sizeof(MsgHandle));
196 
197  mh->mem = QMP_declare_strided_msgmem(buffer, blksize, nblocks, stride);
198  if (mh->mem == NULL) errorQuda("Unable to allocate QMP message memory");
199 
200  mh->handle = QMP_declare_send_to(mh->mem, rank, 0);
201  if (mh->handle == NULL) errorQuda("Unable to allocate QMP message handle");
202 
203  return mh;
204 }
205 
210 MsgHandle *comm_declare_strided_receive_displaced(void *buffer, const int displacement[],
211  size_t blksize, int nblocks, size_t stride)
212 {
214 
215  int rank = comm_rank_displaced(topo, displacement);
216  MsgHandle *mh = (MsgHandle *)safe_malloc(sizeof(MsgHandle));
217 
218  mh->mem = QMP_declare_strided_msgmem(buffer, blksize, nblocks, stride);
219  if (mh->mem == NULL) errorQuda("Unable to allocate QMP message memory");
220 
221  mh->handle = QMP_declare_receive_from(mh->mem, rank, 0);
222  if (mh->handle == NULL) errorQuda("Unable to allocate QMP message handle");
223 
224  return mh;
225 }
226 
227 
229 {
230  QMP_free_msghandle(mh->handle);
231  QMP_free_msgmem(mh->mem);
232  host_free(mh);
233 }
234 
235 
237 {
238  QMP_CHECK( QMP_start(mh->handle) );
239 }
240 
241 
243 {
244  QMP_CHECK( QMP_wait(mh->handle) );
245 }
246 
247 
249 {
250  return (QMP_is_complete(mh->handle) == QMP_TRUE);
251 }
252 
253 
// Global sum of a single double across all ranks (reduced in place).
void comm_allreduce(double* data)
{
  QMP_CHECK( QMP_sum_double(data) );
}
258 
259 
// Global maximum of a single double across all ranks (reduced in place).
void comm_allreduce_max(double* data)
{
  QMP_CHECK( QMP_max_double(data) );
}
264 
265 
// Element-wise global sum of an array of doubles across all ranks
// (reduced in place).
// NOTE(review): 'size' is passed straight through to QMP; if QMP's count
// parameter is a plain int this narrows silently — confirm against qmp.h.
void comm_allreduce_array(double* data, size_t size)
{
  QMP_CHECK( QMP_sum_double_array(data, size) );
}
270 
271 
// Global sum of a single int across all ranks (reduced in place).
void comm_allreduce_int(int* data)
{
  QMP_CHECK( QMP_sum_int(data) );
}
276 
278 {
279  if (sizeof(uint64_t) != sizeof(unsigned long)) errorQuda("unsigned long is not 64-bit");
280  QMP_CHECK( QMP_xor_ulong( reinterpret_cast<unsigned long*>(data) ));
281 }
282 
// Broadcast nbytes of data from the root rank to all other ranks.
void comm_broadcast(void *data, size_t nbytes)
{
  QMP_CHECK( QMP_broadcast(data, nbytes) );
}
287 
288 
// Synchronize all ranks at a global barrier.
void comm_barrier(void)
{
  QMP_CHECK( QMP_barrier() );
}
293 
294 
// Abort the entire parallel job with the given exit status.  In debug
// builds a SIGINT is raised first so an attached debugger can catch
// the failure before QMP tears the job down.
void comm_abort(int status)
{
#ifdef HOST_DEBUG
  raise(SIGINT);
#endif
  QMP_abort(status);
}
302 
304  return partition_string;
305 }
306 
308  return topology_string;
309 }
void comm_gather_gpuid(int *gpuid_recv_buf)
Gather all GPU ids.
Definition: comm_qmp.cpp:60
QMP_msghandle_t handle
Definition: comm_qmp.cpp:14
int comm_query(MsgHandle *mh)
Definition: comm_qmp.cpp:248
_EXTERN_C_ int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
Definition: nvtx_pmpi.c:308
int snprintf(char *__str, size_t __size, const char *__format,...) __attribute__((__format__(__printf__
Topology * comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_common.cpp:94
void comm_peer2peer_init(const char *hostname_recv_buf)
int comm_gpuid(void)
Definition: comm_qmp.cpp:140
static int gpuid
Definition: comm_qmp.cpp:17
static char partition_string[16]
Definition: comm_qmp.cpp:19
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
QMP_msgmem_t mem
Definition: comm_qmp.cpp:13
int comm_dim(int dim)
static int rank
Definition: comm_mpi.cpp:42
void comm_allreduce_xor(uint64_t *data)
Definition: comm_qmp.cpp:277
Topology * comm_default_topology(void)
void comm_wait(MsgHandle *mh)
Definition: comm_qmp.cpp:242
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
Definition: comm_qmp.cpp:307
void comm_free(MsgHandle *mh)
Definition: comm_qmp.cpp:228
void comm_start(MsgHandle *mh)
Definition: comm_qmp.cpp:236
static int ndim
Definition: layout_hyper.c:53
MsgHandle * comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
Definition: comm_qmp.cpp:189
int strcmp(const char *__s1, const char *__s2)
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
Definition: comm_qmp.cpp:303
char * comm_hostname(void)
Definition: comm_common.cpp:58
MsgHandle * comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
Definition: comm_qmp.cpp:210
int printf(const char *,...) __attribute__((__format__(__printf__
void comm_allreduce(double *data)
Definition: comm_qmp.cpp:254
void comm_abort(int status)
Definition: comm_qmp.cpp:295
#define QMP_CHECK(qmp_call)
Definition: comm_qmp.cpp:6
int comm_rank_displaced(const Topology *topo, const int displacement[])
unsigned long long uint64_t
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_qmp.cpp:76
#define safe_malloc(size)
Definition: malloc_quda.h:54
void comm_allreduce_max(double *data)
Definition: comm_qmp.cpp:260
int comm_size(void)
Definition: comm_qmp.cpp:134
int strncmp(const char *__s1, const char *__s2, size_t __n)
void comm_set_default_topology(Topology *topo)
static char topology_string[16]
Definition: comm_qmp.cpp:20
void comm_allreduce_int(int *data)
Definition: comm_qmp.cpp:272
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_qmp.cpp:168
void comm_allreduce_array(double *data, size_t size)
Definition: comm_qmp.cpp:266
void comm_broadcast(void *data, size_t nbytes)
Definition: comm_qmp.cpp:283
void comm_gather_hostname(char *hostname_recv_buf)
Gather all hostnames.
Definition: comm_qmp.cpp:34
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_qmp.cpp:149
int comm_rank(void)
Definition: comm_qmp.cpp:128
char * getenv(const char *)
int comm_dim_partitioned(int dim)
void comm_barrier(void)
Definition: comm_qmp.cpp:289