#if defined(MPI_COMMS) || defined(QMP_COMMS)

#if defined(QMP_COMMS)

#ifdef QUDA_BACKWARDSCPP
#include "backward.hpp"
// backward-cpp installs signal handlers that print a stack trace on fatal signals
static backward::SignalHandling sh;
static const int max_displacement = 4;

// lex_rank_from_coords_dim_t: lexicographic rank with the t dimension running fastest
int *dims = reinterpret_cast<int *>(fdata);
int rank = coords[0];
for (int i = 1; i < 4; i++) { rank = dims[i] * rank + coords[i]; }

// lex_rank_from_coords_dim_x: lexicographic rank with the x dimension running fastest
int *dims = reinterpret_cast<int *>(fdata);
int rank = coords[3];
for (int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; }
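// Worked illustration of the row-major ranking the two helpers above
// implement (a standalone sketch with a hypothetical grid, not QUDA code):
#include <cstdio>

// same recurrence as lex_rank_from_coords_dim_t: last coordinate fastest
static int lex_rank(const int dims[4], const int coords[4])
{
  int rank = coords[0];
  for (int i = 1; i < 4; i++) rank = dims[i] * rank + coords[i];
  return rank;
}

int main()
{
  const int dims[4] = {2, 2, 2, 4};   // hypothetical 2x2x2x4 process grid
  const int coords[4] = {1, 0, 1, 3}; // one process's grid coordinates
  // ((1*2 + 0)*2 + 1)*4 + 3 = 23
  printf("rank = %d\n", lex_rank(dims, coords));
  return 0;
}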
// flatten a coordinate into a lexicographic index (last dimension fastest)
static inline int index(int ndim, const int *dims, const int *x)
{
  int idx = x[0];
  for (int i = 1; i < ndim; i++) { idx = dims[i] * idx + x[i]; }
  return idx;
}

// advance x to the next coordinate, odometer style; returns false once x wraps
static inline bool advance_coords(int ndim, const int *dims, int *x)
{
  bool valid = false;
  for (int i = ndim - 1; i >= 0; i--) {
    if (x[i] < dims[i] - 1) {
      x[i]++;
      valid = true;
      break;
    } else {
      x[i] = 0;
    }
  }
  return valid;
}
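// Usage sketch for the two helpers above (a hypothetical driver, assuming
// index() and advance_coords() are in scope): odometer-style walk over a
// 2x3 grid, printing each coordinate and its flattened index.
#include <cstdio>

int main()
{
  const int dims[2] = {2, 3};
  int x[2] = {0, 0};
  do {
    printf("coords=(%d,%d) -> idx=%d\n", x[0], x[1], index(2, dims, x));
  } while (advance_coords(2, dims, x));
  return 0;
}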
delete[] topo->ranks;
// non-negative modulus: maps a into [0, b) even when a is negative
static inline int mod(int a, int b) { return ((a % b) + b) % b; }
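// Quick sanity check of the wrap-around behaviour (a sketch, not QUDA code):
// C++'s built-in % yields -1 for -1 % 4, while mod(-1, 4) == 3, which is the
// behaviour periodic grid coordinates need.
#include <cstdio>
static inline int mod(int a, int b) { return ((a % b) + b) % b; }
int main() { printf("%d %d\n", -1 % 4, mod(-1, 4)); return 0; } // prints: -1 3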
// isHost(buffer): ask the CUDA driver API for the memory type behind a pointer
CUmemorytype memType;
void *attrdata[] = {(void *)&memType};
CUpointer_attribute attributes[1] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
CUresult err = cuPointerGetAttributes(1, attributes, attrdata, (CUdeviceptr)buffer);
if (err != CUDA_SUCCESS) {
  const char *str;
  cuGetErrorName(err, &str);
  errorQuda("cuPointerGetAttributes returned error %s", str);
}

switch (memType) {
case CU_MEMORYTYPE_DEVICE: return false;
case CU_MEMORYTYPE_ARRAY: errorQuda("Using array memory for communications buffer is not supported");
case CU_MEMORYTYPE_UNIFIED: errorQuda("Using unified memory for communications buffer is not supported");
case CU_MEMORYTYPE_HOST: return true;
}
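// Standalone sketch of the same pointer query (hypothetical driver program;
// the runtime API is used only to create a context and a pinned host buffer,
// and the program must link against the driver library, -lcuda):
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

int main()
{
  void *buffer;
  cudaMallocHost(&buffer, 1024); // pinned host memory, known to the driver

  CUmemorytype memType;
  void *attrdata[] = {(void *)&memType};
  CUpointer_attribute attributes[1] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
  cuPointerGetAttributes(1, attributes, attrdata, (CUdeviceptr)buffer);

  printf("host buffer? %s\n", memType == CU_MEMORYTYPE_HOST ? "yes" : "no");
  cudaFreeHost(buffer);
  return 0;
}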
for (int i = 0; i < ndim; i++) {
  if (abs(displacement[i]) > max_displacement) {
    errorQuda("Requested displacement[%d] = %d is greater than maximum allowed", i, displacement[i]);
  }
}
char *enable_peer_to_peer_env = getenv("QUDA_ENABLE_P2P");

bool disable_peer_to_peer_bidir = false;

if (enable_peer_to_peer_env) {

  // messages reported for the different QUDA_ENABLE_P2P settings:
  printf("Enabling peer-to-peer copy engine access (disabling direct load/store)\n");

  printf("Enabling peer-to-peer direct load/store access (disabling copy engines)\n");

  printf("Enabling peer-to-peer copy engine and direct load/store access\n");

  printf("Enabling peer-to-peer copy engine access (disabling direct load/store and non-p2p policies)\n");

  printf("Enabling peer-to-peer direct load/store access (disabling copy engines and non-p2p policies)\n");

  printf("Enabling peer-to-peer copy engine and direct load/store access (disabling non-p2p policies)\n");

  disable_peer_to_peer_bidir = true;

  // default behaviour when QUDA_ENABLE_P2P is unset
  printf("Enabling peer-to-peer copy engine and direct load/store access\n");
char *enable_p2p_max_access_rank_env = getenv("QUDA_ENABLE_P2P_MAX_ACCESS_RANK");
if (enable_p2p_max_access_rank_env) {

  printf("Limiting peer-to-peer communication to a maximum access rank of %d (lower ranks have higher bandwidth)\n",
         enable_p2p_max_access_rank);
cudaGetDeviceProperties(&prop, gpuid);
if (!prop.unifiedAddressing) return;

for (int dir = 0; dir < 2; ++dir) {

  && disable_peer_to_peer_bidir && comm_dim(dim) == 2)

  if (!strncmp(hostname, &hostname_recv_buf[128 * neighbor_rank], 128)) {

    int canAccessPeer[2];
    cudaDeviceCanAccessPeer(&canAccessPeer[0], gpuid, neighbor_gpuid);
    cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid, gpuid);

    int accessRank[2] = {};
    if (canAccessPeer[0] * canAccessPeer[1] != 0) {
      cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank, gpuid, neighbor_gpuid);
      cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid, gpuid);

    || gpuid == neighbor_gpuid) {

      printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, "
             "access rank = (%3d, %3d)\n",

      printf("Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, "
if (!topology) { errorQuda("Topology not specified"); }

for (int d = 0; d < 4; ++d) {

  pos_displacement[d] = +1;
  neg_displacement[d] = -1;
bool gdr_init = false;

char *enable_gdr_env = getenv("QUDA_ENABLE_GDR");
if (enable_gdr_env && strcmp(enable_gdr_env, "1") == 0) { gdr_enabled = true; }

char *blacklist_env = getenv("QUDA_ENABLE_GDR_BLACKLIST");

std::stringstream blacklist_list(blacklist_env);

cudaGetDeviceCount(&device_count);

while (blacklist_list >> excluded_device) {

  if (excluded_device < 0 || excluded_device >= device_count) {
    errorQuda("Cannot blacklist invalid GPU device ordinal %d", excluded_device);
  }

  if (blacklist_list.peek() == ',') blacklist_list.ignore();
}
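// Minimal sketch of the comma-separated parsing used for the blacklist above,
// with a hypothetical hard-coded value standing in for the environment
// variable (e.g. QUDA_ENABLE_GDR_BLACKLIST="0,2"):
#include <cstdio>
#include <sstream>

int main()
{
  std::stringstream blacklist_list("0,2"); // hypothetical env value
  int excluded_device;
  while (blacklist_list >> excluded_device) {
    printf("excluding device %d\n", excluded_device);
    if (blacklist_list.peek() == ',') blacklist_list.ignore();
  }
  return 0;
}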
#if (defined MULTI_GPU) && (defined NVSHMEM_COMMS)
static bool nvshmem_enabled = true;
static bool nvshmem_init = false;

char *enable_nvshmem_env = getenv("QUDA_ENABLE_NVSHMEM");
if (enable_nvshmem_env && strcmp(enable_nvshmem_env, "1") == 0) { nvshmem_enabled = true; }
if (enable_nvshmem_env && strcmp(enable_nvshmem_env, "0") == 0) { nvshmem_enabled = false; }
#else
static bool nvshmem_enabled = false;
#endif

return nvshmem_enabled;
cudaGetDeviceCount(&device_count);
if (device_count == 0) { errorQuda("No CUDA devices found"); }

if (gpuid >= device_count) {
  char *enable_mps_env = getenv("QUDA_ENABLE_MPS");
  if (enable_mps_env && strcmp(enable_mps_env, "1") == 0) {
char *enable_reduce_env = getenv("QUDA_DETERMINISTIC_REDUCE");

char *device_order_env = getenv("CUDA_VISIBLE_DEVICES");
if (device_order_env) {

  std::stringstream device_list_raw(device_order_env);
  std::stringstream device_list;

  cudaGetDeviceCount(&deviceCount);
  while (device_list_raw >> device) {

    if (device < 0) { errorQuda("Invalid CUDA_VISIBLE_DEVICE ordinal %d", device); }

    device_list << device;
    if (device_list_raw.peek() == ',') device_list_raw.ignore();

  device_list.str().c_str());
if (comm_dim_override) {

#if defined(QMP_COMMS) || defined(MPI_COMMS)
MPI_Comm MPI_COMM_HANDLE;

#if defined(QMP_COMMS)
QMP_comm_t QMP_COMM_HANDLE;

bool is_qmp_handle_default;
// deterministic_reduce: fix the summation order by sorting first, so the
// floating-point result is bitwise reproducible from run to run
std::sort(array, array + n);
return std::accumulate(array, array + n, 0.0);
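// Why the sort matters (a standalone sketch, not QUDA code): floating-point
// addition is not associative, so the same values summed in different orders
// can give different results; sorting pins the order down.
#include <algorithm>
#include <cstdio>
#include <numeric>

int main()
{
  double a[3] = {1e16, -1e16, 1.0};
  double unsorted = std::accumulate(a, a + 3, 0.0); // 1e16 - 1e16 + 1.0 = 1.0
  std::sort(a, a + 3);
  double sorted = std::accumulate(a, a + 3, 0.0);   // -1e16 + 1.0 absorbs the 1.0, then + 1e16 = 0.0
  printf("unsorted = %g, sorted = %g\n", unsorted, sorted); // differ, but the sorted order never varies
  return 0;
}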