35 for (
int i = 1; i <
ndim; i++) {
36 idx = dims[i]*idx + x[i];
45 for (
int i = ndim-1; i >= 0; i--) {
46 if (x[i] < dims[i]-1) {
60 static bool cached =
false;
61 static char hostname[128];
64 gethostname(hostname, 128);
84 const double twoneg48 = 0.35527136788005009e-14;
85 const unsigned long int m = 25214903917, a = 11, mask = 281474976710655;
105 for (
int i=0; i<
ndim; i++) {
106 topo->
dims[i] = dims[i];
117 int rank = rank_from_coords(x, map_data);
119 for (
int i=0; i<
ndim; i++) {
126 for (
int i = 0; i <
ndim; i++) {
149 {
false,
false,
false,
false} };
153 {
false,
false,
false,
false} };
179 char *enable_peer_to_peer_env = getenv(
"QUDA_ENABLE_P2P");
183 bool disable_peer_to_peer_bidir =
false;
185 if (enable_peer_to_peer_env) {
201 disable_peer_to_peer_bidir =
true;
215 cudaGetDeviceProperties(&prop, gpuid);
216 if(!prop.unifiedAddressing)
return;
225 for(
int dir=0; dir<2; ++dir){
226 for(
int dim=0; dim<4; ++dim){
228 if(neighbor_rank ==
comm_rank())
continue;
231 if ( ((
comm_rank() > neighbor_rank && dir == 0) || (
comm_rank() < neighbor_rank && dir == 1)) &&
232 disable_peer_to_peer_bidir &&
comm_dim(dim) == 2 )
continue;
235 if (!strncmp(hostname, &hostname_recv_buf[128*neighbor_rank], 128)) {
237 int canAccessPeer[2];
238 cudaDeviceCanAccessPeer(&canAccessPeer[0], gpuid, neighbor_gpuid);
239 cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid, gpuid);
241 int accessRank[2] = { };
242 #if CUDA_VERSION >= 8000 // this was introduced with CUDA 8 243 if (canAccessPeer[0]*canAccessPeer[1] != 0) {
244 cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank, gpuid, neighbor_gpuid);
245 cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid, gpuid);
250 if (canAccessPeer[0]*canAccessPeer[1] != 0 || gpuid == neighbor_gpuid) {
253 printf(
"Peer-to-peer enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d, performance rank = (%d, %d)\n",
254 comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, accessRank[0], accessRank[1]);
259 printf(
"Intra-node (non peer-to-peer) enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d\n",
260 comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim);
292 static bool init =
false;
293 static bool p2p_global =
false;
297 for (
int dim=0; dim<4; dim++)
298 for (
int dir=0; dir<2; dir++)
303 p2p_global = p2p > 0 ? true :
false;
352 static inline int mod(
int a,
int b)
354 return ((a % b) + b) % b;
362 coords[i] = (i < topo->
ndim) ?
384 errorQuda(
"Default topology has not been declared");
404 for(
int d=0; d<4; ++d){
407 pos_displacement[d] = +1;
408 neg_displacement[d] = -1;
439 CUmemorytype memType;
440 void *attrdata[] = {(
void *)&memType};
441 CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
442 CUresult err = cuPointerGetAttributes(1, attributes, attrdata, (CUdeviceptr)buffer);
443 if (err != CUDA_SUCCESS) {
445 cuGetErrorName(err, &str);
446 errorQuda(
"cuPointerGetAttributes returned error %s", str);
450 case CU_MEMORYTYPE_DEVICE:
return false;
451 case CU_MEMORYTYPE_ARRAY:
errorQuda(
"Using array memory for communications buffer is not supported");
452 case CU_MEMORYTYPE_UNIFIED:
errorQuda(
"Using unified memory for communications buffer is not supported");
453 case CU_MEMORYTYPE_HOST:
463 void *buffer,
int dim,
int dir,
size_t nbytes)
472 std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
473 }
catch(std::exception &e) {
474 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
481 cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
482 if (err != cudaSuccess) {
483 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
484 errorQuda(
"aborting with error %s", cudaGetErrorString(err));
500 void *buffer,
int dim,
int dir,
size_t nbytes)
508 std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
509 }
catch(std::exception &e) {
510 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
515 cudaError_t err = cudaMemset(buffer, 0, nbytes);
516 if (err != cudaSuccess) {
517 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
518 errorQuda(
"aborting with error %s", cudaGetErrorString(err));
533 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
542 for (
int i=0; i<nblocks; i++)
543 std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
544 }
catch(std::exception &e) {
545 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
546 file, line, func, dim, dir, blksize, nblocks, stride);
553 cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
554 if (err != cudaSuccess) {
555 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
556 file, line, func, dim, dir, blksize, nblocks, stride);
557 errorQuda(
"aborting with error %s", cudaGetErrorString(err));
574 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
582 for (
int i=0; i<nblocks; i++)
583 std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
584 }
catch(std::exception &e) {
585 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
586 file, line, func, dim, dir, blksize, nblocks, stride);
591 cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
592 if (err != cudaSuccess) {
593 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
594 file, line, func, dim, dir, blksize, nblocks, stride);
595 errorQuda(
"aborting with error %s", cudaGetErrorString(err));
643 for (
int i=0; i<4; i++) {
650 static bool gdr_enabled =
false;
652 static bool gdr_init =
false;
655 char *enable_gdr_env = getenv(
"QUDA_ENABLE_GDR");
656 if (enable_gdr_env && strcmp(enable_gdr_env,
"1") == 0) {
666 static bool blacklist =
false;
667 static bool blacklist_init =
false;
669 if (!blacklist_init) {
670 char *blacklist_env = getenv(
"QUDA_ENABLE_GDR_BLACKLIST");
673 std::stringstream blacklist_list(blacklist_env);
676 cudaGetDeviceCount(&device_count);
679 while (blacklist_list >> excluded_device) {
681 if ( excluded_device < 0 || excluded_device >= device_count ) {
682 errorQuda(
"Cannot blacklist invalid GPU device ordinal %d", excluded_device);
685 if (blacklist_list.peek() ==
',') blacklist_list.ignore();
686 if (excluded_device ==
comm_gpuid()) blacklist =
true;
691 blacklist_init =
true;
715 cudaGetDeviceCount(&device_count);
716 if (device_count == 0) {
errorQuda(
"No CUDA devices found"); }
717 if (
gpuid >= device_count) {
718 char *enable_mps_env = getenv(
"QUDA_ENABLE_MPS");
719 if (enable_mps_env && strcmp(enable_mps_env,
"1") == 0) {
731 char *enable_reduce_env = getenv(
"QUDA_DETERMINISTIC_REDUCE");
738 char *device_order_env = getenv(
"CUDA_VISIBLE_DEVICES");
739 if (device_order_env) {
743 std::stringstream device_list_raw(device_order_env);
744 std::stringstream device_list;
748 cudaGetDeviceCount(&deviceCount);
749 while (device_list_raw >> device) {
751 if (device < 0) {
errorQuda(
"Invalid CUDA_VISIBLE_DEVICE ordinal %d", device); }
754 if (device_list_raw.peek() ==
',') device_list_raw.ignore();
757 device_list.str().c_str());
768 static char config_string[16];
769 static bool config_init =
false;
772 strcpy(config_string,
",p2p=");
774 strcat(config_string,
",gdr=");
779 return config_string;
784 if (comm_dim_override) {
static int neighbor_rank[2][4]
int comm_peer2peer_enabled_global()
static void sum(Float *dst, Float *a, Float *b, int cnt)
const int * comm_dims(const Topology *topo)
int commDimPartitioned(int dir)
static bool enable_intranode
bool isHost(const void *buffer)
bool comm_deterministic_reduce()
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
void reduceDoubleArray(double *sum, const int len)
int comm_dim_partitioned(int dim)
QudaVerbosity getVerbosity()
static char partition_string[16]
void comm_set_default_topology(Topology *topo)
MsgHandle * comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
static char partition_override_string[16]
cudaColorSpinorField * tmp
static bool neighbors_cached
void comm_allreduce_array(double *data, size_t size)
static int enable_peer_to_peer
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
void reduceDouble(double &sum)
char * comm_hostname(void)
int comm_ndim(const Topology *topo)
bool comm_peer2peer_enabled(int dir, int dim)
void comm_gather_hostname(char *hostname_recv_buf)
Gather all hostnames.
void comm_destroy_topology(Topology *topo)
void comm_enable_intranode(bool enable)
Enable / disable intra-node (non-peer-to-peer) communication.
int comm_rank_from_coords(const Topology *topo, const int *coords)
void comm_dim_partitioned_set(int dim)
MsgHandle * comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
static unsigned long int rand_seed
const int * comm_coords_from_rank(const Topology *topo, int rank)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
int my_coords[QUDA_MAX_DIM]
MsgHandle * comm_declare_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
void commDimPartitionedSet(int dir)
void comm_peer2peer_init(const char *hostname_recv_buf)
bool commGlobalReduction()
void reduceMaxDouble(double &max)
void commDimPartitionedReset()
Reset the comm dim partitioned array to zero.
static char topology_string[128]
static bool peer2peer_present
void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Initialize the communications common to all communications abstractions.
int(* coords)[QUDA_MAX_DIM]
int(* QudaCommsMap)(const int *coords, void *fdata)
static bool advance_coords(int ndim, const int *dims, int *x)
void init()
Create the CUBLAS context.
#define safe_malloc(size)
void comm_dim_partitioned_reset()
static bool peer2peer_init
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
bool comm_peer2peer_present()
Returns true if any peer-to-peer capability is present on this system (regardless of whether it has b...
#define checkCudaErrorNoSync()
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static int index(int ndim, const int *dims, const int *x)
void commGlobalReductionSet(bool global_reduction)
int comm_rank_displaced(const Topology *topo, const int displacement[])
const int * comm_coords(const Topology *topo)
void comm_gather_gpuid(int *gpuid_recv_buf)
Gather all GPU ids.
const char * comm_config_string()
Return a string that defines the P2P/GDR environment variable configuration (for use as a tuneKey to ...
void comm_broadcast(void *data, size_t nbytes)
void commAsyncReductionSet(bool async_reduction)
void comm_allreduce_int(int *data)
int comm_neighbor_rank(int dir, int dim)
#define device_malloc(size)
MsgHandle * comm_declare_strided_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
Topology * comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
__host__ __device__ ValueType abs(ValueType x)
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
MsgHandle * comm_declare_strided_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
static int mod(int a, int b)
void comm_allreduce(double *data)
static bool intranode_enabled[2][4]
void comm_allreduce_max(double *data)
bool comm_intranode_enabled(int dir, int dim)
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
static bool peer2peer_enabled[2][4]
bool commAsyncReduction()
static bool deterministic_reduce
void comm_set_neighbor_ranks(Topology *topo)
Topology * comm_default_topology(void)
const char * comm_dim_partitioned_string(const int *comm_dim_override)
Return a string that defines the comm partitioning (used as a tuneKey)
MsgHandle * comm_declare_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
static int manual_set_partition[QUDA_MAX_DIM]