32 static inline int index(
int ndim,
const int *dims,
const int *
x)
35 for (
int i = 1;
i <
ndim;
i++) {
45 for (
int i =
ndim-1;
i >= 0;
i--) {
46 if (
x[
i] < dims[
i]-1) {
60 static bool cached =
false;
61 static char hostname[128];
64 gethostname(hostname, 128);
84 const double twoneg48 = 0.35527136788005009e-14;
85 const unsigned long int m = 25214903917,
a = 11, mask = 281474976710655;
117 int rank = rank_from_coords(
x, map_data);
126 for (
int i = 0;
i <
ndim;
i++) {
146 {
false,
false,
false,
false} };
150 {
false,
false,
false,
false} };
158 char *enable_peer_to_peer_env =
getenv(
"QUDA_ENABLE_P2P");
162 bool disable_peer_to_peer_bidir =
false;
164 if (enable_peer_to_peer_env) {
177 disable_peer_to_peer_bidir =
true;
192 if(!
prop.unifiedAddressing)
return;
201 for(
int dir=0; dir<2; ++dir){
208 disable_peer_to_peer_bidir &&
comm_dim(
dim) == 2 )
continue;
213 int canAccessPeer[2];
214 cudaDeviceCanAccessPeer(&canAccessPeer[0],
gpuid, neighbor_gpuid);
215 cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid,
gpuid);
217 int accessRank[2] = { };
218 #if CUDA_VERSION >= 8000 // this was introduced with CUDA 8 219 if (canAccessPeer[0]*canAccessPeer[1]) {
220 cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank,
gpuid, neighbor_gpuid);
221 cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid,
gpuid);
226 if (canAccessPeer[0]*canAccessPeer[1] ||
gpuid == neighbor_gpuid) {
229 printf(
"Peer-to-peer enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d, performance rank = (%d, %d)\n",
235 printf(
"Intra-node (non peer-to-peer) enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d\n",
272 static bool init =
false;
273 static bool p2p_global =
false;
278 for (
int dir=0; dir<2; dir++)
283 p2p_global = p2p > 0 ? true :
false;
334 return ((
a %
b) +
b) %
b;
342 coords[
i] = (
i < topo->
ndim) ?
364 errorQuda(
"Default topology has not been declared");
384 for(
int d=0;
d<4; ++
d){
385 int pos_displacement[4] = {0,0,0,0};
386 int neg_displacement[4] = {0,0,0,0};
387 pos_displacement[
d] = +1;
388 neg_displacement[
d] = -1;
422 void *buffer,
int dim,
int dir,
size_t nbytes)
427 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
428 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
432 std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(
tmp));
433 }
catch(std::exception &
e) {
434 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
437 if (
err != cudaSuccess) cudaGetLastError();
442 cudaError_t
err = cudaMemcpy(
tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
443 if (
err != cudaSuccess) {
444 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
445 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
461 void *buffer,
int dim,
int dir,
size_t nbytes)
466 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
467 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
470 std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
471 }
catch(std::exception &
e) {
472 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
475 if (
err != cudaSuccess) cudaGetLastError();
478 cudaError_t
err = cudaMemset(buffer, 0, nbytes);
479 if (
err != cudaSuccess) {
480 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
481 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
496 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
501 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
502 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
506 for (
int i=0;
i<nblocks;
i++)
507 std::copy(static_cast<char*>(buffer)+
i*stride, static_cast<char*>(buffer)+
i*stride+blksize, static_cast<char*>(
tmp));
508 }
catch(std::exception &
e) {
509 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
510 file, line,
func,
dim, dir, blksize, nblocks, stride);
514 if (
err != cudaSuccess) cudaGetLastError();
518 cudaError_t
err = cudaMemcpy2D(
tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
519 if (
err != cudaSuccess) {
520 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
521 file, line,
func,
dim, dir, blksize, nblocks, stride);
522 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
539 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
544 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
545 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
548 for (
int i=0;
i<nblocks;
i++)
549 std::fill(static_cast<char*>(buffer)+
i*stride, static_cast<char*>(buffer)+
i*stride+blksize, 0);
550 }
catch(std::exception &
e) {
551 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
552 file, line,
func,
dim, dir, blksize, nblocks, stride);
555 if (
err != cudaSuccess) cudaGetLastError();
558 cudaError_t
err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
559 if (
err != cudaSuccess) {
560 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
561 file, line,
func,
dim, dir, blksize, nblocks, stride);
562 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
605 for (
int i=0;
i<4;
i++) {
612 static bool gdr_enabled =
false;
614 static bool gdr_init =
false;
617 char *enable_gdr_env =
getenv(
"QUDA_ENABLE_GDR");
618 if (enable_gdr_env &&
strcmp(enable_gdr_env,
"1") == 0) {
628 static bool blacklist =
false;
629 static bool blacklist_init =
false;
631 if (!blacklist_init) {
632 char *blacklist_env =
getenv(
"QUDA_ENABLE_GDR_BLACKLIST");
635 std::stringstream blacklist_list(blacklist_env);
638 cudaGetDeviceCount(&device_count);
641 while (blacklist_list >> excluded_device) {
643 if ( excluded_device < 0 || excluded_device >= device_count ) {
644 errorQuda(
"Cannot blacklist invalid GPU device ordinal %d", excluded_device);
647 if (blacklist_list.peek() ==
',') blacklist_list.ignore();
648 if (excluded_device ==
comm_gpuid()) blacklist =
true;
653 blacklist_init =
true;
static int neighbor_rank[2][4]
int comm_peer2peer_enabled_global()
const int * comm_dims(const Topology *topo)
int commDimPartitioned(int dir)
static bool enable_intranode
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
void reduceDoubleArray(double *sum, const int len)
int comm_dim_partitioned(int dim)
QudaVerbosity getVerbosity()
void comm_set_default_topology(Topology *topo)
MsgHandle * comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
static bool neighbors_cached
void comm_allreduce_array(double *data, size_t size)
static int enable_peer_to_peer
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
void reduceDouble(double &sum)
char * comm_hostname(void)
int comm_ndim(const Topology *topo)
bool comm_peer2peer_enabled(int dir, int dim)
void comm_destroy_topology(Topology *topo)
void comm_enable_intranode(bool enable)
Enable / disable intra-node (non-peer-to-peer) communication.
int comm_rank_from_coords(const Topology *topo, const int *coords)
void comm_dim_partitioned_set(int dim)
MsgHandle * comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
int strcmp(const char *__s1, const char *__s2)
const struct cudaDeviceProp * prop
static unsigned long int rand_seed
const int * comm_coords_from_rank(const Topology *topo, int rank)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
int printf(const char *,...) __attribute__((__format__(__printf__
int my_coords[QUDA_MAX_DIM]
MsgHandle * comm_declare_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
void commDimPartitionedSet(int dir)
void comm_peer2peer_init(const char *hostname_recv_buf)
bool commGlobalReduction()
__host__ __device__ void sum(double &a, double &b)
void reduceMaxDouble(double &max)
void commDimPartitionedReset()
Reset the comm dim partitioned array to zero.
size_t enum cudaMemRangeAttribute * attributes
int(* coords)[QUDA_MAX_DIM]
int(* QudaCommsMap)(const int *coords, void *fdata)
static bool advance_coords(int ndim, const int *dims, int *x)
#define safe_malloc(size)
void comm_dim_partitioned_reset()
static bool peer2peer_init
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
#define checkCudaErrorNoSync()
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static int index(int ndim, const int *dims, const int *x)
void commGlobalReductionSet(bool global_reduction)
int comm_rank_displaced(const Topology *topo, const int displacement[])
int abs(int) __attribute__((const))
const int * comm_coords(const Topology *topo)
void comm_gather_gpuid(int *gpuid_recv_buf)
Gather all GPU ids.
int strncmp(const char *__s1, const char *__s2, size_t __n)
void commAsyncReductionSet(bool async_reduction)
void comm_allreduce_int(int *data)
int comm_neighbor_rank(int dir, int dim)
#define device_malloc(size)
MsgHandle * comm_declare_strided_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
Topology * comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
MsgHandle * comm_declare_strided_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
static int mod(int a, int b)
void comm_allreduce(double *data)
static bool intranode_enabled[2][4]
void comm_allreduce_max(double *data)
bool comm_intranode_enabled(int dir, int dim)
static bool peer2peer_enabled[2][4]
static __inline__ size_t size_t d
bool commAsyncReduction()
void comm_set_neighbor_ranks(Topology *topo)
char * getenv(const char *)
Topology * comm_default_topology(void)
MsgHandle * comm_declare_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
static int manual_set_partition[QUDA_MAX_DIM]