32 static inline int index(
int ndim,
const int *dims,
const int *
x)
35 for (
int i = 1;
i <
ndim;
i++) {
45 for (
int i =
ndim-1;
i >= 0;
i--) {
46 if (
x[
i] < dims[
i]-1) {
60 static bool cached =
false;
61 static char hostname[128];
64 gethostname(hostname, 128);
84 const double twoneg48 = 0.35527136788005009e-14;
85 const unsigned long int m = 25214903917,
a = 11, mask = 281474976710655;
117 int rank = rank_from_coords(
x, map_data);
126 for (
int i = 0;
i <
ndim;
i++) {
146 {
false,
false,
false,
false} };
150 {
false,
false,
false,
false} };
158 char *enable_peer_to_peer_env =
getenv(
"QUDA_ENABLE_P2P");
162 bool disable_peer_to_peer_bidir =
false;
164 if (enable_peer_to_peer_env) {
177 disable_peer_to_peer_bidir =
true;
192 if(!
prop.unifiedAddressing)
return;
201 for(
int dir=0; dir<2; ++dir){
208 disable_peer_to_peer_bidir &&
comm_dim(
dim) == 2 )
continue;
213 int canAccessPeer[2];
214 cudaDeviceCanAccessPeer(&canAccessPeer[0],
gpuid, neighbor_gpuid);
215 cudaDeviceCanAccessPeer(&canAccessPeer[1], neighbor_gpuid,
gpuid);
217 int accessRank[2] = { };
218 #if CUDA_VERSION >= 8000 // this was introduced with CUDA 8 219 if (canAccessPeer[0]*canAccessPeer[1]) {
220 cudaDeviceGetP2PAttribute(&accessRank[0], cudaDevP2PAttrPerformanceRank,
gpuid, neighbor_gpuid);
221 cudaDeviceGetP2PAttribute(&accessRank[1], cudaDevP2PAttrPerformanceRank, neighbor_gpuid,
gpuid);
226 if (canAccessPeer[0]*canAccessPeer[1] ||
gpuid == neighbor_gpuid) {
229 printf(
"Peer-to-peer enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d, performance rank = (%d, %d)\n",
235 printf(
"Intra-node (non peer-to-peer) enabled for rank %d (gpu=%d) with neighbor %d (gpu=%d) dir=%d, dim=%d\n",
272 static bool init =
false;
273 static bool p2p_global =
false;
278 for (
int dir=0; dir<2; dir++)
283 p2p_global = p2p > 0 ? true :
false;
334 return ((
a %
b) +
b) %
b;
342 coords[
i] = (
i < topo->
ndim) ?
364 errorQuda(
"Default topology has not been declared");
384 for(
int d=0;
d<4; ++
d){
385 int pos_displacement[4] = {0,0,0,0};
386 int neg_displacement[4] = {0,0,0,0};
387 pos_displacement[
d] = +1;
388 neg_displacement[
d] = -1;
422 void *buffer,
int dim,
int dir,
size_t nbytes)
427 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
428 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
432 std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(
tmp));
433 }
catch(std::exception &
e) {
434 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
437 if (
err != cudaSuccess) cudaGetLastError();
442 cudaError_t
err = cudaMemcpy(
tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
443 if (
err != cudaSuccess) {
444 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
445 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
461 void *buffer,
int dim,
int dir,
size_t nbytes)
466 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
467 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
470 std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
471 }
catch(std::exception &
e) {
472 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
475 if (
err != cudaSuccess) cudaGetLastError();
478 cudaError_t
err = cudaMemset(buffer, 0, nbytes);
479 if (
err != cudaSuccess) {
480 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line,
func,
dim, dir, nbytes);
481 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
496 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
501 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
502 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
506 for (
int i=0;
i<nblocks;
i++)
507 std::copy(static_cast<char*>(buffer)+
i*stride, static_cast<char*>(buffer)+
i*stride+blksize, static_cast<char*>(
tmp));
508 }
catch(std::exception &
e) {
509 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
510 file, line,
func,
dim, dir, blksize, nblocks, stride);
514 if (
err != cudaSuccess) cudaGetLastError();
518 cudaError_t
err = cudaMemcpy2D(
tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
519 if (
err != cudaSuccess) {
520 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
521 file, line,
func,
dim, dir, blksize, nblocks, stride);
522 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
539 void *buffer,
int dim,
int dir,
size_t blksize,
int nblocks,
size_t stride)
544 cudaError_t
err = cudaPointerGetAttributes(&
attributes, buffer);
545 if (
err != cudaSuccess ||
attributes.memoryType == cudaMemoryTypeHost) {
548 for (
int i=0;
i<nblocks;
i++)
549 std::fill(static_cast<char*>(buffer)+
i*stride, static_cast<char*>(buffer)+
i*stride+blksize, 0);
550 }
catch(std::exception &
e) {
551 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
552 file, line,
func,
dim, dir, blksize, nblocks, stride);
555 if (
err != cudaSuccess) cudaGetLastError();
558 cudaError_t
err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
559 if (
err != cudaSuccess) {
560 printfQuda(
"ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
561 file, line,
func,
dim, dir, blksize, nblocks, stride);
562 errorQuda(
"aborting with error %s", cudaGetErrorString(
err));
605 for (
int i=0;
i<4;
i++) {
612 static bool gdr_enabled =
false;
614 static bool gdr_init =
false;
617 char *enable_gdr_env =
getenv(
"QUDA_ENABLE_GDR");
618 if (enable_gdr_env &&
strcmp(enable_gdr_env,
"1") == 0) {
628 static bool blacklist =
false;
629 static bool blacklist_init =
false;
631 if (!blacklist_init) {
632 char *blacklist_env =
getenv(
"QUDA_ENABLE_GDR_BLACKLIST");
635 std::stringstream blacklist_list(blacklist_env);
638 cudaGetDeviceCount(&device_count);
641 while (blacklist_list >> excluded_device) {
643 if ( excluded_device < 0 || excluded_device >= device_count ) {
644 errorQuda(
"Cannot blacklist invalid GPU device ordinal %d", excluded_device);
647 if (blacklist_list.peek() ==
',') blacklist_list.ignore();
648 if (excluded_device ==
comm_gpuid()) blacklist =
true;
653 blacklist_init =
true;
static int neighbor_rank[2][4]
int comm_peer2peer_enabled_global()
const int * comm_dims(const Topology *topo)
int commDimPartitioned(int dir)
static bool enable_intranode
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
void reduceDoubleArray(double *sum, const int len)
int comm_dim_partitioned(int dim)
QudaVerbosity getVerbosity()
void comm_set_default_topology(Topology *topo)
MsgHandle * comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
static bool neighbors_cached
void comm_allreduce_array(double *data, size_t size)
static int enable_peer_to_peer
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
void reduceDouble(double &sum)
char * comm_hostname(void)
int comm_ndim(const Topology *topo)
bool comm_peer2peer_enabled(int dir, int dim)
void comm_destroy_topology(Topology *topo)
void comm_enable_intranode(bool enable)
Enable / disable intra-node (non-peer-to-peer) communication.
int comm_rank_from_coords(const Topology *topo, const int *coords)
void comm_dim_partitioned_set(int dim)
MsgHandle * comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks, size_t stride)
int strcmp(const char *__s1, const char *__s2)
const struct cudaDeviceProp * prop
static unsigned long int rand_seed
const int * comm_coords_from_rank(const Topology *topo, int rank)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
int printf(const char *,...) __attribute__((__format__(__printf__
int my_coords[QUDA_MAX_DIM]
MsgHandle * comm_declare_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
void commDimPartitionedSet(int dir)
void comm_peer2peer_init(const char *hostname_recv_buf)
bool commGlobalReduction()
__host__ __device__ void sum(double &a, double &b)
void reduceMaxDouble(double &max)
void commDimPartitionedReset()
Reset the comm dim partitioned array to zero.
size_t enum cudaMemRangeAttribute * attributes
int(* coords)[QUDA_MAX_DIM]
int(* QudaCommsMap)(const int *coords, void *fdata)
static bool advance_coords(int ndim, const int *dims, int *x)
#define safe_malloc(size)
void comm_dim_partitioned_reset()
static bool peer2peer_init
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
#define checkCudaErrorNoSync()
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static int index(int ndim, const int *dims, const int *x)
void commGlobalReductionSet(bool global_reduction)
int comm_rank_displaced(const Topology *topo, const int displacement[])
int abs(int) __attribute__((const))
const int * comm_coords(const Topology *topo)
void comm_gather_gpuid(int *gpuid_recv_buf)
Gather all GPU ids.
int strncmp(const char *__s1, const char *__s2, size_t __n)
void commAsyncReductionSet(bool async_reduction)
void comm_allreduce_int(int *data)
int comm_neighbor_rank(int dir, int dim)
#define device_malloc(size)
MsgHandle * comm_declare_strided_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
Topology * comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
MsgHandle * comm_declare_strided_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
static int mod(int a, int b)
void comm_allreduce(double *data)
static bool intranode_enabled[2][4]
void comm_allreduce_max(double *data)
bool comm_intranode_enabled(int dir, int dim)
static bool peer2peer_enabled[2][4]
static __inline__ size_t size_t d
bool commAsyncReduction()
void comm_set_neighbor_ranks(Topology *topo)
char * getenv(const char *)
Topology * comm_default_topology(void)
MsgHandle * comm_declare_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir, size_t nbytes)
static int manual_set_partition[QUDA_MAX_DIM]