#include <shmem_helper.cuh>
LatticeFieldParam::LatticeFieldParam(const LatticeField &field) :
  precision(field.Precision()), ghost_precision(field.Precision()),
  nDim(field.Ndim()), pad(field.Pad()),
  siteSubset(field.SiteSubset()), mem_type(field.MemType()),
  ghostExchange(field.GhostExchange()), scale(field.Scale())
{
  for (int dir = 0; dir < nDim; ++dir) {
    x[dir] = field.X()[dir];
    r[dir] = field.R()[dir];
  }
}
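// A minimal usage sketch (assuming an existing, fully constructed
// LatticeField `field`): the constructor above captures the field's
// geometry and layout so a compatible field can be created later.
//
//   LatticeFieldParam param(field);               // copies precision, nDim, pad, x[], r[]
//   param.ghost_precision = QUDA_HALF_PRECISION;  // members may be overridden before reuse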
LatticeField::LatticeField(const LatticeFieldParam &param) :
  precision(param.Precision()),
  ghost_precision(param.GhostPrecision()),
  ghost_precision_reset(false),
  // ...
  siteSubset(param.siteSubset),
  ghostExchange(param.ghostExchange),
  // ...
  ghost_face_bytes_aligned {},
  // ...
  mem_type(param.mem_type),
  // ...
  backup_norm_h(nullptr),
  // ...
{
  for (int dir = 0; dir < 2; dir++) {
    // ...
    for (int b = 0; b < 2; b++) {
      // ...
    }
  }
  // ...
  for (int i = 0; i < nDim; i++) {
    // ...
    for (int j = 0; j < nDim; j++) {
      // ...
    }
  }
  // ...
  for (int i = 0; i < nDim; i++)
    /* ... */;
}
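// The nested i/j loops above follow the standard pattern for per-dimension
// surface volumes: surface[i] is the product of all extents except x[i].
// A self-contained sketch of that pattern (the surface/surfaceCB names match
// the class members; the 4-dim extents are illustrative):
//
//   int x[4] = {8, 8, 8, 16};
//   int surface[4], surfaceCB[4];
//   for (int i = 0; i < 4; i++) {
//     surface[i] = 1;
//     for (int j = 0; j < 4; j++) {
//       if (i == j) continue;
//       surface[i] *= x[j];
//     }
//     surfaceCB[i] = surface[i] / 2; // checkerboard (single-parity) surface
//   }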
LatticeField::LatticeField(const LatticeField &field) :
  precision(field.precision),
  ghost_precision(field.ghost_precision),
  ghost_precision_reset(false),
  // ...
  siteSubset(field.siteSubset),
  ghostExchange(field.ghostExchange),
  // ...
  ghost_face_bytes_aligned {},
  // ...
  mem_type(field.mem_type),
  // ...
  backup_norm_h(nullptr),
  // ...
{
  for (int dir = 0; dir < 2; dir++) {
    // ...
  }
  // ...
  for (int i = 0; i < nDim; i++) {
    // ...
    for (int j = 0; j < nDim; j++) {
      // ...
    }
  }
  // ...
  for (int i = 0; i < nDim; i++)
    /* ... */;
}
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
for (int b = 0; b < 2; ++b) { /* ... */ }
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
for (int b = 0; b < 2; ++b) { /* ... */ }
// ...
for (int b = 0; b < 2; ++b) { /* ... */ }
// ...
for (int b = 0; b < 2; ++b) { /* ... */ }
// ...
for (int b = 0; b < 2; ++b) { /* ... */ }
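// The b = 0,1 loops throughout this file walk QUDA's two sets of
// communication buffers (note the [2]-sized ghost buffer members), which are
// alternated between successive halo exchanges so one exchange can be
// pipelined against the next. A sketch of the idea, assuming QUDA's global
// `bufferIndex` that tracks the active set:
//
//   extern int bufferIndex;                          // QUDA global
//   void *send = ghost_send_buffer_d[bufferIndex];   // use the active set
//   // ... post sends/receives against `send` ...
//   bufferIndex = 1 - bufferIndex;                   // flip for the next exchange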
// ...
errorQuda("ghost_field appears not to be allocated");
// ...
#ifndef NVSHMEM_COMMS
// handles for obtaining the remote ghost pointers
cudaIpcMemHandle_t ipcRemoteGhostDestHandle[2][2][QUDA_MAX_DIM];
#endif
// ...
for (int b = 0; b < 2; b++) {
#ifndef NVSHMEM_COMMS
  // ...
  for (int dir = 0; dir < 2; ++dir) {
    // ...
    int disp = (dir == 1) ? +1 : -1;
    // ...
    MsgHandle *receiveHandle = comm_declare_receive_relative(&ipcRemoteGhostDestHandle[b][1 - dir][dim], dim, -disp,
                                                             sizeof(ipcRemoteGhostDestHandle[b][1 - dir][dim]));
    // ...
    cudaIpcMemHandle_t ipcLocalGhostDestHandle;
    // ...
    MsgHandle *sendHandle
      = comm_declare_send_relative(&ipcLocalGhostDestHandle, dim, disp, sizeof(ipcLocalGhostDestHandle));
    // ...
    if (receiveHandle) comm_wait(receiveHandle);
    // ...
    if (receiveHandle) comm_free(receiveHandle);
    // ...
  }
#endif
  // ...
  for (int dir = 0; dir < num_dir; ++dir) {
#ifndef NVSHMEM_COMMS
    // map the neighbor's ghost buffer into this process's address space
    cudaIpcOpenMemHandle(ghostDest, ipcRemoteGhostDestHandle[b][dir][dim], cudaIpcMemLazyEnablePeerAccess);
    // ...
#endif
  }
  // ...
}
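// The exchange above is the standard CUDA IPC pattern: ship a
// cudaIpcMemHandle_t to a neighboring process, then open it to obtain a
// directly addressable device pointer. A standalone sketch of the same
// technique using raw MPI in place of QUDA's comm_* wrappers (the function
// name and two-rank setup are illustrative assumptions):

#include <cuda_runtime.h>
#include <mpi.h>

void exchange_ipc_buffers(void **peer_ptr)
{
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // each rank allocates a device buffer and derives an IPC handle for it
  void *local = nullptr;
  cudaMalloc(&local, 1 << 20);
  cudaIpcMemHandle_t local_handle, remote_handle;
  cudaIpcGetMemHandle(&local_handle, local);

  // swap handles with the peer (two ranks assumed)
  int peer = 1 - rank;
  MPI_Sendrecv(&local_handle, sizeof(local_handle), MPI_BYTE, peer, 0,
               &remote_handle, sizeof(remote_handle), MPI_BYTE, peer, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  // map the peer's allocation into this process's address space
  cudaIpcOpenMemHandle(peer_ptr, remote_handle, cudaIpcMemLazyEnablePeerAccess);
}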
// handles for the remote inter-process copy events
cudaIpcEventHandle_t ipcRemoteEventHandle[2][2][QUDA_MAX_DIM];
// ...
for (int dir = 0; dir < 2; ++dir) {
  for (int b = 0; b < 2; b++) {
    // ...
    int disp = (dir == 1) ? +1 : -1;
    // ...
    MsgHandle *receiveHandle = comm_declare_receive_relative(&ipcRemoteEventHandle[b][1 - dir][dim], dim, -disp,
                                                             sizeof(ipcRemoteEventHandle[b][1 - dir][dim]));
    // ...
    cudaIpcEventHandle_t ipcLocalEventHandle;
    // ...
    cudaEventCreate(&ipcCopyEvent[b][dir][dim], cudaEventDisableTiming | cudaEventInterprocess);
    cudaIpcGetEventHandle(&ipcLocalEventHandle, ipcCopyEvent[b][dir][dim]);
    // ...
    MsgHandle *sendHandle = comm_declare_send_relative(&ipcLocalEventHandle, dim, disp, sizeof(ipcLocalEventHandle));
    // ...
    if (receiveHandle) comm_wait(receiveHandle);
    // ...
    if (receiveHandle) comm_free(receiveHandle);
    // ...
  }
}
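// Interprocess CUDA events in isolation: a minimal sketch of creating an
// event a peer process can synchronize on, mirroring the memory-handle
// exchange above (function name and two-rank MPI setup are illustrative
// assumptions):

void exchange_ipc_events(cudaEvent_t *local_event, cudaEvent_t *remote_event)
{
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // interprocess events must be created with timing disabled
  cudaEventCreateWithFlags(local_event, cudaEventDisableTiming | cudaEventInterprocess);

  cudaIpcEventHandle_t local_handle, remote_handle;
  cudaIpcGetEventHandle(&local_handle, *local_event);

  int peer = 1 - rank; // two ranks assumed
  MPI_Sendrecv(&local_handle, sizeof(local_handle), MPI_BYTE, peer, 1,
               &remote_handle, sizeof(remote_handle), MPI_BYTE, peer, 1,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  // open the peer's event: record on one side, cudaStreamWaitEvent on the other
  cudaIpcOpenEventHandle(remote_event, remote_handle);
}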
// ...
for (int dir = 0; dir < 2; ++dir) {
  // ...
  for (int b = 0; b < 2; b++) {
    // ...
  }
}
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
#ifndef NVSHMEM_COMMS
// ...
for (int b = 0; b < 2; b++) { /* ... */ }
// ...
#endif
// ...
#ifndef NVSHMEM_COMMS
// ...
#endif
// ...
#ifndef NVSHMEM_COMMS
// ...
#endif
// ...
for (int d = 1; d < nDim; d++) {
  // ...
}
size_t a_volume_interior = 1;
for (int i = 0; i < nDim; i++) {
  if (a.x[i] - 2 * a.r[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i] - 2 * a.r[i]);
  a_volume_interior *= a.x[i] - 2 * a.r[i];
}
if (a_volume_interior != volume) errorQuda("Interior volume does not match %lu %lu", volume, a_volume_interior);
// ...
size_t this_volume_interior = 1;
for (int i = 0; i < nDim; i++) {
  if (x[i] - 2 * r[i] != a.x[i]) errorQuda("x[%d] does not match %d %d", i, x[i] - 2 * r[i], a.x[i]);
  this_volume_interior *= x[i] - 2 * r[i];
}
if (this_volume_interior != a.volume)
  errorQuda("Interior volume does not match %lu %lu", this_volume_interior, a.volume);
// ...
for (int i = 0; i < nDim; i++) {
  if (a.x[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i]);
}
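// The interior-volume check in numbers: an extended field with extents
// x[] = {12, 12, 12, 20} and halo radius r[] = {2, 2, 2, 2} has interior
// extents x[i] - 2*r[i] = {8, 8, 8, 16}, which must reproduce the unextended
// field exactly (extents are illustrative):
//
//   size_t interior = 1;
//   int x[4] = {12, 12, 12, 20}, r[4] = {2, 2, 2, 2};
//   for (int i = 0; i < 4; i++) interior *= x[i] - 2 * r[i];
//   // interior == 8 * 8 * 8 * 16 == 8192, the volume of the matching field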
// ...
errorQuda("Unknown field %s, so cannot determine location", typeid(*this).name());
// ...
return static_cast<int>(csField.FieldOrder());
// ...
if (gField.Order() == 2 || gField.Order() == 4)
  return static_cast<int>(gField.Order());
// ...
if (cField.Order() == 2 || cField.Order() == 4)
  return static_cast<int>(cField.Order());
// ...
output << "nDim = " << param.nDim << std::endl;
for (int i = 0; i < param.nDim; i++) {
  output << "x[" << i << "] = " << param.x[i] << std::endl;
}
output << "pad = " << param.pad << std::endl;
output << "precision = " << param.Precision() << std::endl;
output << "ghost_precision = " << param.GhostPrecision() << std::endl;
output << "scale = " << param.scale << std::endl;
// ...
output << "ghostExchange = " << param.ghostExchange << std::endl;
for (int i = 0; i < param.nDim; i++) {
  output << "r[" << i << "] = " << param.r[i] << std::endl;
}
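// A usage sketch for the stream operator above (the param settings are
// illustrative):
//
//   LatticeFieldParam param;
//   param.nDim = 4;
//   std::cout << param;  // prints nDim, x[], pad, precisions, scale, ghostExchange, r[]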