QUDA v1.1.0
A library for QCD on GPUs
lattice_field.cpp
1 #include <typeinfo>
2 #include <quda_internal.h>
3 #include <lattice_field.h>
4 #include <color_spinor_field.h>
5 #include <gauge_field.h>
6 #include <clover_field.h>
7 
8 #include <shmem_helper.cuh>
9 
10 namespace quda {
11 
12  bool LatticeField::initIPCComms = false;
13 
18 
23 
24  cudaEvent_t LatticeField::ipcCopyEvent[2][2][QUDA_MAX_DIM];
26 
27  void *LatticeField::ghost_pinned_send_buffer_h[2] = {nullptr, nullptr};
28  void *LatticeField::ghost_pinned_send_buffer_hd[2] = {nullptr, nullptr};
29 
30  void *LatticeField::ghost_pinned_recv_buffer_h[2] = {nullptr, nullptr};
31  void *LatticeField::ghost_pinned_recv_buffer_hd[2] = {nullptr, nullptr};
32 
33  // gpu ghost receive buffer
34  void *LatticeField::ghost_recv_buffer_d[2] = {nullptr, nullptr};
35 
36  // gpu ghost send buffer
37  void *LatticeField::ghost_send_buffer_d[2] = {nullptr, nullptr};
38 
40 
42 
44 
46 
48 
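
The static definitions for the remaining communication state (the remote send-buffer pointers, IPC events, peer-to-peer message handles and their flag buffers, and the ghost-buffer bookkeeping flags, all referenced later in this file) take roughly the following form; this is a sketch based on their declarations in lattice_field.h, not the verbatim lines:

  void *LatticeField::ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2];
  cudaEvent_t LatticeField::ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_send_p2p_fwd[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_send_p2p_back[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_recv_p2p_fwd[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_recv_p2p_back[2][QUDA_MAX_DIM];
  int LatticeField::buffer_send_p2p_fwd[2][QUDA_MAX_DIM];
  int LatticeField::buffer_send_p2p_back[2][QUDA_MAX_DIM];
  int LatticeField::buffer_recv_p2p_fwd[2][QUDA_MAX_DIM];
  int LatticeField::buffer_recv_p2p_back[2][QUDA_MAX_DIM];
  bool LatticeField::initGhostFaceBuffer = false;
  size_t LatticeField::ghostFaceBytes = 0;
  bool LatticeField::ghost_field_reset = false;
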
49  LatticeFieldParam::LatticeFieldParam(const LatticeField &field)
50  : precision(field.Precision()), ghost_precision(field.Precision()),
51  nDim(field.Ndim()), pad(field.Pad()),
52  siteSubset(field.SiteSubset()), mem_type(field.MemType()),
53  ghostExchange(field.GhostExchange()), scale(field.Scale())
54  {
55  for(int dir=0; dir<nDim; ++dir) {
56  x[dir] = field.X()[dir];
57  r[dir] = field.R()[dir];
58  }
59  }
60 
61  LatticeField::LatticeField(const LatticeFieldParam &param) :
62  volume(1),
63  localVolume(1),
64  pad(param.pad),
65  total_bytes(0),
66  nDim(param.nDim),
67  precision(param.Precision()),
68  ghost_precision(param.GhostPrecision()),
69  ghost_precision_reset(false),
70  scale(param.scale),
71  siteSubset(param.siteSubset),
72  ghostExchange(param.ghostExchange),
73  ghost_bytes(0),
74  ghost_bytes_old(0),
75  ghost_face_bytes {},
76  ghost_face_bytes_aligned {},
77  ghost_offset(),
78  my_face_h {},
79  my_face_hd {},
80  my_face_d {},
81  from_face_h {},
82  from_face_hd {},
83  from_face_d {},
84  initComms(false),
85  mem_type(param.mem_type),
86  backup_h(nullptr),
87  backup_norm_h(nullptr),
88  backed_up(false)
89  {
91 
92  for (int dir = 0; dir < 2; dir++) { // XLC cannot do multi-dimensional array initialization
93  for (int dim = 0; dim < QUDA_MAX_DIM; dim++) {
94 
95  for (int b = 0; b < 2; b++) {
96  my_face_dim_dir_d[b][dim][dir] = nullptr;
97  my_face_dim_dir_hd[b][dim][dir] = nullptr;
98  my_face_dim_dir_h[b][dim][dir] = nullptr;
99 
100  from_face_dim_dir_d[b][dim][dir] = nullptr;
101  from_face_dim_dir_hd[b][dim][dir] = nullptr;
102  from_face_dim_dir_h[b][dim][dir] = nullptr;
103  }
104 
105  mh_recv_fwd[dir][dim] = nullptr;
106  mh_recv_back[dir][dim] = nullptr;
107  mh_send_fwd[dir][dim] = nullptr;
108  mh_send_back[dir][dim] = nullptr;
109 
110  mh_recv_rdma_fwd[dir][dim] = nullptr;
111  mh_recv_rdma_back[dir][dim] = nullptr;
112  mh_send_rdma_fwd[dir][dim] = nullptr;
113  mh_send_rdma_back[dir][dim] = nullptr;
114  }
115  }
116 
117  for (int i=0; i<nDim; i++) {
118  x[i] = param.x[i];
119  r[i] = ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED ? param.r[i] : 0;
120  volume *= param.x[i];
121  localVolume *= (x[i] - 2 * r[i]);
122  surface[i] = 1;
123  for (int j=0; j<nDim; j++) {
124  if (i==j) continue;
125  surface[i] *= param.x[j];
126  }
127  }
128 
129  if (siteSubset == QUDA_INVALID_SITE_SUBSET) errorQuda("siteSubset is not set");
130 
131  volumeCB = siteSubset == QUDA_FULL_SITE_SUBSET ? volume / 2 : volume;
132  stride = volumeCB + pad;
133 
134  // for parity fields the factor of one half is already present in all surface dimensions except x, so add it manually
135  for (int i=0; i<nDim; i++)
136  surfaceCB[i] = (siteSubset == QUDA_FULL_SITE_SUBSET || i==0) ? surface[i] / 2 : surface[i];
137 
138  // for 5-dimensional fields, we only communicate in the space-time dimensions
139  nDimComms = nDim == 5 ? 4 : nDim;
140 
141  switch (precision) {
142  case QUDA_DOUBLE_PRECISION:
143  case QUDA_SINGLE_PRECISION:
144  case QUDA_HALF_PRECISION:
145  case QUDA_QUARTER_PRECISION:
146  break;
147  default:
148  errorQuda("Unknown precision %d", precision);
149  }
150 
151  setTuningString();
152  }
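
To make the bookkeeping above concrete, the following standalone sketch (not QUDA code; the extents and pad are made-up example values) reproduces the volume, surface and stride arithmetic of this constructor for a full-site-subset 4-dimensional lattice:

  #include <cstdio>

  int main()
  {
    const int nDim = 4;
    int x[nDim] = {16, 16, 16, 32}; // example local lattice extents
    int pad = 0;                    // example pad

    size_t volume = 1;
    int surface[nDim];
    for (int i = 0; i < nDim; i++) {
      volume *= x[i];
      surface[i] = 1;
      for (int j = 0; j < nDim; j++)
        if (i != j) surface[i] *= x[j];
    }

    // full-site-subset field: the checkerboard volume is half the full volume
    size_t volumeCB = volume / 2;
    size_t stride = volumeCB + pad;

    // checkerboard surfaces: halved in every dimension for a full field
    int surfaceCB[nDim];
    for (int i = 0; i < nDim; i++) surfaceCB[i] = surface[i] / 2;

    printf("volume = %zu, volumeCB = %zu, stride = %zu\n", volume, volumeCB, stride);
    for (int i = 0; i < nDim; i++)
      printf("surface[%d] = %d, surfaceCB[%d] = %d\n", i, surface[i], i, surfaceCB[i]);
    return 0;
  }

For a 16x16x16x32 local lattice this gives volume = 131072, volumeCB = 65536, and surface[0] = 8192.
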
153 
154  LatticeField::LatticeField(const LatticeField &field) :
155  volume(1),
156  localVolume(1),
157  pad(field.pad),
158  total_bytes(0),
159  nDim(field.nDim),
160  precision(field.precision),
161  ghost_precision(field.ghost_precision),
162  ghost_precision_reset(false),
163  scale(field.scale),
164  siteSubset(field.siteSubset),
165  ghostExchange(field.ghostExchange),
166  ghost_bytes(0),
167  ghost_bytes_old(0),
168  ghost_face_bytes {},
169  ghost_face_bytes_aligned {},
170  ghost_offset(),
171  my_face_h {},
172  my_face_hd {},
173  my_face_d {},
174  from_face_h {},
175  from_face_hd {},
176  from_face_d {},
177  initComms(false),
178  mem_type(field.mem_type),
179  backup_h(nullptr),
180  backup_norm_h(nullptr),
181  backed_up(false)
182  {
183  precisionCheck();
184 
185  for (int dir = 0; dir < 2; dir++) { // XLC cannot do multi-dimensional array initialization
186  for (int dim = 0; dim < QUDA_MAX_DIM; dim++) {
187  mh_recv_fwd[dir][dim] = nullptr;
188  mh_recv_back[dir][dim] = nullptr;
189  mh_send_fwd[dir][dim] = nullptr;
190  mh_send_back[dir][dim] = nullptr;
191 
192  mh_recv_rdma_fwd[dir][dim] = nullptr;
193  mh_recv_rdma_back[dir][dim] = nullptr;
194  mh_send_rdma_fwd[dir][dim] = nullptr;
195  mh_send_rdma_back[dir][dim] = nullptr;
196  }
197  }
198 
199  for (int i=0; i<nDim; i++) {
200  x[i] = field.x[i];
201  r[i] = ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED ? field.r[i] : 0;
202  volume *= field.x[i];
203  localVolume *= (x[i] - 2 * r[i]);
204  surface[i] = 1;
205  for (int j=0; j<nDim; j++) {
206  if (i==j) continue;
207  surface[i] *= field.x[j];
208  }
209  }
210 
211  if (siteSubset == QUDA_INVALID_SITE_SUBSET) errorQuda("siteSubset is not set");
212 
213  volumeCB = siteSubset == QUDA_FULL_SITE_SUBSET ? volume / 2 : volume;
214  stride = volumeCB + pad;
215 
216  // for parity fields the factor of one half is already present in all surface dimensions except x, so add it manually
217  for (int i=0; i<nDim; i++)
218  surfaceCB[i] = (siteSubset == QUDA_FULL_SITE_SUBSET || i==0) ? surface[i] / 2 : surface[i];
219 
220  // for 5-dimensional fields, we only communicate in the space-time dimensions
221  nDimComms = nDim == 5 ? 4 : nDim;
222 
223  setTuningString();
224  }
225 
227 
228  void LatticeField::allocateGhostBuffer(size_t ghost_bytes) const
229  {
230  // only allocate if not already allocated, or if the required buffer is bigger than previously allocated
231  if (!initGhostFaceBuffer || ghost_bytes > ghostFaceBytes) {
232 
233  if (initGhostFaceBuffer) {
234  if (ghostFaceBytes) {
235  // remove potential for inter-process race conditions
236  // ensures that all outstanding communication is complete
237  // before we free any comms buffers
239  comm_barrier();
240  for (int b=0; b<2; b++) {
245  }
246  }
247  }
248 
249  if (ghost_bytes > 0) {
250  for (int b = 0; b < 2; ++b) {
251  // gpu receive buffer (use pinned allocator to avoid this being redirected, e.g., by QDPJIT)
253  // silence any false cuda-memcheck initcheck errors
255 
256  // gpu send buffer (use pinned allocator to avoid this being redirected, e.g., by QDPJIT)
258  // silence any false cuda-memcheck initcheck errors
260 
261  // pinned buffer used for sending
263 
264  // set the matching device-mapped pointer
266 
267  // pinned buffer used for receiving
269 
270  // set the matching device-mapped pointer
272  }
273 
274  initGhostFaceBuffer = true;
276  }
277 
278  LatticeField::ghost_field_reset = true; // this signals that we must reset the IPC comms
279  }
280 
281  }
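
The per-buffer allocations referenced by the comments in the loop above use QUDA's pinned/mapped allocator macros from malloc_quda.h (device_comms_pinned_malloc, mapped_malloc, get_mapped_device_pointer) together with qudaMemset; a sketch of the likely sequence, not the verbatim lines:

    // sketch of the per-buffer allocations (reconstruction, not verbatim)
    ghost_recv_buffer_d[b] = device_comms_pinned_malloc(ghost_bytes);
    qudaMemset(ghost_recv_buffer_d[b], 0, ghost_bytes); // silence false initcheck errors

    ghost_send_buffer_d[b] = device_comms_pinned_malloc(ghost_bytes);
    qudaMemset(ghost_send_buffer_d[b], 0, ghost_bytes);

    ghost_pinned_send_buffer_h[b] = mapped_malloc(ghost_bytes);
    ghost_pinned_send_buffer_hd[b] = get_mapped_device_pointer(ghost_pinned_send_buffer_h[b]);

    ghost_pinned_recv_buffer_h[b] = mapped_malloc(ghost_bytes);
    ghost_pinned_recv_buffer_hd[b] = get_mapped_device_pointer(ghost_pinned_recv_buffer_h[b]);

After the loop, ghostFaceBytes is presumably updated to ghost_bytes, so that later calls only reallocate when a strictly larger buffer is requested.
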
282 
283  void LatticeField::freeGhostBuffer(void)
284  {
285  destroyIPCComms();
286 
287  if (!initGhostFaceBuffer) return;
288 
289  for (int b=0; b<2; b++) {
290  // free receive buffer
292  ghost_recv_buffer_d[b] = nullptr;
293 
294  // free send buffer
296  ghost_send_buffer_d[b] = nullptr;
297 
298  // free pinned send memory buffer
300 
301  // free pinned send memory buffer
303 
304  ghost_pinned_recv_buffer_h[b] = nullptr;
305  ghost_pinned_recv_buffer_hd[b] = nullptr;
306  ghost_pinned_send_buffer_h[b] = nullptr;
307  ghost_pinned_send_buffer_hd[b] = nullptr;
308  }
309  initGhostFaceBuffer = false;
310  }
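
The deallocations in the loop above pair with the allocations sketched earlier; a sketch (again not verbatim) using the corresponding free macros from malloc_quda.h:

    device_comms_pinned_free(ghost_recv_buffer_d[b]);
    device_comms_pinned_free(ghost_send_buffer_d[b]);
    host_free(ghost_pinned_send_buffer_h[b]);
    host_free(ghost_pinned_recv_buffer_h[b]);
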
311 
312  void LatticeField::createComms(bool no_comms_fill, bool bidir)
313  {
314  destroyComms(); // if we are requesting a new number of faces destroy and start over
315 
316  // before allocating local comm handles, synchronize since the
317  // comms buffers are static so remove the potential for interfering
318  // with any outstanding exchanges to the same buffers
320  comm_barrier();
321 
322  // initialize the ghost pinned buffers
323  for (int b=0; b<2; b++) {
330  }
331 
332  // initialize ghost send pointers
333  for (int i=0; i<nDimComms; i++) {
334  if (!commDimPartitioned(i) && no_comms_fill==false) continue;
335 
336  for (int b=0; b<2; ++b) {
337  my_face_dim_dir_h[b][i][0] = static_cast<char *>(my_face_h[b]) + ghost_offset[i][0];
338  from_face_dim_dir_h[b][i][0] = static_cast<char *>(from_face_h[b]) + ghost_offset[i][0];
339 
340  my_face_dim_dir_hd[b][i][0] = static_cast<char *>(my_face_hd[b]) + ghost_offset[i][0];
341  from_face_dim_dir_hd[b][i][0] = static_cast<char *>(from_face_hd[b]) + ghost_offset[i][0];
342 
343  my_face_dim_dir_d[b][i][0] = static_cast<char *>(my_face_d[b]) + ghost_offset[i][0];
344  from_face_dim_dir_d[b][i][0] = static_cast<char *>(from_face_d[b]) + ghost_offset[i][0];
345  } // loop over b
346 
347  for (int b=0; b<2; ++b) {
348  my_face_dim_dir_h[b][i][1] = static_cast<char *>(my_face_h[b]) + ghost_offset[i][1];
349  from_face_dim_dir_h[b][i][1] = static_cast<char *>(from_face_h[b]) + ghost_offset[i][1];
350 
351  my_face_dim_dir_hd[b][i][1] = static_cast<char *>(my_face_hd[b]) + ghost_offset[i][1];
352  from_face_dim_dir_hd[b][i][1] = static_cast<char *>(from_face_hd[b]) + ghost_offset[i][1];
353 
354  my_face_dim_dir_d[b][i][1] = static_cast<char *>(my_face_d[b]) + ghost_offset[i][1];
355  from_face_dim_dir_d[b][i][1] = static_cast<char *>(from_face_d[b]) + ghost_offset[i][1];
356  } // loop over b
357 
358  } // loop over dimension
359 
360  bool gdr = comm_gdr_enabled(); // only allocate rdma buffers if GDR enabled
361 
362  // initialize the message handlers
363  for (int i=0; i<nDimComms; i++) {
364  if (!commDimPartitioned(i)) continue;
365 
366  for (int b=0; b<2; ++b) {
369 
372 
373  mh_send_rdma_fwd[b][i] = gdr ? comm_declare_send_relative(my_face_dim_dir_d[b][i][1], i, +1, ghost_face_bytes[i]) : nullptr;
374  mh_send_rdma_back[b][i] = gdr ? comm_declare_send_relative(my_face_dim_dir_d[b][i][0], i, -1, ghost_face_bytes[i]) : nullptr;
375 
376  mh_recv_rdma_fwd[b][i] = gdr ? comm_declare_receive_relative(from_face_dim_dir_d[b][i][1], i, +1, ghost_face_bytes[i]) : nullptr;
377  mh_recv_rdma_back[b][i] = gdr ? comm_declare_receive_relative(from_face_dim_dir_d[b][i][0], i, -1, ghost_face_bytes[i]) : nullptr;
378  } // loop over b
379 
380  } // loop over dimension
381 
382  initComms = true;
383  checkCudaError();
384  }
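
The pointer setup above carves each flat ghost buffer into per-dimension, per-direction segments via ghost_offset. The standalone sketch below (example sizes only, not QUDA code) shows the same base-pointer-plus-offset indexing, assuming the backward (index 0) and forward (index 1) faces of each dimension are laid out consecutively:

  #include <cstddef>
  #include <cstdio>

  int main()
  {
    const int nDimComms = 4;
    // example per-dimension face sizes in bytes (already padded/aligned)
    size_t face_bytes[nDimComms] = {1024, 2048, 4096, 8192};

    size_t offset[nDimComms][2];
    size_t total = 0;
    for (int i = 0; i < nDimComms; i++) {
      offset[i][0] = total; total += face_bytes[i]; // backward face
      offset[i][1] = total; total += face_bytes[i]; // forward face
    }

    // a single flat buffer of size 'total' then yields one pointer per face
    char *base = new char[total];
    for (int i = 0; i < nDimComms; i++)
      for (int dir = 0; dir < 2; dir++)
        printf("face[%d][%d] at %p\n", i, dir, (void *)(base + offset[i][dir]));

    delete[] base;
    return 0;
  }

In the actual code the offsets come from the derived field's ghost-zone setup; only the base-plus-offset indexing pattern is illustrated here.
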
385 
386  void LatticeField::destroyComms()
387  {
388  if (initComms) {
389 
390  // ensure that all processes bring down their communicators
391  // synchronously so that we don't end up in an undefined state
393  comm_barrier();
394 
395  for (int b=0; b<2; ++b) {
396  for (int i=0; i<nDimComms; i++) {
397  if (mh_recv_fwd[b][i]) comm_free(mh_recv_fwd[b][i]);
398  if (mh_recv_back[b][i]) comm_free(mh_recv_back[b][i]);
399  if (mh_send_fwd[b][i]) comm_free(mh_send_fwd[b][i]);
400  if (mh_send_back[b][i]) comm_free(mh_send_back[b][i]);
401 
402  if (mh_recv_rdma_fwd[b][i]) comm_free(mh_recv_rdma_fwd[b][i]);
403  if (mh_recv_rdma_back[b][i]) comm_free(mh_recv_rdma_back[b][i]);
404  if (mh_send_rdma_fwd[b][i]) comm_free(mh_send_rdma_fwd[b][i]);
405  if (mh_send_rdma_back[b][i]) comm_free(mh_send_rdma_back[b][i]);
406  }
407  } // loop over b
408 
409  // local take down complete - now synchronize to ensure globally complete
411  comm_barrier();
412 
413  initComms = false;
414  }
415 
416  }
417 
418  void LatticeField::createIPCComms() {
419  if ( initIPCComms && !ghost_field_reset ) return;
420 
421  if (!initComms) errorQuda("Can only be called after create comms");
422  if ((!ghost_recv_buffer_d[0] || !ghost_recv_buffer_d[1]) && comm_size() > 1)
423  errorQuda("ghost_field appears not to be allocated");
424 #ifndef NVSHMEM_COMMS
425  // handles for obtained ghost pointers
426  cudaIpcMemHandle_t ipcRemoteGhostDestHandle[2][2][QUDA_MAX_DIM];
427 #endif
428 
429  for (int b=0; b<2; b++) {
430 #ifndef NVSHMEM_COMMS
431  for (int dim=0; dim<4; ++dim) {
432  if (comm_dim(dim)==1) continue;
433  for (int dir=0; dir<2; ++dir) {
434  MsgHandle* sendHandle = nullptr;
435  MsgHandle* receiveHandle = nullptr;
436  int disp = (dir == 1) ? +1 : -1;
437 
438  // first set up receive
439  if (comm_peer2peer_enabled(1-dir,dim)) {
440  receiveHandle = comm_declare_receive_relative(&ipcRemoteGhostDestHandle[b][1-dir][dim],
441  dim, -disp,
442  sizeof(ipcRemoteGhostDestHandle[b][1-dir][dim]));
443  }
444  // now send
445  cudaIpcMemHandle_t ipcLocalGhostDestHandle;
446  if (comm_peer2peer_enabled(dir,dim)) {
447  cudaIpcGetMemHandle(&ipcLocalGhostDestHandle, ghost_recv_buffer_d[b]);
448  sendHandle = comm_declare_send_relative(&ipcLocalGhostDestHandle,
449  dim, disp,
450  sizeof(ipcLocalGhostDestHandle));
451  }
452  if (receiveHandle) comm_start(receiveHandle);
453  if (sendHandle) comm_start(sendHandle);
454 
455  if (receiveHandle) comm_wait(receiveHandle);
456  if (sendHandle) comm_wait(sendHandle);
457 
458  if (sendHandle) comm_free(sendHandle);
459  if (receiveHandle) comm_free(receiveHandle);
460  }
461  }
462 
463  checkCudaError();
464 #endif
465  // open the remote memory handles and set the send ghost pointers
466  for (int dim = 0; dim < 4; ++dim) {
467 #ifndef NVSHMEM_COMMS
468  // TODO: we may be able to force loopback comms to use the IB path here
469  if (comm_dim(dim) == 1) continue;
470 #endif
471  // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this
472  const int num_dir
473  = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2;
474  for (int dir = 0; dir < num_dir; ++dir) {
475 #ifndef NVSHMEM_COMMS
476  if (!comm_peer2peer_enabled(dir, dim)) continue;
477  void **ghostDest = &(ghost_remote_send_buffer_d[b][dim][dir]);
478  cudaIpcOpenMemHandle(ghostDest, ipcRemoteGhostDestHandle[b][dir][dim], cudaIpcMemLazyEnablePeerAccess);
479 #else
480  ghost_remote_send_buffer_d[b][dim][dir]
481  = nvshmem_ptr(static_cast<char *>(ghost_recv_buffer_d[b]), comm_neighbor_rank(dir, dim));
482 #endif
483  }
484  if (num_dir == 1) ghost_remote_send_buffer_d[b][dim][1] = ghost_remote_send_buffer_d[b][dim][0];
485  }
486  } // buffer index
487 
488  checkCudaError();
489 
490  // handles for obtained events
491  cudaIpcEventHandle_t ipcRemoteEventHandle[2][2][QUDA_MAX_DIM];
492 
493  // Note that no b index is necessary here
494  // Now communicate the event handles
495  for (int dim=0; dim<4; ++dim) {
496  if (comm_dim(dim)==1) continue;
497  for (int dir=0; dir<2; ++dir) {
498  for (int b=0; b<2; b++) {
499 
500  MsgHandle* sendHandle = NULL;
501  MsgHandle* receiveHandle = NULL;
502  int disp = (dir == 1) ? +1 : -1;
503 
504  // first set up receive
505  if (comm_peer2peer_enabled(1-dir,dim)) {
506  receiveHandle = comm_declare_receive_relative(&ipcRemoteEventHandle[b][1-dir][dim], dim, -disp,
507  sizeof(ipcRemoteEventHandle[b][1-dir][dim]));
508  }
509 
510  // now send
511  cudaIpcEventHandle_t ipcLocalEventHandle;
512  if (comm_peer2peer_enabled(dir,dim)) {
513  cudaEventCreate(&ipcCopyEvent[b][dir][dim], cudaEventDisableTiming | cudaEventInterprocess);
514  cudaIpcGetEventHandle(&ipcLocalEventHandle, ipcCopyEvent[b][dir][dim]);
515 
516  sendHandle = comm_declare_send_relative(&ipcLocalEventHandle, dim, disp,
517  sizeof(ipcLocalEventHandle));
518  }
519 
520  if (receiveHandle) comm_start(receiveHandle);
521  if (sendHandle) comm_start(sendHandle);
522 
523  if (receiveHandle) comm_wait(receiveHandle);
524  if (sendHandle) comm_wait(sendHandle);
525 
526  if (sendHandle) comm_free(sendHandle);
527  if (receiveHandle) comm_free(receiveHandle);
528 
529  } // buffer index
530  }
531  }
532 
533  checkCudaError();
534 
535  for (int dim=0; dim<4; ++dim) {
536  if (comm_dim(dim)==1) continue;
537  for (int dir=0; dir<2; ++dir) {
538  if (!comm_peer2peer_enabled(dir,dim)) continue;
539  for (int b=0; b<2; b++) {
540  cudaIpcOpenEventHandle(&(ipcRemoteCopyEvent[b][dir][dim]), ipcRemoteEventHandle[b][dir][dim]);
541  }
542  }
543  }
544 
545  // Create message handles for IPC synchronization
546  for (int dim=0; dim<4; ++dim) {
547  if (comm_dim(dim)==1) continue;
548  if (comm_peer2peer_enabled(1,dim)) {
549  for (int b=0; b<2; b++) {
550  // send to processor in forward direction
551  mh_send_p2p_fwd[b][dim] = comm_declare_send_relative(&buffer_send_p2p_fwd[b][dim], dim, +1, sizeof(int));
552  // receive from processor in forward direction
553  mh_recv_p2p_fwd[b][dim] = comm_declare_receive_relative(&buffer_recv_p2p_fwd[b][dim], dim, +1, sizeof(int));
554  }
555  }
556 
557  if (comm_peer2peer_enabled(0,dim)) {
558  for (int b=0; b<2; b++) {
559  // send to processor in backward direction
560  mh_send_p2p_back[b][dim] = comm_declare_send_relative(&buffer_send_p2p_back[b][dim], dim, -1, sizeof(int));
561  // receive from processor in backward direction
562  mh_recv_p2p_back[b][dim] = comm_declare_receive_relative(&buffer_recv_p2p_back[b][dim], dim, -1, sizeof(int));
563  }
564  }
565  }
566  checkCudaError();
567 
568  initIPCComms = true;
569  ghost_field_reset = false;
570  }
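
createIPCComms above exchanges cudaIpcMemHandle_t and cudaIpcEventHandle_t objects between neighboring ranks so that peer processes can write directly into each other's ghost receive buffers. A minimal single-pair sketch of the underlying CUDA IPC calls; the exchange_handle() helper is a hypothetical stand-in for the comm_declare/comm_start/comm_wait sequence used above:

  #include <cuda_runtime.h>

  // hypothetical helper: ship 'bytes' of 'data' to the neighbor and receive theirs in return
  void exchange_handle(void *data, size_t bytes);

  void share_ghost_buffer(void *my_ghost_recv_buffer_d, void **neighbor_ghost_d)
  {
    // 1. create an IPC handle for our own device allocation
    cudaIpcMemHandle_t handle;
    cudaIpcGetMemHandle(&handle, my_ghost_recv_buffer_d);

    // 2. swap handles with the neighboring process (MPI, sockets, ...)
    exchange_handle(&handle, sizeof(handle));

    // 3. map the neighbor's allocation into our own address space
    cudaIpcOpenMemHandle(neighbor_ghost_d, handle, cudaIpcMemLazyEnablePeerAccess);

    // the pointer stays valid until cudaIpcCloseMemHandle(*neighbor_ghost_d)
  }

The event-handle exchange further down follows the same pattern with cudaIpcGetEventHandle / cudaIpcOpenEventHandle.
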
571 
572  void LatticeField::destroyIPCComms() {
573 
574  if (!initIPCComms) return;
575 
576  // ensure that all processes bring down their communicators
577  // synchronously so that we don't end up in an undefined state
579  comm_barrier();
580 
581  for (int dim=0; dim<4; ++dim) {
582 
583  if (comm_dim(dim)==1) continue;
584 #ifndef NVSHMEM_COMMS
585  const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0,dim) && comm_peer2peer_enabled(1,dim)) ? 1 : 2;
586 #endif
587  for (int b=0; b<2; b++) {
588  if (comm_peer2peer_enabled(1,dim)) {
589  if (mh_send_p2p_fwd[b][dim] || mh_recv_p2p_fwd[b][dim]) {
590  cudaEventDestroy(ipcCopyEvent[b][1][dim]);
591  // only close this handle if it doesn't alias the back ghost
592 
593 #ifndef NVSHMEM_COMMS
594  if (num_dir == 2) cudaIpcCloseMemHandle(ghost_remote_send_buffer_d[b][dim][1]);
595 #endif
596  }
599  }
600 
601  if (comm_peer2peer_enabled(0,dim)) {
602  if (mh_send_p2p_back[b][dim] || mh_recv_p2p_back[b][dim]) {
603  cudaEventDestroy(ipcCopyEvent[b][0][dim]);
604 
605 #ifndef NVSHMEM_COMMS
606  cudaIpcCloseMemHandle(ghost_remote_send_buffer_d[b][dim][0]);
607 #endif
608  }
611  }
612  } // buffer
613  } // iterate over dim
614 
615  checkCudaError();
616 
617  // local take down complete - now synchronize to ensure globally complete
619  comm_barrier();
620 
621  initIPCComms = false;
622  }
623 
624  bool LatticeField::ipcCopyComplete(int dir, int dim)
625  {
626  return (cudaSuccess == cudaEventQuery(ipcCopyEvent[bufferIndex][dir][dim]) ? true : false);
627  }
628 
629  bool LatticeField::ipcRemoteCopyComplete(int dir, int dim)
630  {
631  return (cudaSuccess == cudaEventQuery(ipcRemoteCopyEvent[bufferIndex][dir][dim]) ? true : false);
632  }
633 
634  const cudaEvent_t& LatticeField::getIPCCopyEvent(int dir, int dim) const {
635  return ipcCopyEvent[bufferIndex][dir][dim];
636  }
637 
638  const cudaEvent_t& LatticeField::getIPCRemoteCopyEvent(int dir, int dim) const {
639  return ipcRemoteCopyEvent[bufferIndex][dir][dim];
640  }
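
ipcCopyComplete and ipcRemoteCopyComplete implement a non-blocking progress check: cudaEventQuery returns cudaSuccess once all work recorded before the event has finished, and cudaErrorNotReady while it is still in flight. A small standalone illustration of that polling pattern (not QUDA code):

  #include <cuda_runtime.h>

  // returns true once the work recorded behind 'ev' has completed on the device
  bool copy_complete(cudaEvent_t ev) { return cudaEventQuery(ev) == cudaSuccess; }

  int main()
  {
    cudaEvent_t ev;
    cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

    // enqueue some asynchronous work, then record the event behind it
    void *buf;
    cudaMalloc(&buf, 1 << 20);
    cudaMemsetAsync(buf, 0, 1 << 20);
    cudaEventRecord(ev);

    while (!copy_complete(ev)) { /* overlap other work here instead of spinning */ }

    cudaFree(buf);
    cudaEventDestroy(ev);
    return 0;
  }
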
641 
642  void LatticeField::setTuningString() {
643  char vol_tmp[TuneKey::volume_n];
644  int check = snprintf(vol_string, TuneKey::volume_n, "%d", x[0]);
645  if (check < 0 || check >= TuneKey::volume_n) errorQuda("Error writing volume string");
646  for (int d=1; d<nDim; d++) {
647  strcpy(vol_tmp, vol_string);
648  check = snprintf(vol_string, TuneKey::volume_n, "%sx%d", vol_tmp, x[d]);
649  if (check < 0 || check >= TuneKey::volume_n) errorQuda("Error writing volume string");
650  }
651  }
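
For example, a 16x16x16x32 local lattice yields the tuning volume string "16x16x16x32". A standalone sketch of the same snprintf loop, with example sizes and a fixed buffer length standing in for TuneKey::volume_n:

  #include <cstdio>
  #include <cstring>

  int main()
  {
    const int nDim = 4;
    int x[nDim] = {16, 16, 16, 32};

    char vol_string[32], vol_tmp[32];
    snprintf(vol_string, sizeof(vol_string), "%d", x[0]);
    for (int d = 1; d < nDim; d++) {
      strcpy(vol_tmp, vol_string);  // copy first, since snprintf cannot read its own destination
      snprintf(vol_string, sizeof(vol_string), "%sx%d", vol_tmp, x[d]);
    }
    printf("%s\n", vol_string); // prints 16x16x16x32
    return 0;
  }
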
652 
653  void LatticeField::checkField(const LatticeField &a) const {
654  if (a.nDim != nDim) errorQuda("nDim does not match %d %d", nDim, a.nDim);
656  // if the source is extended but I am not, then we need to compare their interior volume to my volume
657  size_t a_volume_interior = 1;
658  for (int i=0; i<nDim; i++) {
659  if (a.x[i]-2*a.r[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i]-2*a.r[i]);
660  a_volume_interior *= a.x[i] - 2*a.r[i];
661  }
662  if (a_volume_interior != volume) errorQuda("Interior volume does not match %lu %lu", volume, a_volume_interior);
664  // if I am extended but the source is not, then we need to compare my interior volume to their volume
665  size_t this_volume_interior = 1;
666  for (int i=0; i<nDim; i++) {
667  if (x[i]-2*r[i] != a.x[i]) errorQuda("x[%d] does not match %d %d", i, x[i]-2*r[i], a.x[i]);
668  this_volume_interior *= x[i] - 2*r[i];
669  }
670  if (this_volume_interior != a.volume)
671  errorQuda("Interior volume does not match %lu %lu", this_volume_interior, a.volume);
672  } else {
673  if (a.volume != volume) errorQuda("Volume does not match %lu %lu", volume, a.volume);
674  if (a.volumeCB != volumeCB) errorQuda("VolumeCB does not match %lu %lu", volumeCB, a.volumeCB);
675  for (int i=0; i<nDim; i++) {
676  if (a.x[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i]);
677  if (a.surface[i] != surface[i]) errorQuda("surface[%d] does not match %d %d", i, surface[i], a.surface[i]);
678  if (a.surfaceCB[i] != surfaceCB[i]) errorQuda("surfaceCB[%d] does not match %d %d", i, surfaceCB[i], a.surfaceCB[i]);
679  }
680  }
681  }
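
As a concrete example of the extended-field branch: an extended field with x = {20, 20, 20, 36} and halo widths r = {2, 2, 2, 2} has interior extents 16, 16, 16, 32, so it is compatible with a regular field of x = {16, 16, 16, 32}; the interior volume 16*16*16*32 = 131072 must equal the regular field's volume. A tiny sketch of that check (example numbers only, not QUDA code):

  #include <cstdio>

  int main()
  {
    const int nDim = 4;
    int x_ext[nDim] = {20, 20, 20, 36}, r[nDim] = {2, 2, 2, 2}; // extended field
    int x[nDim] = {16, 16, 16, 32};                             // regular field

    size_t interior = 1, volume = 1;
    for (int i = 0; i < nDim; i++) {
      if (x_ext[i] - 2 * r[i] != x[i]) { printf("x[%d] mismatch\n", i); return 1; }
      interior *= x_ext[i] - 2 * r[i];
      volume *= x[i];
    }
    printf("interior = %zu, volume = %zu, match = %d\n", interior, volume, interior == volume);
    return 0;
  }
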
682 
683  QudaFieldLocation LatticeField::Location() const {
684  QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION;
685  if (typeid(*this)==typeid(cudaCloverField) ||
686  typeid(*this)==typeid(cudaColorSpinorField) ||
687  typeid(*this)==typeid(cudaGaugeField)) {
688  location = QUDA_CUDA_FIELD_LOCATION;
689  } else if (typeid(*this)==typeid(cpuCloverField) ||
690  typeid(*this)==typeid(cpuColorSpinorField) ||
691  typeid(*this)==typeid(cpuGaugeField)) {
692  location = QUDA_CPU_FIELD_LOCATION;
693  } else {
694  errorQuda("Unknown field %s, so cannot determine location", typeid(*this).name());
695  }
696  return location;
697  }
698 
699  void LatticeField::read(char *filename) {
700  errorQuda("Not implemented");
701  }
702 
703  void LatticeField::write(char *filename) {
704  errorQuda("Not implemented");
705  }
706 
707  int LatticeField::Nvec() const {
708  if (typeid(*this) == typeid(const cudaColorSpinorField)) {
709  const ColorSpinorField &csField = static_cast<const ColorSpinorField&>(*this);
710  if (csField.FieldOrder() == 2 || csField.FieldOrder() == 4)
711  return static_cast<int>(csField.FieldOrder());
712  } else if (typeid(*this) == typeid(const cudaGaugeField)) {
713  const GaugeField &gField = static_cast<const GaugeField&>(*this);
714  if (gField.Order() == 2 || gField.Order() == 4)
715  return static_cast<int>(gField.Order());
716  } else if (typeid(*this) == typeid(const cudaCloverField)) {
717  const CloverField &cField = static_cast<const CloverField&>(*this);
718  if (cField.Order() == 2 || cField.Order() == 4)
719  return static_cast<int>(cField.Order());
720  }
721 
722  errorQuda("Unsupported field type");
723  return -1;
724  }
725 
726  // This doesn't really live here, but is fine for the moment
727  std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param)
728  {
729  output << "nDim = " << param.nDim << std::endl;
730  for (int i=0; i<param.nDim; i++) {
731  output << "x[" << i << "] = " << param.x[i] << std::endl;
732  }
733  output << "pad = " << param.pad << std::endl;
734  output << "precision = " << param.Precision() << std::endl;
735  output << "ghost_precision = " << param.GhostPrecision() << std::endl;
736  output << "scale = " << param.scale << std::endl;
737 
738  output << "ghostExchange = " << param.ghostExchange << std::endl;
739  for (int i=0; i<param.nDim; i++) {
740  output << "r[" << i << "] = " << param.r[i] << std::endl;
741  }
742 
743  return output; // for multiple << operators.
744  }
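
In practice this operator is used to dump a parameter set while debugging field construction; a short usage sketch (assumes the QUDA headers and <iostream>, with the param fields filled in by the caller):

  quda::LatticeFieldParam param;
  // ... set param.nDim, param.x[], precision, ghostExchange, etc. ...
  std::cout << param;
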
745 
746  static QudaFieldLocation reorder_location_ = QUDA_CUDA_FIELD_LOCATION;
747 
748  QudaFieldLocation reorder_location() { return reorder_location_; }
749  void reorder_location_set(QudaFieldLocation _reorder_location) { reorder_location_ = _reorder_location; }
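
reorder_location() and reorder_location_set() act as a process-wide switch for whether field data is reordered on the CPU or the GPU when copying between orders. A minimal usage sketch (assumes the QUDA headers):

  // perform data reordering on the host rather than the device
  quda::reorder_location_set(QUDA_CPU_FIELD_LOCATION);
  if (quda::reorder_location() == QUDA_CPU_FIELD_LOCATION) {
    // take the host reordering path
  }
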
750 
751 } // namespace quda