lattice_field.cpp
#include <typeinfo>
#include <quda_internal.h>
#include <lattice_field.h>
#include <color_spinor_field.h>
#include <gauge_field.h>
#include <clover_field.h>

namespace quda {

  bool LatticeField::initIPCComms = false;

  int LatticeField::buffer_send_p2p_fwd[2][QUDA_MAX_DIM];
  int LatticeField::buffer_recv_p2p_fwd[2][QUDA_MAX_DIM];
  int LatticeField::buffer_send_p2p_back[2][QUDA_MAX_DIM];
  int LatticeField::buffer_recv_p2p_back[2][QUDA_MAX_DIM];

  MsgHandle *LatticeField::mh_send_p2p_fwd[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_send_p2p_back[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_recv_p2p_fwd[2][QUDA_MAX_DIM];
  MsgHandle *LatticeField::mh_recv_p2p_back[2][QUDA_MAX_DIM];

  cudaEvent_t LatticeField::ipcCopyEvent[2][2][QUDA_MAX_DIM];
  cudaEvent_t LatticeField::ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM];

  void *LatticeField::ghost_pinned_send_buffer_h[2] = {nullptr, nullptr};
  void *LatticeField::ghost_pinned_send_buffer_hd[2] = {nullptr, nullptr};

  void *LatticeField::ghost_pinned_recv_buffer_h[2] = {nullptr, nullptr};
  void *LatticeField::ghost_pinned_recv_buffer_hd[2] = {nullptr, nullptr};

  // gpu ghost receive buffer
  void *LatticeField::ghost_recv_buffer_d[2] = {nullptr, nullptr};

  // gpu ghost send buffer
  void *LatticeField::ghost_send_buffer_d[2] = {nullptr, nullptr};

  // remote ghost pointers used for peer-to-peer sends
  void *LatticeField::ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2];

  bool LatticeField::ghost_field_reset = false;

  int LatticeField::bufferIndex = 0;

  bool LatticeField::initGhostFaceBuffer = false;

  size_t LatticeField::ghostFaceBytes = 0;

  LatticeFieldParam::LatticeFieldParam(const LatticeField &field)
    : precision(field.Precision()), ghost_precision(field.Precision()),
      nDim(field.Ndim()), pad(field.Pad()),
      siteSubset(field.SiteSubset()), mem_type(field.MemType()),
      ghostExchange(field.GhostExchange()), scale(field.Scale())
  {
    for (int dir=0; dir<nDim; ++dir) {
      x[dir] = field.X()[dir];
      r[dir] = field.R()[dir];
    }
  }

  LatticeField::LatticeField(const LatticeFieldParam &param) :
    volume(1),
    pad(param.pad),
    total_bytes(0),
    nDim(param.nDim),
    precision(param.Precision()),
    ghost_precision(param.GhostPrecision()),
    ghost_precision_reset(false),
    scale(param.scale),
    siteSubset(param.siteSubset),
    ghostExchange(param.ghostExchange),
    ghost_bytes(0),
    ghost_bytes_old(0),
    ghost_face_bytes {},
    ghostOffset(),
    ghostNormOffset(),
    my_face_h {},
    my_face_hd {},
    my_face_d {},
    from_face_h {},
    from_face_hd {},
    from_face_d {},
    initComms(false),
    mem_type(param.mem_type),
    backup_h(nullptr),
    backup_norm_h(nullptr),
    backed_up(false)
  {
    precisionCheck();

    for (int dir = 0; dir < 2; dir++) { // XLC cannot do multi-dimensional array initialization
      for (int dim = 0; dim < QUDA_MAX_DIM; dim++) {

        for (int b = 0; b < 2; b++) {
          my_face_dim_dir_d[b][dim][dir] = nullptr;
          my_face_dim_dir_hd[b][dim][dir] = nullptr;
          my_face_dim_dir_h[b][dim][dir] = nullptr;

          from_face_dim_dir_d[b][dim][dir] = nullptr;
          from_face_dim_dir_hd[b][dim][dir] = nullptr;
          from_face_dim_dir_h[b][dim][dir] = nullptr;
        }

        mh_recv_fwd[dir][dim] = nullptr;
        mh_recv_back[dir][dim] = nullptr;
        mh_send_fwd[dir][dim] = nullptr;
        mh_send_back[dir][dim] = nullptr;

        mh_recv_rdma_fwd[dir][dim] = nullptr;
        mh_recv_rdma_back[dir][dim] = nullptr;
        mh_send_rdma_fwd[dir][dim] = nullptr;
        mh_send_rdma_back[dir][dim] = nullptr;
      }
    }

    for (int i=0; i<nDim; i++) {
      x[i] = param.x[i];
      r[i] = ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED ? param.r[i] : 0;
      volume *= param.x[i];
      surface[i] = 1;
      for (int j=0; j<nDim; j++) {
        if (i==j) continue;
        surface[i] *= param.x[j];
      }
    }

    if (siteSubset == QUDA_INVALID_SITE_SUBSET) errorQuda("siteSubset is not set");
    volumeCB = siteSubset == QUDA_FULL_SITE_SUBSET ? volume / 2 : volume;
    stride = volumeCB + pad;

    // for parity fields the factor of half is present for all surface dimensions except x, so add it manually
    for (int i=0; i<nDim; i++)
      surfaceCB[i] = (siteSubset == QUDA_FULL_SITE_SUBSET || i==0) ? surface[i] / 2 : surface[i];

    // for 5-dimensional fields, we only communicate in the space-time dimensions
    nDimComms = nDim == 5 ? 4 : nDim;

    switch (precision) {
    case QUDA_DOUBLE_PRECISION:
    case QUDA_SINGLE_PRECISION:
    case QUDA_HALF_PRECISION:
    case QUDA_QUARTER_PRECISION:
      break;
    default:
      errorQuda("Unknown precision %d", precision);
    }

    setTuningString();
  }

  LatticeField::LatticeField(const LatticeField &field) :
    volume(1),
    pad(field.pad),
    total_bytes(0),
    nDim(field.nDim),
    precision(field.precision),
    ghost_precision(field.ghost_precision),
    ghost_precision_reset(false),
    scale(field.scale),
    siteSubset(field.siteSubset),
    ghostExchange(field.ghostExchange),
    ghost_bytes(0),
    ghost_bytes_old(0),
    ghost_face_bytes {},
    ghostOffset(),
    ghostNormOffset(),
    my_face_h {},
    my_face_hd {},
    my_face_d {},
    from_face_h {},
    from_face_hd {},
    from_face_d {},
    initComms(false),
    mem_type(field.mem_type),
    backup_h(nullptr),
    backup_norm_h(nullptr),
    backed_up(false)
  {
    precisionCheck();

    for (int dir = 0; dir < 2; dir++) { // XLC cannot do multi-dimensional array initialization
      for (int dim = 0; dim < QUDA_MAX_DIM; dim++) {
        mh_recv_fwd[dir][dim] = nullptr;
        mh_recv_back[dir][dim] = nullptr;
        mh_send_fwd[dir][dim] = nullptr;
        mh_send_back[dir][dim] = nullptr;

        mh_recv_rdma_fwd[dir][dim] = nullptr;
        mh_recv_rdma_back[dir][dim] = nullptr;
        mh_send_rdma_fwd[dir][dim] = nullptr;
        mh_send_rdma_back[dir][dim] = nullptr;
      }
    }

    for (int i=0; i<nDim; i++) {
      x[i] = field.x[i];
      r[i] = ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED ? field.r[i] : 0;
      volume *= field.x[i];
      surface[i] = 1;
      for (int j=0; j<nDim; j++) {
        if (i==j) continue;
        surface[i] *= field.x[j];
      }
    }

    if (siteSubset == QUDA_INVALID_SITE_SUBSET) errorQuda("siteSubset is not set");
    volumeCB = siteSubset == QUDA_FULL_SITE_SUBSET ? volume / 2 : volume;
    stride = volumeCB + pad;

    // for parity fields the factor of half is present for all surface dimensions except x, so add it manually
    for (int i=0; i<nDim; i++)
      surfaceCB[i] = (siteSubset == QUDA_FULL_SITE_SUBSET || i==0) ? surface[i] / 2 : surface[i];

    // for 5-dimensional fields, we only communicate in the space-time dimensions
    nDimComms = nDim == 5 ? 4 : nDim;

    setTuningString();
  }
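
  // Illustrative sketch (not part of the original file): how a caller might fill
  // in a LatticeFieldParam before constructing a field.  Only members that the
  // constructors above actually read (nDim, x[], pad, siteSubset, ghostExchange,
  // mem_type, scale) are used; the 16^3 x 32 extents are made-up example values,
  // and the precisions are assumed to be set through the param's own interface
  // (param.Precision() / param.GhostPrecision() are what is read here).
  //
  //   LatticeFieldParam param;
  //   param.nDim = 4;
  //   int dims[] = {16, 16, 16, 32};
  //   for (int d = 0; d < param.nDim; d++) param.x[d] = dims[d];
  //   param.pad = 0;
  //   param.siteSubset = QUDA_FULL_SITE_SUBSET;       // must be set or the constructor errors out
  //   param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;  // not extended, so r[] is zeroed
  //   param.mem_type = QUDA_MEMORY_DEVICE;
  //
  // From these values the constructor derives volume = 16*16*16*32 = 131072,
  // volumeCB = volume/2 = 65536 (full site subset), stride = volumeCB + pad,
  // and surface[i] = the product of the three other extents.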

  LatticeField::~LatticeField() { }

  void LatticeField::allocateGhostBuffer(size_t ghost_bytes) const
  {
    // only allocate if not already allocated or buffer required is bigger than previously
    if (!initGhostFaceBuffer || ghost_bytes > ghostFaceBytes) {

      if (initGhostFaceBuffer) {
        if (ghostFaceBytes) {
          // remove potential for inter-process race conditions
          // ensures that all outstanding communication is complete
          // before we free any comms buffers
          qudaDeviceSynchronize();
          comm_barrier();
          for (int b=0; b<2; b++) {
            device_pinned_free(ghost_recv_buffer_d[b]);
            device_pinned_free(ghost_send_buffer_d[b]);
            host_free(ghost_pinned_send_buffer_h[b]);
            host_free(ghost_pinned_recv_buffer_h[b]);
          }
        }
      }

      if (ghost_bytes > 0) {
        for (int b = 0; b < 2; ++b) {
          // gpu receive buffer (use pinned allocator to avoid this being redirected, e.g., by QDPJIT)
          ghost_recv_buffer_d[b] = device_pinned_malloc(ghost_bytes);

          // gpu send buffer (use pinned allocator to avoid this being redirected, e.g., by QDPJIT)
          ghost_send_buffer_d[b] = device_pinned_malloc(ghost_bytes);

          // pinned buffer used for sending
          ghost_pinned_send_buffer_h[b] = mapped_malloc(ghost_bytes);

          // set the matching device-mapped pointer
          cudaHostGetDevicePointer(&ghost_pinned_send_buffer_hd[b], ghost_pinned_send_buffer_h[b], 0);

          // pinned buffer used for receiving
          ghost_pinned_recv_buffer_h[b] = mapped_malloc(ghost_bytes);

          // set the matching device-mapped pointer
          cudaHostGetDevicePointer(&ghost_pinned_recv_buffer_hd[b], ghost_pinned_recv_buffer_h[b], 0);
        }

        initGhostFaceBuffer = true;
        ghostFaceBytes = ghost_bytes;
      }

      LatticeField::ghost_field_reset = true; // this signals that we must reset the IPC comms
    }

  }
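
  // A minimal sketch (not from the original source) of the allocation policy
  // implemented above: the ghost buffers are static, shared by all fields, and
  // only (re)allocated when a request exceeds the largest size seen so far
  // (ghostFaceBytes).  Hypothetical call sequence:
  //
  //   a.allocateGhostBuffer(1 << 20);   // first call: allocates 1 MiB buffers
  //   b.allocateGhostBuffer(1 << 18);   // smaller request: existing buffers are reused
  //   c.allocateGhostBuffer(1 << 22);   // larger request: synchronize, free, reallocate,
  //                                     // and flag ghost_field_reset so that any IPC
  //                                     // handles are recreated on the next createIPCComms()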

  void LatticeField::freeGhostBuffer(void)
  {
    destroyIPCComms();

    if (!initGhostFaceBuffer) return;

    for (int b=0; b<2; b++) {
      // free receive buffer
      device_pinned_free(ghost_recv_buffer_d[b]);
      ghost_recv_buffer_d[b] = nullptr;

      // free send buffer
      device_pinned_free(ghost_send_buffer_d[b]);
      ghost_send_buffer_d[b] = nullptr;

      // free pinned send memory buffer
      host_free(ghost_pinned_send_buffer_h[b]);

      // free pinned receive memory buffer
      host_free(ghost_pinned_recv_buffer_h[b]);

      ghost_pinned_recv_buffer_h[b] = nullptr;
      ghost_pinned_recv_buffer_hd[b] = nullptr;
      ghost_pinned_send_buffer_h[b] = nullptr;
      ghost_pinned_send_buffer_hd[b] = nullptr;
    }
    initGhostFaceBuffer = false;
  }

  void LatticeField::createComms(bool no_comms_fill, bool bidir)
  {
    destroyComms(); // if we are requesting a new number of faces destroy and start over

    // before allocating local comm handles, synchronize since the
    // comms buffers are static so remove potential for interfering
    // with any outstanding exchanges to the same buffers
    qudaDeviceSynchronize();
    comm_barrier();

    // initialize the ghost pinned buffers
    for (int b=0; b<2; b++) {
      my_face_h[b] = ghost_pinned_send_buffer_h[b];
      my_face_hd[b] = ghost_pinned_send_buffer_hd[b];
      my_face_d[b] = ghost_send_buffer_d[b];
      from_face_h[b] = ghost_pinned_recv_buffer_h[b];
      from_face_hd[b] = ghost_pinned_recv_buffer_hd[b];
      from_face_d[b] = ghost_recv_buffer_d[b];
    }

    // initialize ghost send pointers
    size_t offset = 0;
    for (int i=0; i<nDimComms; i++) {
      if (!commDimPartitioned(i) && no_comms_fill==false) continue;

      for (int b=0; b<2; ++b) {
        my_face_dim_dir_h[b][i][0] = static_cast<char *>(my_face_h[b]) + offset;
        from_face_dim_dir_h[b][i][0] = static_cast<char *>(from_face_h[b]) + offset;

        my_face_dim_dir_hd[b][i][0] = static_cast<char *>(my_face_hd[b]) + offset;
        from_face_dim_dir_hd[b][i][0] = static_cast<char *>(from_face_hd[b]) + offset;

        my_face_dim_dir_d[b][i][0] = static_cast<char *>(my_face_d[b]) + offset;
        from_face_dim_dir_d[b][i][0] = static_cast<char *>(from_face_d[b]) + ghostOffset[i][0] * ghost_precision;
      } // loop over b

      // if not bidir then forwards and backwards will alias
      if (bidir) offset += ghost_face_bytes[i];

      for (int b=0; b<2; ++b) {
        my_face_dim_dir_h[b][i][1] = static_cast<char *>(my_face_h[b]) + offset;
        from_face_dim_dir_h[b][i][1] = static_cast<char *>(from_face_h[b]) + offset;

        my_face_dim_dir_hd[b][i][1] = static_cast<char *>(my_face_hd[b]) + offset;
        from_face_dim_dir_hd[b][i][1] = static_cast<char *>(from_face_hd[b]) + offset;

        my_face_dim_dir_d[b][i][1] = static_cast<char *>(my_face_d[b]) + offset;
        from_face_dim_dir_d[b][i][1] = static_cast<char *>(from_face_d[b]) + ghostOffset[i][1] * ghost_precision;
      } // loop over b
      offset += ghost_face_bytes[i];

    } // loop over dimension

    bool gdr = comm_gdr_enabled(); // only allocate rdma buffers if GDR enabled

    // initialize the message handlers
    for (int i=0; i<nDimComms; i++) {
      if (!commDimPartitioned(i)) continue;

      for (int b=0; b<2; ++b) {
        mh_send_fwd[b][i] = comm_declare_send_relative(my_face_dim_dir_h[b][i][1], i, +1, ghost_face_bytes[i]);
        mh_send_back[b][i] = comm_declare_send_relative(my_face_dim_dir_h[b][i][0], i, -1, ghost_face_bytes[i]);

        mh_recv_fwd[b][i] = comm_declare_receive_relative(from_face_dim_dir_h[b][i][1], i, +1, ghost_face_bytes[i]);
        mh_recv_back[b][i] = comm_declare_receive_relative(from_face_dim_dir_h[b][i][0], i, -1, ghost_face_bytes[i]);

        mh_send_rdma_fwd[b][i] = gdr ? comm_declare_send_relative(my_face_dim_dir_d[b][i][1], i, +1, ghost_face_bytes[i]) : nullptr;
        mh_send_rdma_back[b][i] = gdr ? comm_declare_send_relative(my_face_dim_dir_d[b][i][0], i, -1, ghost_face_bytes[i]) : nullptr;

        mh_recv_rdma_fwd[b][i] = gdr ? comm_declare_receive_relative(from_face_dim_dir_d[b][i][1], i, +1, ghost_face_bytes[i]) : nullptr;
        mh_recv_rdma_back[b][i] = gdr ? comm_declare_receive_relative(from_face_dim_dir_d[b][i][0], i, -1, ghost_face_bytes[i]) : nullptr;
      } // loop over b

    } // loop over dimension

    initComms = true;
    checkCudaError();
  }
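
  // Sketch of the host-side ghost layout set up above (illustrative sizes).
  // With bidir == true each partitioned dimension gets two disjoint slots, one
  // per direction; with bidir == false the forward slot aliases the backward one:
  //
  //   bidir == true,  ghost_face_bytes = {1024, 512}:
  //     dim 0: back slot at offset 0,    fwd slot at offset 1024
  //     dim 1: back slot at offset 2048, fwd slot at offset 2560
  //
  //   bidir == false, same sizes:
  //     dim 0: back and fwd slots both at offset 0
  //     dim 1: back and fwd slots both at offset 1024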

  void LatticeField::destroyComms()
  {
    if (initComms) {

      // ensure that all processes bring down their communicators
      // synchronously so that we don't end up in an undefined state
      qudaDeviceSynchronize();
      comm_barrier();

      for (int b=0; b<2; ++b) {
        for (int i=0; i<nDimComms; i++) {
          if (mh_recv_fwd[b][i]) comm_free(mh_recv_fwd[b][i]);
          if (mh_recv_back[b][i]) comm_free(mh_recv_back[b][i]);
          if (mh_send_fwd[b][i]) comm_free(mh_send_fwd[b][i]);
          if (mh_send_back[b][i]) comm_free(mh_send_back[b][i]);

          if (mh_recv_rdma_fwd[b][i]) comm_free(mh_recv_rdma_fwd[b][i]);
          if (mh_recv_rdma_back[b][i]) comm_free(mh_recv_rdma_back[b][i]);
          if (mh_send_rdma_fwd[b][i]) comm_free(mh_send_rdma_fwd[b][i]);
          if (mh_send_rdma_back[b][i]) comm_free(mh_send_rdma_back[b][i]);
        }
      } // loop over b

      // local take down complete - now synchronize to ensure globally complete
      qudaDeviceSynchronize();
      comm_barrier();

      initComms = false;
      checkCudaError();
    }

  }

  void LatticeField::createIPCComms() {
    if (initIPCComms && !ghost_field_reset) return;

    if (!initComms) errorQuda("Can only be called after create comms");
    if ((!ghost_recv_buffer_d[0] || !ghost_recv_buffer_d[1]) && comm_size() > 1)
      errorQuda("ghost_field appears not to be allocated");

    // handles for obtained ghost pointers
    cudaIpcMemHandle_t ipcRemoteGhostDestHandle[2][2][QUDA_MAX_DIM];

    for (int b=0; b<2; b++) {
      for (int dim=0; dim<4; ++dim) {
        if (comm_dim(dim)==1) continue;
        for (int dir=0; dir<2; ++dir) {
          MsgHandle* sendHandle = nullptr;
          MsgHandle* receiveHandle = nullptr;
          int disp = (dir == 1) ? +1 : -1;

          // first set up receive
          if (comm_peer2peer_enabled(1-dir,dim)) {
            receiveHandle = comm_declare_receive_relative(&ipcRemoteGhostDestHandle[b][1-dir][dim],
                                                          dim, -disp,
                                                          sizeof(ipcRemoteGhostDestHandle[b][1-dir][dim]));
          }
          // now send
          cudaIpcMemHandle_t ipcLocalGhostDestHandle;
          if (comm_peer2peer_enabled(dir,dim)) {
            cudaIpcGetMemHandle(&ipcLocalGhostDestHandle, ghost_recv_buffer_d[b]);
            sendHandle = comm_declare_send_relative(&ipcLocalGhostDestHandle,
                                                    dim, disp,
                                                    sizeof(ipcLocalGhostDestHandle));
          }
          if (receiveHandle) comm_start(receiveHandle);
          if (sendHandle) comm_start(sendHandle);

          if (receiveHandle) comm_wait(receiveHandle);
          if (sendHandle) comm_wait(sendHandle);

          if (sendHandle) comm_free(sendHandle);
          if (receiveHandle) comm_free(receiveHandle);
        }
      }

      checkCudaError();

      // open the remote memory handles and set the send ghost pointers
      for (int dim=0; dim<4; ++dim) {
        if (comm_dim(dim)==1) continue;
        // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this
        const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0,dim) && comm_peer2peer_enabled(1,dim)) ? 1 : 2;
        for (int dir=0; dir<num_dir; ++dir) {
          if (!comm_peer2peer_enabled(dir,dim)) continue;
          void **ghostDest = &(ghost_remote_send_buffer_d[b][dim][dir]);
          cudaIpcOpenMemHandle(ghostDest, ipcRemoteGhostDestHandle[b][dir][dim],
                               cudaIpcMemLazyEnablePeerAccess);
        }
        if (num_dir == 1) ghost_remote_send_buffer_d[b][dim][1] = ghost_remote_send_buffer_d[b][dim][0];
      }
    } // buffer index

    checkCudaError();

    // handles for obtained events
    cudaIpcEventHandle_t ipcRemoteEventHandle[2][2][QUDA_MAX_DIM];

    // Note that no b index is necessary here
    // Now communicate the event handles
    for (int dim=0; dim<4; ++dim) {
      if (comm_dim(dim)==1) continue;
      for (int dir=0; dir<2; ++dir) {
        for (int b=0; b<2; b++) {

          MsgHandle* sendHandle = nullptr;
          MsgHandle* receiveHandle = nullptr;
          int disp = (dir == 1) ? +1 : -1;

          // first set up receive
          if (comm_peer2peer_enabled(1-dir,dim)) {
            receiveHandle = comm_declare_receive_relative(&ipcRemoteEventHandle[b][1-dir][dim], dim, -disp,
                                                          sizeof(ipcRemoteEventHandle[b][1-dir][dim]));
          }

          // now send
          cudaIpcEventHandle_t ipcLocalEventHandle;
          if (comm_peer2peer_enabled(dir,dim)) {
            cudaEventCreate(&ipcCopyEvent[b][dir][dim], cudaEventDisableTiming | cudaEventInterprocess);
            cudaIpcGetEventHandle(&ipcLocalEventHandle, ipcCopyEvent[b][dir][dim]);

            sendHandle = comm_declare_send_relative(&ipcLocalEventHandle, dim, disp,
                                                    sizeof(ipcLocalEventHandle));
          }

          if (receiveHandle) comm_start(receiveHandle);
          if (sendHandle) comm_start(sendHandle);

          if (receiveHandle) comm_wait(receiveHandle);
          if (sendHandle) comm_wait(sendHandle);

          if (sendHandle) comm_free(sendHandle);
          if (receiveHandle) comm_free(receiveHandle);

        } // buffer index
      }
    }

    checkCudaError();

    for (int dim=0; dim<4; ++dim) {
      if (comm_dim(dim)==1) continue;
      for (int dir=0; dir<2; ++dir) {
        if (!comm_peer2peer_enabled(dir,dim)) continue;
        for (int b=0; b<2; b++) {
          cudaIpcOpenEventHandle(&(ipcRemoteCopyEvent[b][dir][dim]), ipcRemoteEventHandle[b][dir][dim]);
        }
      }
    }

    // Create message handles for IPC synchronization
    for (int dim=0; dim<4; ++dim) {
      if (comm_dim(dim)==1) continue;
      if (comm_peer2peer_enabled(1,dim)) {
        for (int b=0; b<2; b++) {
          // send to processor in forward direction
          mh_send_p2p_fwd[b][dim] = comm_declare_send_relative(&buffer_send_p2p_fwd[b][dim], dim, +1, sizeof(int));
          // receive from processor in forward direction
          mh_recv_p2p_fwd[b][dim] = comm_declare_receive_relative(&buffer_recv_p2p_fwd[b][dim], dim, +1, sizeof(int));
        }
      }

      if (comm_peer2peer_enabled(0,dim)) {
        for (int b=0; b<2; b++) {
          // send to processor in backward direction
          mh_send_p2p_back[b][dim] = comm_declare_send_relative(&buffer_send_p2p_back[b][dim], dim, -1, sizeof(int));
          // receive from processor in backward direction
          mh_recv_p2p_back[b][dim] = comm_declare_receive_relative(&buffer_recv_p2p_back[b][dim], dim, -1, sizeof(int));
        }
      }
    }
    checkCudaError();

    initIPCComms = true;
    ghost_field_reset = false;
  }
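
  // Outline of the handshake used above (illustrative, not from the original
  // file): each rank creates an IPC handle for its own device receive buffer,
  // ships the handle to the neighbouring rank over the regular comms layer,
  // and opens the handle it receives so it can write directly into the
  // neighbour's receive buffer.
  //
  //   cudaIpcMemHandle_t local_handle, remote_handle;
  //   cudaIpcGetMemHandle(&local_handle, ghost_recv_buffer_d[b]);   // my buffer
  //   /* exchange local_handle <-> remote_handle with the neighbour via MsgHandles */
  //   void *neighbour_buffer = nullptr;
  //   cudaIpcOpenMemHandle(&neighbour_buffer, remote_handle, cudaIpcMemLazyEnablePeerAccess);
  //
  // The same pattern is repeated for the cudaEvent_t handles
  // (cudaIpcGetEventHandle / cudaIpcOpenEventHandle) so that copy completion
  // can be signalled across processes.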

  void LatticeField::destroyIPCComms() {

    if (!initIPCComms) return;
    checkCudaError();

    // ensure that all processes bring down their communicators
    // synchronously so that we don't end up in an undefined state
    qudaDeviceSynchronize();
    comm_barrier();

    for (int dim=0; dim<4; ++dim) {

      if (comm_dim(dim)==1) continue;
      const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0,dim) && comm_peer2peer_enabled(1,dim)) ? 1 : 2;

      for (int b=0; b<2; b++) {
        if (comm_peer2peer_enabled(1,dim)) {
          if (mh_send_p2p_fwd[b][dim] || mh_recv_p2p_fwd[b][dim]) {
            cudaEventDestroy(ipcCopyEvent[b][1][dim]);
            // only close this handle if it doesn't alias the back ghost
            if (num_dir == 2) cudaIpcCloseMemHandle(ghost_remote_send_buffer_d[b][dim][1]);
          }
          if (mh_send_p2p_fwd[b][dim]) comm_free(mh_send_p2p_fwd[b][dim]);
          if (mh_recv_p2p_fwd[b][dim]) comm_free(mh_recv_p2p_fwd[b][dim]);
        }

        if (comm_peer2peer_enabled(0,dim)) {
          if (mh_send_p2p_back[b][dim] || mh_recv_p2p_back[b][dim]) {
            cudaEventDestroy(ipcCopyEvent[b][0][dim]);
            cudaIpcCloseMemHandle(ghost_remote_send_buffer_d[b][dim][0]);
          }
          if (mh_send_p2p_back[b][dim]) comm_free(mh_send_p2p_back[b][dim]);
          if (mh_recv_p2p_back[b][dim]) comm_free(mh_recv_p2p_back[b][dim]);
        }
      } // buffer
    } // iterate over dim

    checkCudaError();

    // local take down complete - now synchronize to ensure globally complete
    qudaDeviceSynchronize();
    comm_barrier();

    initIPCComms = false;
  }

  bool LatticeField::ipcCopyComplete(int dir, int dim)
  {
    return (cudaSuccess == cudaEventQuery(ipcCopyEvent[bufferIndex][dir][dim]) ? true : false);
  }

  bool LatticeField::ipcRemoteCopyComplete(int dir, int dim)
  {
    return (cudaSuccess == cudaEventQuery(ipcRemoteCopyEvent[bufferIndex][dir][dim]) ? true : false);
  }

  const cudaEvent_t& LatticeField::getIPCCopyEvent(int dir, int dim) const {
    return ipcCopyEvent[bufferIndex][dir][dim];
  }

  const cudaEvent_t& LatticeField::getIPCRemoteCopyEvent(int dir, int dim) const {
    return ipcRemoteCopyEvent[bufferIndex][dir][dim];
  }
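
  // Example use of the completion queries above (illustrative, not from the
  // original file): after launching a copy into a peer's ghost buffer in the
  // forward direction of dimension dim, a caller could poll until the
  // associated IPC event has been reached:
  //
  //   while (!field.ipcCopyComplete(1 /*fwd*/, dim)) { /* poll or overlap other work */ }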

  void LatticeField::setTuningString() {
    char vol_tmp[TuneKey::volume_n];
    int check = snprintf(vol_string, TuneKey::volume_n, "%d", x[0]);
    if (check < 0 || check >= TuneKey::volume_n) errorQuda("Error writing volume string");
    for (int d=1; d<nDim; d++) {
      strcpy(vol_tmp, vol_string);
      check = snprintf(vol_string, TuneKey::volume_n, "%sx%d", vol_tmp, x[d]);
      if (check < 0 || check >= TuneKey::volume_n) errorQuda("Error writing volume string");
    }
  }
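
  // For example, a 4-dimensional field with x = {16, 16, 16, 32} yields the
  // tuning volume string "16x16x16x32".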

  void LatticeField::checkField(const LatticeField &a) const {
    if (a.nDim != nDim) errorQuda("nDim does not match %d %d", nDim, a.nDim);
    if (a.ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
      // if the source is extended but I am not, then we compare its interior volume to my volume
      int a_volume_interior = 1;
      for (int i=0; i<nDim; i++) {
        if (a.x[i]-2*a.r[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i]-2*a.r[i]);
        a_volume_interior *= a.x[i] - 2*a.r[i];
      }
      if (a_volume_interior != volume) errorQuda("Interior volume does not match %d %d", volume, a_volume_interior);
    } else if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED && a.ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
      // if I am extended but the source is not, then we compare my interior volume to its volume
      int this_volume_interior = 1;
      for (int i=0; i<nDim; i++) {
        if (x[i]-2*r[i] != a.x[i]) errorQuda("x[%d] does not match %d %d", i, x[i]-2*r[i], a.x[i]);
        this_volume_interior *= x[i] - 2*r[i];
      }
      if (this_volume_interior != a.volume) errorQuda("Interior volume does not match %d %d", this_volume_interior, a.volume);
    } else {
      if (a.volume != volume) errorQuda("Volume does not match %d %d", volume, a.volume);
      if (a.volumeCB != volumeCB) errorQuda("VolumeCB does not match %d %d", volumeCB, a.volumeCB);
      for (int i=0; i<nDim; i++) {
        if (a.x[i] != x[i]) errorQuda("x[%d] does not match %d %d", i, x[i], a.x[i]);
        if (a.surface[i] != surface[i]) errorQuda("surface[%d] does not match %d %d", i, surface[i], a.surface[i]);
        if (a.surfaceCB[i] != surfaceCB[i]) errorQuda("surfaceCB[%d] does not match %d %d", i, surfaceCB[i], a.surfaceCB[i]);
      }
    }
  }

  QudaFieldLocation LatticeField::Location() const {
    QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION;
    if (typeid(*this)==typeid(cudaCloverField) ||
        typeid(*this)==typeid(cudaColorSpinorField) ||
        typeid(*this)==typeid(cudaGaugeField)) {
      location = QUDA_CUDA_FIELD_LOCATION;
    } else if (typeid(*this)==typeid(cpuCloverField) ||
               typeid(*this)==typeid(cpuColorSpinorField) ||
               typeid(*this)==typeid(cpuGaugeField)) {
      location = QUDA_CPU_FIELD_LOCATION;
    } else {
      errorQuda("Unknown field %s, so cannot determine location", typeid(*this).name());
    }
    return location;
  }

  void LatticeField::read(char *filename) {
    errorQuda("Not implemented");
  }

  void LatticeField::write(char *filename) {
    errorQuda("Not implemented");
  }

  int LatticeField::Nvec() const {
    if (typeid(*this) == typeid(const cudaColorSpinorField)) {
      const ColorSpinorField &csField = static_cast<const ColorSpinorField&>(*this);
      if (csField.FieldOrder() == 2 || csField.FieldOrder() == 4)
        return static_cast<int>(csField.FieldOrder());
    } else if (typeid(*this) == typeid(const cudaGaugeField)) {
      const GaugeField &gField = static_cast<const GaugeField&>(*this);
      if (gField.Order() == 2 || gField.Order() == 4)
        return static_cast<int>(gField.Order());
    } else if (typeid(*this) == typeid(const cudaCloverField)) {
      const CloverField &cField = static_cast<const CloverField&>(*this);
      if (cField.Order() == 2 || cField.Order() == 4)
        return static_cast<int>(cField.Order());
    }

    errorQuda("Unsupported field type");
    return -1;
  }

  // This doesn't really live here, but is fine for the moment
  std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param)
  {
    output << "nDim = " << param.nDim << std::endl;
    for (int i=0; i<param.nDim; i++) {
      output << "x[" << i << "] = " << param.x[i] << std::endl;
    }
    output << "pad = " << param.pad << std::endl;
    output << "precision = " << param.Precision() << std::endl;
    output << "ghost_precision = " << param.GhostPrecision() << std::endl;
    output << "scale = " << param.scale << std::endl;

    output << "ghostExchange = " << param.ghostExchange << std::endl;
    for (int i=0; i<param.nDim; i++) {
      output << "r[" << i << "] = " << param.r[i] << std::endl;
    }

    return output; // for multiple << operators.
  }
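
  // Illustrative usage (not part of the original file): given a populated
  // LatticeFieldParam, the operator streams its geometry and precision settings,
  // e.g.
  //
  //   std::cout << param;   // prints nDim, x[i], pad, precision, ghost_precision, scale, ...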

  static QudaFieldLocation reorder_location_ = QUDA_CUDA_FIELD_LOCATION;

  QudaFieldLocation reorder_location() { return reorder_location_; }
  void reorder_location_set(QudaFieldLocation _reorder_location) { reorder_location_ = _reorder_location; }

} // namespace quda