QUDA  v1.1.0
A library for QCD on GPUs
cuda_gauge_field.cpp
#include <string.h>
#include <gauge_field.h>
#include <typeinfo>
#include <blas_quda.h>

namespace quda {

  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
    GaugeField(param), gauge(0), even(0), odd(0)
  {
    if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
        create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("QDP ordering only supported for reference fields");
    }

    if (order == QUDA_QDP_GAUGE_ORDER ||
        order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER ||
        order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_CPS_WILSON_GAUGE_ORDER)
      errorQuda("Field ordering %d presently disabled for this type", order);

#ifdef MULTI_GPU
    if (link_type != QUDA_ASQTAD_MOM_LINKS &&
        ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
        isNative()) {
      bool pad_check = true;
      for (int i=0; i<nDim; i++) {
        // when we have coarse links we need to double the pad since we're storing forwards and backwards links
        int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
        if (pad < minimum_pad) pad_check = false;
        if (!pad_check)
          errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n", i, pad, minimum_pad);
      }
    }
#endif

    if (create != QUDA_NULL_FIELD_CREATE &&
        create != QUDA_ZERO_FIELD_CREATE &&
        create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("ERROR: create type(%d) not supported yet\n", create);
    }

    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      switch(mem_type) {
      case QUDA_MEMORY_DEVICE: gauge = bytes ? pool_device_malloc(bytes) : nullptr; break;
      case QUDA_MEMORY_MAPPED:
        gauge_h = bytes ? mapped_malloc(bytes) : nullptr;
        gauge = bytes ? get_mapped_device_pointer(gauge_h) : nullptr; // set the matching device pointer
        break;
      default:
        errorQuda("Unsupported memory type %d", mem_type);
      }
      if (create == QUDA_ZERO_FIELD_CREATE && bytes) qudaMemset(gauge, 0, bytes);
    } else {
      gauge = param.gauge;
    }

    if ( !isNative() ) {
      // allocate a separate device ghost buffer for each dimension
      for (int i=0; i<nDim; i++) {
        size_t nbytes = nFace * surface[i] * nInternal * precision;
        ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
        ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
      }
    }

    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
      if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
    }

    even = gauge;
    odd = static_cast<char*>(gauge) + bytes/2;
    if (create != QUDA_ZERO_FIELD_CREATE && isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) zeroPad();
  }
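
  // Illustrative usage sketch (not part of the original file): constructing a
  // zero-initialized, device-resident field with the same layout as an existing
  // one.  GaugeFieldParam(const GaugeField &) and the create member used below are
  // assumed from gauge_field.h; the helper name make_zeroed_device_copy is
  // hypothetical.
  inline cudaGaugeField *make_zeroed_device_copy(const GaugeField &src)
  {
    GaugeFieldParam param(src);            // inherit geometry, precision, order, etc. from src
    param.create = QUDA_ZERO_FIELD_CREATE; // constructor takes the pool_device_malloc + qudaMemset path above
    return new cudaGaugeField(param);      // even/odd pointers are set at the end of the constructor
  }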

  void cudaGaugeField::zeroPad() {
    size_t pad_bytes = (stride - volumeCB) * precision * order;
    int Npad = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;

    size_t pitch = stride*order*precision;
    if (pad_bytes) {
      qudaMemset2D(static_cast<char *>(even) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
      qudaMemset2D(static_cast<char *>(odd) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
    }
  }

  cudaGaugeField::~cudaGaugeField()
  {
    destroyComms();

    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      switch(mem_type) {
      case QUDA_MEMORY_DEVICE:
        if (gauge) pool_device_free(gauge);
        break;
      case QUDA_MEMORY_MAPPED:
        if (gauge_h) host_free(gauge_h);
        break;
      default:
        errorQuda("Unsupported memory type %d", mem_type);
      }
    }

    if ( !isNative() ) {
      for (int i=0; i<nDim; i++) {
        if (ghost[i]) pool_device_free(ghost[i]);
        if (ghost[i + 4] && geometry == QUDA_COARSE_GEOMETRY) pool_device_free(ghost[i + 4]);
      }
    }
  }

  // This does the exchange of the forwards boundary gauge field ghost zone and places
  // it into the ghost array of the next node
  void cudaGaugeField::exchangeGhost(QudaLinkDirection link_direction) {

    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
    if (nFace == 0) errorQuda("nFace = 0");

    const int dir = 1; // sending forwards only
    const int R[] = {nFace, nFace, nFace, nFace};
    const bool no_comms_fill = true; // dslash kernels presently require this
    const bool bidir = false; // communication is only ever done in one direction at once
    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge

    // loop over backwards and forwards links
    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
    for (int link_dir = 0; link_dir<2; link_dir++) {
      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;

      void *send_d[2*QUDA_MAX_DIM] = { };
      void *recv_d[2*QUDA_MAX_DIM] = { };

      size_t offset = 0;
      for (int d=0; d<nDim; d++) {
        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
        if (bidir) offset += ghost_face_bytes_aligned[d];
        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
        offset += ghost_face_bytes_aligned[d];
      }

      extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers

      // issue receive preposts and device-to-host copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        recvStart(dim, dir); // prepost the receive
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2 * dim + dir]);
        }
      }

      // if gdr enabled then synchronize
      if (comm_gdr_enabled()) qudaDeviceSynchronize();

      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) qudaStreamSynchronize(streams[2*dim+dir]);
        sendStart(dim, dir, &streams[2*dim+dir]); // start sending
      }

      // complete communication and issue host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        commsComplete(dim, dir);
        if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2 * dim + dir]);
        }
      }

      // fill in the halos for non-partitioned dimensions
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim) && no_comms_fill) {
          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        }
      }

      if (isNative()) {
        copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
      } else {
        // copy from receive buffer into ghost array
        for (int dim=0; dim<nDim; dim++)
          qudaMemcpy(ghost[dim+link_dir*nDim], recv_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      bufferIndex = 1-bufferIndex;
    } // link_dir

    qudaDeviceSynchronize();
  }

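  // Illustrative usage sketch (not part of the original file): a caller typically
  // refreshes the ghost zone right after the field contents change.  Vector-geometry
  // fields only exchange backwards links; coarse-geometry fields store forwards and
  // backwards links and may exchange both.  The helper name refresh_ghosts is
  // hypothetical.
  inline void refresh_ghosts(cudaGaugeField &U)
  {
    if (U.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) return; // exchangeGhost requires a padded field
    U.exchangeGhost(U.Geometry() == QUDA_COARSE_GEOMETRY ? QUDA_LINK_BIDIRECTIONAL : QUDA_LINK_BACKWARDS);
  }
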
  // This does the opposite of exchangeGhost and sends back the ghost
  // zone to the node from which it came and injects it back into the
  // field
  void cudaGaugeField::injectGhost(QudaLinkDirection link_direction) {

    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call injectGhost with ghostExchange=%d", ghostExchange);
    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
    if (nFace == 0) errorQuda("nFace = 0");

    const int dir = 0; // sending backwards only
    const int R[] = {nFace, nFace, nFace, nFace};
    const bool no_comms_fill = false; // injection never does no_comms_fill
    const bool bidir = false; // communication is only ever done in one direction at once
    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge

    // loop over backwards and forwards links (forwards links never sent but leave here just in case)
    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
    for (int link_dir = 0; link_dir<2; link_dir++) {
      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;

      void *send_d[2*QUDA_MAX_DIM] = { };
      void *recv_d[2*QUDA_MAX_DIM] = { };

      size_t offset = 0;
      for (int d=0; d<nDim; d++) {
        // send backwards is first half of each ghost_send_buffer
        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
        if (bidir) offset += ghost_face_bytes_aligned[d];
        // receive from forwards is the second half of each ghost_recv_buffer
        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
        offset += ghost_face_bytes_aligned[d];
      }

      if (isNative()) { // copy from padded region in gauge field into send buffer
        copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
      } else { // copy from ghost array into send buffer
        for (int dim=0; dim<nDim; dim++) qudaMemcpy(send_d[dim], ghost[dim+link_dir*nDim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      // issue receive preposts and device-to-host copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        recvStart(dim, dir); // prepost the receive
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2 * dim + dir]);
        }
      }

      // if gdr enabled then synchronize
      if (comm_gdr_enabled()) qudaDeviceSynchronize();

      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) qudaStreamSynchronize(streams[2*dim+dir]);
        sendStart(dim, dir, &streams[2*dim+dir]); // start sending
      }

      // complete communication and issue host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        commsComplete(dim, dir);
        if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2 * dim + dir]);
        }
      }

      // fill in the halos for non-partitioned dimensions
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim) && no_comms_fill) {
          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        }
      }

      // get the links into contiguous buffers
      extractGaugeGhost(*this, recv_d, false, link_dir*nDim);

      bufferIndex = 1-bufferIndex;
    } // link_dir

    qudaDeviceSynchronize();
  }

  void cudaGaugeField::allocateGhostBuffer(const int *R, bool no_comms_fill, bool bidir) const
  {
    createGhostZone(R, no_comms_fill, bidir);
    LatticeField::allocateGhostBuffer(ghost_bytes);
  }

  void cudaGaugeField::createComms(const int *R, bool no_comms_fill, bool bidir)
  {
    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated

    // ascertain if this instance needs its comms buffers to be updated
    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
      (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1]) ||
      (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1]) ||
      ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)

    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill, bidir);

    if (ghost_field_reset) destroyIPCComms();
    createIPCComms();
  }

  void cudaGaugeField::recvStart(int dim, int dir)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (dir==0) { // sending backwards
      // receive from the processor in the +1 direction
      if (comm_peer2peer_enabled(1,dim)) {
        comm_start(mh_recv_p2p_fwd[bufferIndex][dim]);
      } else if (comm_gdr_enabled()) {
        comm_start(mh_recv_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_start(mh_recv_fwd[bufferIndex][dim]);
      }
    } else { // sending forwards
      // receive from the processor in the -1 direction
      if (comm_peer2peer_enabled(0,dim)) {
        comm_start(mh_recv_p2p_back[bufferIndex][dim]);
      } else if (comm_gdr_enabled()) {
        comm_start(mh_recv_rdma_back[bufferIndex][dim]);
      } else {
        comm_start(mh_recv_back[bufferIndex][dim]);
      }
    }
  }

  void cudaGaugeField::sendStart(int dim, int dir, qudaStream_t *stream_p)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (!comm_peer2peer_enabled(dir,dim)) {
      if (dir == 0)
        if (comm_gdr_enabled()) {
          comm_start(mh_send_rdma_back[bufferIndex][dim]);
        } else {
          comm_start(mh_send_back[bufferIndex][dim]);
        }
      else
        if (comm_gdr_enabled()) {
          comm_start(mh_send_rdma_fwd[bufferIndex][dim]);
        } else {
          comm_start(mh_send_fwd[bufferIndex][dim]);
        }
    } else { // doing peer-to-peer

      void *ghost_dst
        = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];

      cudaMemcpyAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir],
                      ghost_face_bytes[dim], cudaMemcpyDeviceToDevice,
                      stream_p ? *stream_p : 0);

      if (dir == 0) {
        // record the event
        qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], stream_p ? *stream_p : 0);
        // send to the processor in the -1 direction
        comm_start(mh_send_p2p_back[bufferIndex][dim]);
      } else {
        qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], stream_p ? *stream_p : 0);
        // send to the processor in the +1 direction
        comm_start(mh_send_p2p_fwd[bufferIndex][dim]);
      }
    }
  }

  void cudaGaugeField::commsComplete(int dim, int dir)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (dir==0) {
      if (comm_peer2peer_enabled(1,dim)) {
        comm_wait(mh_recv_p2p_fwd[bufferIndex][dim]);
        qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][1][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_recv_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_wait(mh_recv_fwd[bufferIndex][dim]);
      }

      if (comm_peer2peer_enabled(0,dim)) {
        comm_wait(mh_send_p2p_back[bufferIndex][dim]);
        qudaEventSynchronize(ipcCopyEvent[bufferIndex][0][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_send_rdma_back[bufferIndex][dim]);
      } else {
        comm_wait(mh_send_back[bufferIndex][dim]);
      }
    } else {
      if (comm_peer2peer_enabled(0,dim)) {
        comm_wait(mh_recv_p2p_back[bufferIndex][dim]);
        qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][0][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_recv_rdma_back[bufferIndex][dim]);
      } else {
        comm_wait(mh_recv_back[bufferIndex][dim]);
      }

      if (comm_peer2peer_enabled(1,dim)) {
        comm_wait(mh_send_p2p_fwd[bufferIndex][dim]);
        qudaEventSynchronize(ipcCopyEvent[bufferIndex][1][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_send_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_wait(mh_send_fwd[bufferIndex][dim]);
      }
    }
  }

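  // Illustrative sketch (not part of the original file): recvStart(), sendStart() and
  // commsComplete() are always composed in this order for a partitioned dimension,
  // as exchangeExtendedGhost() below does.  The send buffers must already have been
  // staged (createComms() plus an extract/copy into ghost_send_buffer_d); the helper
  // name exchange_dim is hypothetical.
  inline void exchange_dim(cudaGaugeField &U, int dim)
  {
    if (!comm_dim_partitioned(dim)) return;
    for (int dir = 0; dir < 2; dir++) U.recvStart(dim, dir);     // prepost both receives
    for (int dir = 0; dir < 2; dir++) U.sendStart(dim, dir);     // start both sends (default stream)
    for (int dir = 0; dir < 2; dir++) U.commsComplete(dim, dir); // wait on sends and receives
  }
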
  void cudaGaugeField::exchangeExtendedGhost(const int *R, bool no_comms_fill)
  {
    const int b = bufferIndex;
    void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];

    createComms(R, no_comms_fill);

    size_t offset = 0;
    for (int dim=0; dim<nDim; dim++) {
      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
      send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
      recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;

      // silence cuda-memcheck initcheck errors that arise since we
      // have an oversized ghost buffer when doing the extended exchange
      qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], 0);
      offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back
    }

    for (int dim=0; dim<nDim; dim++) {
      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;

      // extract into a contiguous buffer
      extractExtendedGaugeGhost(*this, dim, R, send_d, true);

      if (comm_dim_partitioned(dim)) {
        for (int dir=0; dir<2; dir++) recvStart(dim, dir);

        for (int dir=0; dir<2; dir++) {
          // issue device-to-host copies if needed
          if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                            ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[dir]);
          }
        }

        // if either direction is not peer-to-peer then we need to synchronize
        if (!comm_peer2peer_enabled(0,dim) || !comm_peer2peer_enabled(1,dim)) qudaDeviceSynchronize();

        // if we pass a stream to sendStart then we must ensure that stream is synchronized
        for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, &streams[dir]);
        for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir);

        for (int dir = 0; dir < 2; dir++) {
          // issue host-to-device copies if needed
          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
                            ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[dir]);
          }
        }

      } else { // if just doing a local exchange to fill halo then need to swap faces
        qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1],
                   ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0],
                   ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      // inject back into the gauge field
      extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
    }

    bufferIndex = 1-bufferIndex;
    qudaDeviceSynchronize();
  }

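  // Illustrative usage sketch (not part of the original file): filling the halo of a
  // field created with QUDA_GHOST_EXCHANGE_EXTENDED.  R() returns the halo depth per
  // dimension; passing no_comms_fill=true also fills non-partitioned dimensions.
  // The helper name fill_extended_halo is hypothetical.
  inline void fill_extended_halo(cudaGaugeField &U_ext)
  {
    if (U_ext.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) return;
    U_ext.exchangeExtendedGhost(U_ext.R(), true);
  }
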
  void cudaGaugeField::exchangeExtendedGhost(const int *R, TimeProfile &profile, bool no_comms_fill) {
    profile.TPSTART(QUDA_PROFILE_COMMS);
    exchangeExtendedGhost(R, no_comms_fill);
    profile.TPSTOP(QUDA_PROFILE_COMMS);
  }

  void cudaGaugeField::setGauge(void *gauge_)
  {
    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("Setting gauge pointer is only allowed when create="
                "QUDA_REFERENCE_FIELD_CREATE type\n");
    }
    gauge = gauge_;
  }

  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
  {
    if (order == QUDA_QDP_GAUGE_ORDER) {
      void **buffer = new void*[geometry];
      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
      return ((void*)buffer);
    } else {
      return pool_device_malloc(bytes);
    }
  }

  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
  {
    if (order > 4) {
      void **buffer = new void*[geometry];
      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
      return buffer;
    } else {
      return 0;
    }
  }

  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
    if (order == QUDA_QDP_GAUGE_ORDER) {
      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
      delete []((void**)buffer);
    } else {
      pool_device_free(buffer);
    }
  }

  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
    if (order > 4) {
      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
      delete []buffer;
    }
  }

  void cudaGaugeField::copy(const GaugeField &src) {
    if (this == &src) return;

    checkField(src);

    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
      fat_link_max = src.LinkMax();
      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
    } else {
      fat_link_max = 1.0;
    }

    if (typeid(src) == typeid(cudaGaugeField)) {

      if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
        // copy field and ghost zone into this field
        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);

        if (geometry == QUDA_COARSE_GEOMETRY)
          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
      } else {
        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
        if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
      }

    } else if (typeid(src) == typeid(cpuGaugeField)) {
      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
        void *buffer = pool_pinned_malloc(bytes);

        if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
          // copy field and ghost zone into buffer
          copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);

          if (geometry == QUDA_COARSE_GEOMETRY)
            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField &>(src).gauge,
                             0, 0, 3);
        } else {
          copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
        }

        // this copies over both even and odd
        qudaMemcpy(gauge, buffer, bytes, cudaMemcpyDefault);
        pool_pinned_free(buffer);
      } else { // else do the reorder on the GPU

        if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
            src.Order() == QUDA_BQCD_GAUGE_ORDER ||
            src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
          // special case where we use zero-copy memory to read/write directly from application's array
          void *src_d = get_mapped_device_pointer(src.Gauge_p());

          if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d);
          } else {
            errorQuda("Ghost copy not supported here");
          }

        } else {
          void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
          size_t ghost_bytes[8];
          int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
          for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
          void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;

          if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
            for (int d=0; d<geometry; d++) {
              qudaMemcpy(((void **)buffer)[d], ((void **)src.Gauge_p())[d], src.Bytes() / geometry, cudaMemcpyDefault);
            }
          } else {
            qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), cudaMemcpyDefault);
          }

          if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
              && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
            for (int d = 0; d < geometry; d++)
              qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], cudaMemcpyDefault);

          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
            if (geometry == QUDA_COARSE_GEOMETRY)
              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
          } else {
            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer);
            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
          }
          free_gauge_buffer(buffer, src.Order(), src.Geometry());
          if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
        }
      } // reorder_location
    } else {
      errorQuda("Invalid gauge field type");
    }

    // if we have copied from a source without a pad then we need to exchange
    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD)
      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);

    staggeredPhaseApplied = src.StaggeredPhaseApplied();
    staggeredPhaseType = src.StaggeredPhase();

    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
  }

  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) {
    copy(cpu);
    qudaDeviceSynchronize();
  }

  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) {
    profile.TPSTART(QUDA_PROFILE_H2D);
    loadCPUField(cpu);
    profile.TPSTOP(QUDA_PROFILE_H2D);
  }

  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const
  {
    static_cast<LatticeField&>(cpu).checkField(*this);

    if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) {

      if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
          cpu.Order() == QUDA_BQCD_GAUGE_ORDER ||
          cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
        // special case where we use zero-copy memory to read/write directly from application's array
        void *cpu_d = get_mapped_device_pointer(cpu.Gauge_p());
        if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
        } else {
          errorQuda("Ghost copy not supported here");
        }
      } else {
        void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());

        // Allocate space for ghost zone if required
        size_t ghost_bytes[8];
        int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
        for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
        void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;

        if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
          if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
        } else {
          copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
        }

        if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
          for (int d = 0; d < geometry; d++)
            qudaMemcpy(((void **)cpu.gauge)[d], ((void **)buffer)[d], cpu.Bytes() / geometry, cudaMemcpyDefault);
        } else {
          qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), cudaMemcpyDefault);
        }

        if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
            && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
          for (int d = 0; d < geometry; d++)
            qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], cudaMemcpyDefault);

        free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
        if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
      }
    } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder

      void *buffer = pool_pinned_malloc(bytes);
      qudaMemcpy(buffer, gauge, bytes, cudaMemcpyDefault);

      if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
        copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
      } else {
        copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
      }
      pool_pinned_free(buffer);

    } else {
      errorQuda("Invalid pack location %d", reorder_location());
    }

    cpu.staggeredPhaseApplied = staggeredPhaseApplied;
    cpu.staggeredPhaseType = staggeredPhaseType;

    qudaDeviceSynchronize();
  }

  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const {
    profile.TPSTART(QUDA_PROFILE_D2H);
    saveCPUField(cpu);
    profile.TPSTOP(QUDA_PROFILE_D2H);
  }

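  // Illustrative usage sketch (not part of the original file): host <-> device round
  // trip built from the two routines above.  cpu and gpu must describe the same
  // lattice (checkField() is called internally); the profiling overloads wrap the
  // same calls in H2D/D2H timers.  The helper name round_trip is hypothetical.
  inline void round_trip(cudaGaugeField &gpu, cpuGaugeField &cpu)
  {
    gpu.loadCPUField(cpu); // reorder and copy the host field into the device layout
    // ... run kernels that update gpu ...
    gpu.saveCPUField(cpu); // reorder and copy the device field back to the host layout
  }
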
  void cudaGaugeField::backup() const {
    if (backed_up) errorQuda("Gauge field already backed up");
    backup_h = new char[bytes];
    qudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDefault);
    backed_up = true;
  }

  void cudaGaugeField::restore() const
  {
    if (!backed_up) errorQuda("Cannot restore since not backed up");
    qudaMemcpy(gauge, backup_h, bytes, cudaMemcpyDefault);
    delete []backup_h;
    backed_up = false;
  }

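  // Illustrative usage sketch (not part of the original file): backup()/restore()
  // bracket a trial update, e.g. a rejected HMC step.  The accept flag stands in for
  // the caller's accept/reject decision; the helper name trial_update is hypothetical.
  inline void trial_update(cudaGaugeField &U, bool accept)
  {
    U.backup();               // snapshot the gauge field to host memory
    // ... modify U on the device ...
    if (!accept) U.restore(); // roll back to the snapshot
  }
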
  void cudaGaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
  {
    if (is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) {
      if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream);
      if (!isNative()) {
        for (int i = 0; i < nDim; i++) {
          size_t nbytes = nFace * surface[i] * nInternal * precision;
          if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream);
          if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY)
            qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream);
        }
      }
    }
  }

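  // Illustrative usage sketch (not part of the original file): with managed memory and
  // prefetching enabled, hinting residency before a compute-heavy phase avoids
  // first-touch page faults; prefetch() is a no-op otherwise.  The helper name
  // prefetch_to_gpu is hypothetical and relies on the default stream argument.
  inline void prefetch_to_gpu(const cudaGaugeField &U) { U.prefetch(QUDA_CUDA_FIELD_LOCATION); }
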
  void cudaGaugeField::zero() { qudaMemset(gauge, 0, bytes); }

  void cudaGaugeField::copy_to_buffer(void *buffer) const
  {
    qudaMemcpy(buffer, Gauge_p(), Bytes(), cudaMemcpyDeviceToHost);
  }

  void cudaGaugeField::copy_from_buffer(void *buffer)
  {
    qudaMemcpy(Gauge_p(), buffer, Bytes(), cudaMemcpyHostToDevice);
  }

} // namespace quda