QUDA 1.0.0
cuda_gauge_field.cpp
#include <string.h>
#include <gauge_field.h>
#include <typeinfo>
#include <blas_quda.h>

namespace quda {

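  // Construct a device-resident gauge field.  Depending on the creation type this
  // allocates device (or mapped host) storage, zero-initializes it if requested,
  // allocates ghost buffers for non-native field orders, and creates texture
  // objects when USE_TEXTURE_OBJECTS is enabled.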
  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
    GaugeField(param), gauge(0), even(0), odd(0)
  {
    if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
        create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("QDP ordering only supported for reference fields");
    }

    if (order == QUDA_QDP_GAUGE_ORDER ||
        order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER ||
        order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_CPS_WILSON_GAUGE_ORDER)
      errorQuda("Field ordering %d presently disabled for this type", order);

#ifdef MULTI_GPU
    if (link_type != QUDA_ASQTAD_MOM_LINKS &&
        ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
        isNative()) {
      bool pad_check = true;
      for (int i=0; i<nDim; i++) {
        // when we have coarse links we need to double the pad since we're storing forwards and backwards links
        int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
        if (pad < minimum_pad) pad_check = false;
        if (!pad_check)
          errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n", i, pad, minimum_pad);
      }
    }
#endif

    if (create != QUDA_NULL_FIELD_CREATE &&
        create != QUDA_ZERO_FIELD_CREATE &&
        create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("ERROR: create type(%d) not supported yet\n", create);
    }

    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      switch(mem_type) {
      case QUDA_MEMORY_DEVICE:
        gauge = pool_device_malloc(bytes);
        break;
      case QUDA_MEMORY_MAPPED:
        gauge_h = mapped_malloc(bytes);
        cudaHostGetDevicePointer(&gauge, gauge_h, 0); // set the matching device pointer
        break;
      default:
        errorQuda("Unsupported memory type %d", mem_type);
      }
      if (create == QUDA_ZERO_FIELD_CREATE) cudaMemset(gauge, 0, bytes);
    } else {
      gauge = param.gauge;
    }

    if ( !isNative() ) {
      for (int i=0; i<nDim; i++) {
        size_t nbytes = nFace * surface[i] * nInternal * precision;
        ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
        ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
      }
    }

    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
      if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
    }

    even = gauge;
    odd = static_cast<char*>(gauge) + bytes/2;

#ifdef USE_TEXTURE_OBJECTS
    createTexObject(tex, gauge, true);
    createTexObject(evenTex, even, false);
    createTexObject(oddTex, odd, false);
    if (reconstruct == QUDA_RECONSTRUCT_13 || reconstruct == QUDA_RECONSTRUCT_9) {
      // Create texture objects for the phases
      bool isPhase = true;
      createTexObject(phaseTex, (char*)gauge + phase_offset, true, isPhase);
      createTexObject(evenPhaseTex, (char*)even + phase_offset, false, isPhase);
      createTexObject(oddPhaseTex, (char*)odd + phase_offset, false, isPhase);
    }
#endif

  }

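  // Initialize the padded region to 0: clear the bytes between the end of the
  // even/odd volume data and the end of each stride.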
  void cudaGaugeField::zeroPad() {
    size_t pad_bytes = (stride - volumeCB) * precision * order;
    int Npad = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;

    size_t pitch = stride*order*precision;
    if (pad_bytes) {
      cudaMemset2D(static_cast<char*>(even) + volumeCB*order*precision, pitch, 0, pad_bytes, Npad);
      cudaMemset2D(static_cast<char*>(odd) + volumeCB*order*precision, pitch, 0, pad_bytes, Npad);
    }
  }

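  // Create a 1D linear texture object over the field (or over its phase block
  // when isPhase is set).  The channel format is chosen from the precision and
  // reconstruction, and the binding is checked against the device's texture
  // alignment and maximum 1D texture size.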
#ifdef USE_TEXTURE_OBJECTS
  void cudaGaugeField::createTexObject(cudaTextureObject_t &tex, void *field, bool full, bool isPhase) {

    if (isNative() && geometry != QUDA_COARSE_GEOMETRY) {
      // create the texture for the field components
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
      else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2

      int texel_size = 1;
      if (isPhase) {
        if (precision == QUDA_DOUBLE_PRECISION) {
          desc.x = 8*sizeof(int);
          desc.y = 8*sizeof(int);
          desc.z = 0;
          desc.w = 0;
          texel_size = 2*sizeof(int);
        } else {
          desc.x = 8*precision;
          desc.y = desc.z = desc.w = 0;
          texel_size = precision;
        }
      } else {
        // always four components regardless of precision
        if (precision == QUDA_DOUBLE_PRECISION) {
          desc.x = 8*sizeof(int);
          desc.y = 8*sizeof(int);
          desc.z = 8*sizeof(int);
          desc.w = 8*sizeof(int);
          texel_size = 4*sizeof(int);
        } else {
          desc.x = 8*precision;
          desc.y = 8*precision;
          desc.z = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision; // float2 or short2 for 18 reconstruct
          desc.w = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision;
          texel_size = (reconstruct == 18 || reconstruct == 10 ? 2 : 4) * precision;
        }
      }

      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = field;
      resDesc.res.linear.desc = desc;
      resDesc.res.linear.sizeInBytes = (isPhase ? phase_bytes : bytes) / (!full ? 2 : 1);

      if (resDesc.res.linear.sizeInBytes % deviceProp.textureAlignment != 0
          || !is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
      }

      unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
      if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
        errorQuda("Attempting to bind too large a texture %lu > %d", texels, deviceProp.maxTexture1DLinear);
      }

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
      else texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
      checkCudaError();
    }
  }

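  // Destroy the texture objects created above (including the phase textures
  // when the reconstruction carries a separate phase block).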
  void cudaGaugeField::destroyTexObject() {
    if ( isNative() && geometry != QUDA_COARSE_GEOMETRY ) {
      cudaDestroyTextureObject(tex);
      cudaDestroyTextureObject(evenTex);
      cudaDestroyTextureObject(oddTex);
      if (reconstruct == QUDA_RECONSTRUCT_13 || reconstruct == QUDA_RECONSTRUCT_9) {
        cudaDestroyTextureObject(phaseTex);
        cudaDestroyTextureObject(evenPhaseTex);
        cudaDestroyTextureObject(oddPhaseTex);
      }
      checkCudaError();
    }
  }
#endif

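  // Destructor: releases the texture objects and communication handles, frees
  // the device (or mapped host) storage unless this is a reference field, and
  // frees any ghost buffers allocated for non-native orders.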
  cudaGaugeField::~cudaGaugeField()
  {
#ifdef USE_TEXTURE_OBJECTS
    destroyTexObject();
#endif

    destroyComms();

    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      switch(mem_type) {
      case QUDA_MEMORY_DEVICE:
        pool_device_free(gauge);
        break;
      case QUDA_MEMORY_MAPPED:
        if (gauge_h) host_free(gauge_h);
        break;
      default:
        errorQuda("Unsupported memory type %d", mem_type);
      }
    }

    if ( !isNative() ) {
      for (int i=0; i<nDim; i++) {
        if (ghost[i]) pool_device_free(ghost[i]);
        if (ghost[i+4] && geometry == QUDA_COARSE_GEOMETRY) pool_device_free(ghost[i+4]);
      }
    }

  }

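  // Communication pattern used by exchangeGhost and injectGhost below: prepost
  // the receives, stage device-to-host copies when neither peer-to-peer nor
  // GPU Direct RDMA is available, start the sends, wait for completion, and
  // finally copy the received faces back to the device and repack them.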
  // This does the exchange of the forwards boundary gauge field ghost zone and places
  // it into the ghost array of the next node
  void cudaGaugeField::exchangeGhost(QudaLinkDirection link_direction) {

    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
    if (nFace == 0) errorQuda("nFace = 0");

    const int dir = 1; // sending forwards only
    const int R[] = {nFace, nFace, nFace, nFace};
    const bool no_comms_fill = true; // dslash kernels presently require this
    const bool bidir = false; // communication is only ever done in one direction at once
    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge

    // loop over backwards and forwards links
    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
    for (int link_dir = 0; link_dir<2; link_dir++) {
      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;

      void *send_d[2*QUDA_MAX_DIM] = { };
      void *recv_d[2*QUDA_MAX_DIM] = { };

      size_t offset = 0;
      for (int d=0; d<nDim; d++) {
        recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; if (bidir) offset += ghost_face_bytes[d];
        send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
      }

      extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers

      // issue receive preposts and host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        recvStart(dim, dir); // prepost the receive
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
          cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
        }
      }

      // if gdr enabled then synchronize
      if (comm_gdr_enabled()) qudaDeviceSynchronize();

      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) qudaStreamSynchronize(streams[2*dim+dir]);
        sendStart(dim, dir, &streams[2*dim+dir]); // start sending
      }

      // complete communication and issue host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        commsComplete(dim, dir);
        if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
          cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
                          ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
        }
      }

      // fill in the halos for non-partitioned dimensions
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim) && no_comms_fill) {
          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        }
      }

      if (isNative()) {
        copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
      } else {
        // copy from receive buffer into ghost array
        for (int dim=0; dim<nDim; dim++)
          qudaMemcpy(ghost[dim+link_dir*nDim], recv_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      bufferIndex = 1-bufferIndex;
    } // link_dir

  }

  // This does the opposite of exchangeGhost and sends back the ghost
  // zone to the node from which it came and injects it back into the
  // field
  void cudaGaugeField::injectGhost(QudaLinkDirection link_direction) {

    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
    if (nFace == 0) errorQuda("nFace = 0");

    const int dir = 0; // sending backwards only
    const int R[] = {nFace, nFace, nFace, nFace};
    const bool no_comms_fill = false; // injection never does no_comms_fill
    const bool bidir = false; // communication is only ever done in one direction at once
    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge

    // loop over backwards and forwards links (forwards links never sent but leave here just in case)
    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
    for (int link_dir = 0; link_dir<2; link_dir++) {
      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;

      void *send_d[2*QUDA_MAX_DIM] = { };
      void *recv_d[2*QUDA_MAX_DIM] = { };

      size_t offset = 0;
      for (int d=0; d<nDim; d++) {
        // send backwards is first half of each ghost_send_buffer
        send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; if (bidir) offset += ghost_face_bytes[d];
        // receive from forwards is the second half of each ghost_recv_buffer
        recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
      }

      if (isNative()) { // copy from padded region in gauge field into send buffer
        copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
      } else { // copy from receive buffer into ghost array
        for (int dim=0; dim<nDim; dim++) qudaMemcpy(send_d[dim], ghost[dim+link_dir*nDim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      // issue receive preposts and host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        recvStart(dim, dir); // prepost the receive
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
          cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                          ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
        }
      }

      // if gdr enabled then synchronize
      if (comm_gdr_enabled()) qudaDeviceSynchronize();

      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) qudaStreamSynchronize(streams[2*dim+dir]);
        sendStart(dim, dir, &streams[2*dim+dir]); // start sending
      }

      // complete communication and issue host-to-device copies if needed
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim)) continue;
        commsComplete(dim, dir);
        if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
          cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
                          ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
        }
      }

      // fill in the halos for non-partitioned dimensions
      for (int dim=0; dim<nDim; dim++) {
        if (!comm_dim_partitioned(dim) && no_comms_fill) {
          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        }
      }

      // get the links into contiguous buffers
      extractGaugeGhost(*this, recv_d, false, link_dir*nDim);

      bufferIndex = 1-bufferIndex;
    } // link_dir

  }

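  // Allocate the ghost buffers used for the exchanges above, for a halo of
  // depth R[dim] in each dimension.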
  void cudaGaugeField::allocateGhostBuffer(const int *R, bool no_comms_fill, bool bidir) const
  {
    createGhostZone(R, no_comms_fill, bidir);
    LatticeField::allocateGhostBuffer(ghost_bytes);
  }

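  // Create the communication handlers and buffers, (re)creating them if the
  // static ghost buffers were reset or resized since the last call.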
  void cudaGaugeField::createComms(const int *R, bool no_comms_fill, bool bidir)
  {
    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated

    // ascertain if this instance needs its comms buffers to be updated
    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
      ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)

    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill, bidir);

    if (ghost_field_reset) destroyIPCComms();
    createIPCComms();
  }

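  // Start the receive communicators for face dim in direction dir, choosing the
  // peer-to-peer, GPU Direct RDMA, or host message handle as appropriate.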
  void cudaGaugeField::recvStart(int dim, int dir)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (dir==0) { // sending backwards
      // receive from the processor in the +1 direction
      if (comm_peer2peer_enabled(1,dim)) {
        comm_start(mh_recv_p2p_fwd[bufferIndex][dim]);
      } else if (comm_gdr_enabled()) {
        comm_start(mh_recv_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_start(mh_recv_fwd[bufferIndex][dim]);
      }
    } else { //sending forwards
      // receive from the processor in the -1 direction
      if (comm_peer2peer_enabled(0,dim)) {
        comm_start(mh_recv_p2p_back[bufferIndex][dim]);
      } else if (comm_gdr_enabled()) {
        comm_start(mh_recv_rdma_back[bufferIndex][dim]);
      } else {
        comm_start(mh_recv_back[bufferIndex][dim]);
      }
    }
  }

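  // Start the sending communicators for face dim in direction dir.  For
  // peer-to-peer transfers the face is copied directly into the neighbour's
  // receive buffer and an event is recorded so the neighbour can synchronize.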
  void cudaGaugeField::sendStart(int dim, int dir, cudaStream_t* stream_p)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (!comm_peer2peer_enabled(dir,dim)) {
      if (dir == 0)
        if (comm_gdr_enabled()) {
          comm_start(mh_send_rdma_back[bufferIndex][dim]);
        } else {
          comm_start(mh_send_back[bufferIndex][dim]);
        }
      else
        if (comm_gdr_enabled()) {
          comm_start(mh_send_rdma_fwd[bufferIndex][dim]);
        } else {
          comm_start(mh_send_fwd[bufferIndex][dim]);
        }
    } else { // doing peer-to-peer

      void* ghost_dst = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir])
        + precision*ghostOffset[dim][(dir+1)%2];

      cudaMemcpyAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir],
                      ghost_face_bytes[dim], cudaMemcpyDeviceToDevice,
                      stream_p ? *stream_p : 0);

      if (dir == 0) {
        // record the event
        qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], stream_p ? *stream_p : 0);
        // send to the processor in the -1 direction
        comm_start(mh_send_p2p_back[bufferIndex][dim]);
      } else {
        qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], stream_p ? *stream_p : 0);
        // send to the processor in the +1 direction
        comm_start(mh_send_p2p_fwd[bufferIndex][dim]);
      }
    }
  }

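  // Wait for the receive and the matching send in (dim, dir) to complete; for
  // peer-to-peer transfers this also synchronizes on the associated copy events.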
  void cudaGaugeField::commsComplete(int dim, int dir)
  {
    if (!comm_dim_partitioned(dim)) return;

    if (dir==0) {
      if (comm_peer2peer_enabled(1,dim)) {
        comm_wait(mh_recv_p2p_fwd[bufferIndex][dim]);
        qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][1][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_recv_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_wait(mh_recv_fwd[bufferIndex][dim]);
      }

      if (comm_peer2peer_enabled(0,dim)) {
        comm_wait(mh_send_p2p_back[bufferIndex][dim]);
        qudaEventSynchronize(ipcCopyEvent[bufferIndex][0][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_send_rdma_back[bufferIndex][dim]);
      } else {
        comm_wait(mh_send_back[bufferIndex][dim]);
      }
    } else {
      if (comm_peer2peer_enabled(0,dim)) {
        comm_wait(mh_recv_p2p_back[bufferIndex][dim]);
        qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][0][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_recv_rdma_back[bufferIndex][dim]);
      } else {
        comm_wait(mh_recv_back[bufferIndex][dim]);
      }

      if (comm_peer2peer_enabled(1,dim)) {
        comm_wait(mh_send_p2p_fwd[bufferIndex][dim]);
        qudaEventSynchronize(ipcCopyEvent[bufferIndex][1][dim]);
      } else if (comm_gdr_enabled()) {
        comm_wait(mh_send_rdma_fwd[bufferIndex][dim]);
      } else {
        comm_wait(mh_send_fwd[bufferIndex][dim]);
      }
    }
  }

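  // Populate the border (halo) region of a field that has been created with an
  // extended radius R: each face of width R[dim] is packed, exchanged with the
  // neighbouring node (or copied locally when a dimension is not partitioned
  // and no_comms_fill is set), and injected back into the extended field.
  //
  // Illustrative usage sketch (the `extended` field name is hypothetical):
  //
  //   int R[4] = {2, 2, 2, 2};                 // border depth per dimension
  //   extended.exchangeExtendedGhost(R, true);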
  void cudaGaugeField::exchangeExtendedGhost(const int *R, bool no_comms_fill)
  {
    const int b = bufferIndex;
    void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];

    createComms(R, no_comms_fill);

    size_t offset = 0;
    for (int dim=0; dim<nDim; dim++) {
      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
      send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
      recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;

      // silence cuda-memcheck initcheck errors that arise since we
      // have an oversized ghost buffer when doing the extended exchange
      cudaMemsetAsync(send_d[dim], 0, 2*ghost_face_bytes[dim]);
      offset += 2*ghost_face_bytes[dim]; // factor of two from fwd/back
    }

    for (int dim=0; dim<nDim; dim++) {
      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;

      //extract into a contiguous buffer
      extractExtendedGaugeGhost(*this, dim, R, send_d, true);

      if (comm_dim_partitioned(dim)) {
        for (int dir=0; dir<2; dir++) recvStart(dim, dir);

        for (int dir=0; dir<2; dir++) {
          // issue host-to-device copies if needed
          if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
            cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
                            ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[dir]);
          }
        }

        // if either direction is not peer-to-peer then we need to synchronize
        if (!comm_peer2peer_enabled(0,dim) || !comm_peer2peer_enabled(1,dim)) qudaDeviceSynchronize();

        // if we pass a stream to sendStart then we must ensure that stream is synchronized
        for (int dir=0; dir<2; dir++) sendStart(dim, dir, &streams[dir]);
        for (int dir=0; dir<2; dir++) commsComplete(dim, dir);

        for (int dir=0; dir<2; dir++) {
          // issue host-to-device copies if needed
          if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
            cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
                            ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[dir]);
          }
        }

      } else { // if just doing a local exchange to fill halo then need to swap faces
        qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0],
                   ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
        qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1],
                   ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
      }

      // inject back into the gauge field
      extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
    }

    bufferIndex = 1-bufferIndex;
    qudaDeviceSynchronize();
  }

  void cudaGaugeField::exchangeExtendedGhost(const int *R, TimeProfile &profile, bool no_comms_fill) {
    profile.TPSTART(QUDA_PROFILE_COMMS);
    exchangeExtendedGhost(R, no_comms_fill);
    profile.TPSTOP(QUDA_PROFILE_COMMS);
  }

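  // Set the gauge pointer directly; only permitted for reference fields, which
  // do not own their storage.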
  void cudaGaugeField::setGauge(void *gauge_)
  {
    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      errorQuda("Setting gauge pointer is only allowed when create="
                "QUDA_REFERENCE_FIELD_CREATE type\n");
    }
    gauge = gauge_;
  }

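  // Helpers used by copy()/loadCPUField()/saveCPUField(): allocate and free
  // temporary device buffers (and per-dimension ghost buffers) laid out to
  // match the source field's order and geometry.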
  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
    if (order == QUDA_QDP_GAUGE_ORDER) {
      void **buffer = new void*[geometry];
      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
      return ((void*)buffer);
    } else {
      return pool_device_malloc(bytes);
    }

  }

  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {

    if (order > 4) {
      void **buffer = new void*[geometry];
      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
      return buffer;
    } else {
      return 0;
    }

  }

  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
    if (order == QUDA_QDP_GAUGE_ORDER) {
      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
      delete []((void**)buffer);
    } else {
      pool_device_free(buffer);
    }
  }

  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
    if (order > 4) {
      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
      delete []buffer;
    }
  }

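  // Copy the contents of src (device- or host-resident) into this field,
  // converting order, precision and reconstruction as required.  The reorder
  // can run on either the host or the device, as selected by reorder_location().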
  void cudaGaugeField::copy(const GaugeField &src) {
    if (this == &src) return;

    checkField(src);

    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
      fat_link_max = src.LinkMax();
      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION)
        errorQuda("fat_link_max has not been computed");
    } else {
      fat_link_max = 1.0;
    }

    if (typeid(src) == typeid(cudaGaugeField)) {

      if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
        // copy field and ghost zone into this field
        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);

        if (geometry == QUDA_COARSE_GEOMETRY)
          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
      } else {
        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
        if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
      }

    } else if (typeid(src) == typeid(cpuGaugeField)) {
      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
        void *buffer = pool_pinned_malloc(bytes);

        if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
          // copy field and ghost zone into buffer
          copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);

          if (geometry == QUDA_COARSE_GEOMETRY)
            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField &>(src).gauge,
                             0, 0, 3);
        } else {
          copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
        }

        // this copies over both even and odd
        qudaMemcpy(gauge, buffer, bytes, cudaMemcpyHostToDevice);
        pool_pinned_free(buffer);
      } else { // else on the GPU

        if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
            src.Order() == QUDA_BQCD_GAUGE_ORDER ||
            src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
          // special case where we use zero-copy memory to read/write directly from application's array
          void *src_d;
          cudaError_t error = cudaHostGetDevicePointer(&src_d, const_cast<void*>(src.Gauge_p()), 0);
          if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");

          if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d);
          } else {
            errorQuda("Ghost copy not supported here");
          }

        } else {
          void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
          size_t ghost_bytes[8];
          int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
          for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
          void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;

          if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
            for (int d=0; d<geometry; d++) {
              qudaMemcpy(((void**)buffer)[d], ((void**)src.Gauge_p())[d], src.Bytes()/geometry, cudaMemcpyHostToDevice);
            }
          } else {
            qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), cudaMemcpyHostToDevice);
          }

          if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
              src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
            for (int d=0; d<geometry; d++)
              qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], cudaMemcpyHostToDevice);

          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
            if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
          } else {
            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer);
            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
          }
          free_gauge_buffer(buffer, src.Order(), src.Geometry());
          if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
        }
      } // reorder_location
    } else {
      errorQuda("Invalid gauge field type");
    }

    // if we have copied from a source without a pad then we need to exchange
    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO)
      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);

    staggeredPhaseApplied = src.StaggeredPhaseApplied();
    staggeredPhaseType = src.StaggeredPhase();

    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
    checkCudaError();
  }

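  // Download into this field from a CPU field.  A minimal usage sketch
  // (illustrative only; `gauge` and `cpu` are hypothetical, pre-constructed
  // compatible fields):
  //
  //   gauge.loadCPUField(cpu);   // host -> device, reordering as needed
  //   // ... run device kernels on gauge ...
  //   gauge.saveCPUField(cpu);   // device -> host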
  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) {
    copy(cpu);
    qudaDeviceSynchronize();
    checkCudaError();
  }

  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) {
    profile.TPSTART(QUDA_PROFILE_H2D);
    loadCPUField(cpu);
    profile.TPSTOP(QUDA_PROFILE_H2D);
  }

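  // Upload from this field into a CPU field, either reordering directly on the
  // device into a staging buffer (or via zero-copy for MILC site / BQCD / TIFR
  // padded orders), or copying first and reordering on the host.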
  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const
  {
    static_cast<LatticeField&>(cpu).checkField(*this);

    if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) {

      if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
          cpu.Order() == QUDA_BQCD_GAUGE_ORDER ||
          cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
        // special case where we use zero-copy memory to read/write directly from application's array
        void *cpu_d;
        cudaError_t error = cudaHostGetDevicePointer(&cpu_d, const_cast<void*>(cpu.Gauge_p()), 0);
        if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");
        if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
        } else {
          errorQuda("Ghost copy not supported here");
        }
      } else {
        void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());

        // Allocate space for ghost zone if required
        size_t ghost_bytes[8];
        int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
        for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
        void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;

        if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
          if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
        } else {
          copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
        }

        if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
          for (int d=0; d<geometry; d++) qudaMemcpy(((void**)cpu.gauge)[d], ((void**)buffer)[d], cpu.Bytes()/geometry, cudaMemcpyDeviceToHost);
        } else {
          qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), cudaMemcpyDeviceToHost);
        }

        if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
            cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
          for (int d=0; d<geometry; d++)
            qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], cudaMemcpyDeviceToHost);

        free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
        if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
      }
    } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder

      void *buffer = pool_pinned_malloc(bytes);
      qudaMemcpy(buffer, gauge, bytes, cudaMemcpyDeviceToHost);

      if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
        copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
      } else {
        copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
      }
      pool_pinned_free(buffer);

    } else {
      errorQuda("Invalid pack location %d", reorder_location());
    }

    cpu.staggeredPhaseApplied = staggeredPhaseApplied;
    cpu.staggeredPhaseType = staggeredPhaseType;

    qudaDeviceSynchronize();
    checkCudaError();
  }

  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const {
    profile.TPSTART(QUDA_PROFILE_D2H);
    saveCPUField(cpu);
    profile.TPSTOP(QUDA_PROFILE_D2H);
  }

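  // Back up the gauge field to CPU memory (undone by restore()).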
  void cudaGaugeField::backup() const {
    if (backed_up) errorQuda("Gauge field already backed up");
    backup_h = new char[bytes];
    cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
    checkCudaError();
    backed_up = true;
  }

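  // Restore the gauge field from the host backup taken by backup().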
  void cudaGaugeField::restore() const
  {
    if (!backed_up) errorQuda("Cannot restore since not backed up");
    cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
    delete []backup_h;
    checkCudaError();
    backed_up = false;
  }

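  // Set the gauge field elements to zero.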
  void cudaGaugeField::zero() {
    cudaMemset(gauge, 0, bytes);
  }


} // namespace quda