QUDA  0.9.0
cuda_gauge_field.cu
Go to the documentation of this file.
1 #include <string.h>
2 #include <gauge_field.h>
3 #include <typeinfo>
4 #include <blas_quda.h>
5 
6 namespace quda {
7 
// cudaGaugeField constructor: initializes base-class state, validates the
// requested ordering/padding, sets up device storage and per-dimension ghost
// buffers, and (optionally) creates texture objects over the allocation.
// NOTE(review): this doxygen listing elides several source lines (the
// constructor signature, parts of the order/create checks, and the allocation
// branch) — comments below describe only the visible code.
9  GaugeField(param), gauge(0), even(0), odd(0)
10  {
13  errorQuda("QDP ordering only supported for reference fields");
14  }
15 
16  if (order == QUDA_QDP_GAUGE_ORDER ||
19  errorQuda("Field ordering %d presently disabled for this type", order);
20 
21 #ifdef MULTI_GPU
// Native multi-GPU fields must carry enough pad to hold the ghost faces of
// every dimension; coarse geometry doubles the requirement (fwd + back links).
24  isNative()) {
25  bool pad_check = true;
26  for (int i=0; i<nDim; i++) {
27  // when we have coarse links we need to double the pad since we're storing forwards and backwards links
28  int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
29  if (pad < minimum_pad) pad_check = false;
30  if (!pad_check)
31  errorQuda("cudaGaugeField being constructed with insufficient padding (%d < %d)\n", pad, minimum_pad);
32  }
33  }
34 #endif
35 
39  errorQuda("ERROR: create type(%d) not supported yet\n", create);
40  }
41 
// Zero-initialize freshly allocated storage when requested; a reference-create
// field instead adopts the caller-supplied pointer without allocating.
44  if (create == QUDA_ZERO_FIELD_CREATE) cudaMemset(gauge, 0, bytes);
45  } else {
46  gauge = param.gauge;
47  }
48 
// Non-native layouts keep ghost zones in separate pooled device buffers, one
// per dimension, plus a second set (slots i+4) for coarse-geometry links.
49  if ( !isNative() ) {
50  for (int i=0; i<nDim; i++) {
51  size_t nbytes = nFace * surface[i] * nInternal * precision;
52  ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
53  ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
54  }
55  }
56 
59  }
60 
// Even/odd parity halves live in the two halves of the single allocation.
61  even = gauge;
62  odd = (char*)gauge + bytes/2;
63 
64 #ifdef USE_TEXTURE_OBJECTS
// Bind texture objects over the full field and each parity half; phase data
// (at phase_offset into each region) gets its own set of textures.
65  createTexObject(tex, gauge, true);
66  createTexObject(evenTex, even, false);
67  createTexObject(oddTex, odd, false);
69  { // Create texture objects for the phases
70  bool isPhase = true;
71  createTexObject(phaseTex, (char*)gauge + phase_offset, true, isPhase);
72  createTexObject(evenPhaseTex, (char*)even + phase_offset, false, isPhase);
73  createTexObject(oddPhaseTex, (char*)odd + phase_offset, false, isPhase);
74  }
75 #endif
76 
77  }
78 
79 #ifdef USE_TEXTURE_OBJECTS
// Create a 1D linear texture object over `field` (the whole field when `full`,
// otherwise one parity half; `isPhase` selects the phase region). The channel
// descriptor is chosen from precision/reconstruction so each fetch returns one
// texel of the stored element type.
// NOTE(review): lines 91 and 104 are elided in this listing (likely the
// precision branches opening these blocks) — confirm against the repository.
80  void cudaGaugeField::createTexObject(cudaTextureObject_t &tex, void *field, bool full, bool isPhase) {
81 
82  if( isNative() ){
83  // create the texture for the field components
84  cudaChannelFormatDesc desc;
85  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
86  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
87  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
88 
89  int texel_size = 1;
90  if (isPhase) {
92  desc.x = 8*sizeof(int);
93  desc.y = 8*sizeof(int);
94  desc.z = 0;
95  desc.w = 0;
96  texel_size = 2*sizeof(int);
97  } else {
98  desc.x = 8*precision;
99  desc.y = desc.z = desc.w = 0;
100  texel_size = precision;
101  }
102  } else {
103  // always four components regardless of precision
105  desc.x = 8*sizeof(int);
106  desc.y = 8*sizeof(int);
107  desc.z = 8*sizeof(int);
108  desc.w = 8*sizeof(int);
109  texel_size = 4*sizeof(int);
110  } else {
111  desc.x = 8*precision;
112  desc.y = 8*precision;
113  desc.z = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision; // float2 or short2 for 18 reconstruct
114  desc.w = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision;
115  texel_size = (reconstruct == 18 || reconstruct == 10 ? 2 : 4) * precision;
116  }
117  }
118 
119  cudaResourceDesc resDesc;
120  memset(&resDesc, 0, sizeof(resDesc));
121  resDesc.resType = cudaResourceTypeLinear;
122  resDesc.res.linear.devPtr = field;
123  resDesc.res.linear.desc = desc;
// Half fields bind half the bytes; phase textures cover only the phase region.
124  resDesc.res.linear.sizeInBytes = isPhase ? phase_bytes/(!full ? 2 : 1) : (bytes-phase_bytes)/(!full ? 2 : 1);
125 
// Guard against exceeding the device's 1D linear texture limit.
126  unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
127  if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
128  errorQuda("Attempting to bind too large a texture %lu > %d", texels, deviceProp.maxTexture1DLinear);
129  }
130 
131  cudaTextureDesc texDesc;
132  memset(&texDesc, 0, sizeof(texDesc));
// Half precision is stored as short and read back as normalized float.
133  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
134  else texDesc.readMode = cudaReadModeElementType;
135 
136  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
137  checkCudaError();
138  }
139  }
140 
// Destroy the texture objects created in the constructor (native fields only).
// NOTE(review): line 146 is elided here — presumably the conditional guarding
// the phase-texture destruction; confirm against the repository.
141  void cudaGaugeField::destroyTexObject() {
142  if( isNative() ){
143  cudaDestroyTextureObject(tex);
144  cudaDestroyTextureObject(evenTex);
145  cudaDestroyTextureObject(oddTex);
147  cudaDestroyTextureObject(phaseTex);
148  cudaDestroyTextureObject(evenPhaseTex);
149  cudaDestroyTextureObject(oddPhaseTex);
150  }
151  checkCudaError();
152  }
153  }
154 #endif
155 
// Destructor body (signature elided by this listing): tears down texture
// objects and comms state, then releases the per-dimension ghost buffers that
// the constructor allocated for non-native layouts.
// NOTE(review): lines 163-165 and 171 are elided — likely the freeing of the
// main gauge allocation and the coarse ghost slots; confirm upstream.
157  {
158 #ifdef USE_TEXTURE_OBJECTS
159  destroyTexObject();
160 #endif
161 
162  destroyComms();
163 
166  }
167 
// Free the separate ghost buffers only when this field owns them (non-native).
168  if ( !isNative() ) {
169  for (int i=0; i<nDim; i++) {
170  if (ghost[i]) pool_device_free(ghost[i]);
172  }
173  }
174 
175  }
176 
177  // This does the exchange of the forwards boundary gauge field ghost zone and places
178  // it into the ghost array of the next node
// (Signature elided by this listing — per the doxygen index this is
// exchangeGhost(QudaLinkDirection link_direction).)
// Preconditions: padded ghost exchange, vector or coarse geometry, and forward
// link exchange only on coarse geometry.
180 
181  if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
182  if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
183  if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
184  errorQuda("Cannot request exchange of forward links on non-coarse geometry");
185  if (nFace == 0) errorQuda("nFace = 0");
186 
187  const int dir = 1; // sending forwards only
188  const int R[] = {nFace, nFace, nFace, nFace};
189  const bool no_comms_fill = true; // dslash kernels presently require this
190  createComms(R, true); // always need to allocate space for non-partitioned dimension for copyGenericGauge
191 
192  // loop over backwards and forwards links
194  for (int link_dir = 0; link_dir<2; link_dir++) {
195  if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
196 
197  void *send_d[2*QUDA_MAX_DIM] = { };
198  void *recv_d[2*QUDA_MAX_DIM] = { };
199 
// Carve per-dimension send/recv staging regions out of the shared buffers.
200  size_t offset = 0;
201  for (int d=0; d<nDim; d++) {
202  // receive from backwards is first half of each ghost_recv_buffer
203  recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
204  // send forwards is second half of each ghost_send_buffer
205  send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
206  }
207 
208  extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers
209 
210  // issue receive preposts and host-to-device copies if needed
211  for (int dim=0; dim<nDim; dim++) {
212  if (!comm_dim_partitioned(dim)) continue;
213  recvStart(dim, dir); // prepost the receive
// Without peer-to-peer or GDR the face must be staged through host memory.
214  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
215  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
216  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
217  }
218  }
219 
220  // if gdr enabled then synchronize
// NOTE(review): the synchronization calls themselves (lines 221, 226, 255,
// 258) are elided from this listing; only the comments remain.
222 
223  // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
224  for (int dim=0; dim<nDim; dim++) {
225  if (!comm_dim_partitioned(dim)) continue;
227  sendStart(dim, dir, &streams[2*dim+dir]); // start sending
228  }
229 
230  // complete communication and issue host-to-device copies if needed
231  for (int dim=0; dim<nDim; dim++) {
232  if (!comm_dim_partitioned(dim)) continue;
233  commsComplete(dim, dir);
234  if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
235  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
236  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
237  }
238  }
239 
240  // fill in the halos for non-partitioned dimensions
241  for (int dim=0; dim<nDim; dim++) {
242  if (!comm_dim_partitioned(dim) && no_comms_fill) {
243  qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
244  }
245  }
246 
// Native fields scatter the received faces into the pad region; other layouts
// copy into the dedicated per-dimension ghost arrays.
247  if (isNative()) {
248  copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
249  } else {
250  // copy from receive buffer into ghost array
251  for (int dim=0; dim<nDim; dim++)
252  qudaMemcpy(ghost[dim+link_dir*nDim], recv_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
253  }
254 
256  } // link_dir
257 
259  }
260 
261  // This does the opposite of exchangeGhost and sends back the ghost
262  // zone to the node from which it came and injects it back into the
263  // field
// (Signature elided by this listing — per the doxygen index this is
// injectGhost(QudaLinkDirection link_direction); only backwards injection is
// supported, enforced below.)
265 
266  if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
267  if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
268  if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
269  if (nFace == 0) errorQuda("nFace = 0");
270 
271  const int dir = 0; // sending backwards only
272  const int R[] = {nFace, nFace, nFace, nFace};
273  const bool no_comms_fill = false; // injection never does no_comms_fill
274  createComms(R, true); // always need to allocate space for non-partitioned dimension for copyGenericGauge
275 
276  // loop over backwards and forwards links (forwards links never sent but leave here just in case)
278  for (int link_dir = 0; link_dir<2; link_dir++) {
279  if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
280 
281  void *send_d[2*QUDA_MAX_DIM] = { };
282  void *recv_d[2*QUDA_MAX_DIM] = { };
283 
// Note the buffer halves are mirrored relative to exchangeGhost: send uses the
// first half, receive the second.
284  size_t offset = 0;
285  for (int d=0; d<nDim; d++) {
286  // send backwards is first half of each ghost_send_buffer
287  send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
288  // receive from forwards is the second half of each ghost_recv_buffer
289  recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
290  }
291 
292  if (isNative()) { // copy from padded region in gauge field into send buffer
293  copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
294  } else { // copy from receive buffer into ghost array
295  for (int dim=0; dim<nDim; dim++) qudaMemcpy(send_d[dim], ghost[dim+link_dir*nDim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
296  }
297 
298  // issue receive preposts and host-to-device copies if needed
299  for (int dim=0; dim<nDim; dim++) {
300  if (!comm_dim_partitioned(dim)) continue;
301  recvStart(dim, dir); // prepost the receive
// Without peer-to-peer or GDR the face must be staged through host memory.
302  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
303  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
304  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
305  }
306  }
307 
308  // if gdr enabled then synchronize
// NOTE(review): the synchronization calls (lines 309, 314, 338, 341) are
// elided from this listing; only the comments remain.
310 
311  // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
312  for (int dim=0; dim<nDim; dim++) {
313  if (!comm_dim_partitioned(dim)) continue;
315  sendStart(dim, dir, &streams[2*dim+dir]); // start sending
316  }
317 
318  // complete communication and issue host-to-device copies if needed
319  for (int dim=0; dim<nDim; dim++) {
320  if (!comm_dim_partitioned(dim)) continue;
321  commsComplete(dim, dir);
322  if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
323  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
324  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
325  }
326  }
327 
328  // fill in the halos for non-partitioned dimensions
// (Dead in practice here since no_comms_fill is hard-wired false above.)
329  for (int dim=0; dim<nDim; dim++) {
330  if (!comm_dim_partitioned(dim) && no_comms_fill) {
331  qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
332  }
333  }
334 
335  // get the links into contiguous buffers
336  extractGaugeGhost(*this, recv_d, false, link_dir*nDim);
337 
339  } // link_dir
340 
342  }
343 
// Size and allocate the static ghost buffers for halo radius R.
// NOTE(review): line 347 is elided in this listing — presumably the call into
// the base-class allocateGhostBuffer; confirm against the repository.
344  void cudaGaugeField::allocateGhostBuffer(const int *R, bool no_comms_fill) const
345  {
346  createGhostZone(R, no_comms_fill);
348  }
349 
// Create (or refresh) the communication handlers and buffers for halo radius R.
// Re-creates the base-class comms only when not yet initialized or when the
// shared ghost/pinned buffers have been reallocated since last use.
350  void cudaGaugeField::createComms(const int *R, bool no_comms_fill)
351  {
352  allocateGhostBuffer(R, no_comms_fill); // allocate the ghost buffer if not yet allocated
353 
354  // ascertain if this instance needs its comms buffers to be updated
355  bool comms_reset = ghost_field_reset || // FIXME add send buffer check
356  (my_face_h[0] != ghost_pinned_buffer_h[0]) || (my_face_h[1] != ghost_pinned_buffer_h[1]); // pinned buffers
357 
358  if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill);

// Always (re)establish the IPC peer-to-peer channels.
361  createIPCComms();
362  }
363 
// Prepost the receive for one face: dir==0 receives from the +1 neighbor
// (we are sending backwards), dir==1 receives from the -1 neighbor. The
// message handle used depends on the transport (peer-to-peer, GDR, or host).
// NOTE(review): the comm_start(...) calls inside each branch (lines 371, 373,
// 375, 380, 382, 384) are elided from this listing.
364  void cudaGaugeField::recvStart(int dim, int dir)
365  {
366  if (!comm_dim_partitioned(dim)) return;
367 
368  if (dir==0) { // sending backwards
369  // receive from the processor in the +1 direction
370  if (comm_peer2peer_enabled(1,dim)) {
372  } else if (comm_gdr_enabled()) {
374  } else {
376  }
377  } else { //sending forwards
378  // receive from the processor in the -1 direction
379  if (comm_peer2peer_enabled(0,dim)) {
381  } else if (comm_gdr_enabled()) {
383  } else {
385  }
386  }
387  }
388 
// Start sending one face in direction dir. Non-peer-to-peer transports start
// an MPI/GDR send; peer-to-peer copies directly into the neighbor's remote
// ghost buffer on stream_p (default stream when null) and records an IPC event
// that the neighbor waits on.
// NOTE(review): the comm_start(...) calls (lines 396, 398, 402, 404, 419, 423)
// are elided from this listing.
389  void cudaGaugeField::sendStart(int dim, int dir, cudaStream_t* stream_p)
390  {
391  if (!comm_dim_partitioned(dim)) return;
392 
393  if (!comm_peer2peer_enabled(dir,dim)) {
394  if (dir == 0)
395  if (comm_gdr_enabled()) {
397  } else {
399  }
400  else
401  if (comm_gdr_enabled()) {
403  } else {
405  }
406  } else { // doing peer-to-peer
407 
// Destination is the neighbor's receive slot for the opposite direction.
408  void* ghost_dst = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir])
409  + precision*ghostOffset[dim][(dir+1)%2];
410 
411  cudaMemcpyAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir],
412  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice,
413  stream_p ? *stream_p : 0);
414 
415  if (dir == 0) {
416  // record the event
417  qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], stream_p ? *stream_p : 0);
418  // send to the processor in the -1 direction
420  } else {
421  qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], stream_p ? *stream_p : 0);
422  // send to the processor in the +1 direction
424  }
425  }
426  }
427 
// Wait for both the receive and the send of one face exchange to complete
// (signature elided by this listing — per the doxygen index this is
// commsComplete(int dim, int dir)). Each branch waits on the handle matching
// the transport that recvStart/sendStart selected.
// NOTE(review): the comm_wait(...) / event-synchronize calls (lines 434-435,
// 437, 439, 443-444, 446, 448, 452-453, 455, 457, 461-462, 464, 466) are
// elided from this listing.
429  {
430  if (!comm_dim_partitioned(dim)) return;
431 
432  if (dir==0) {
// Sent backwards: receive came from +1, send went to -1.
433  if (comm_peer2peer_enabled(1,dim)) {
436  } else if (comm_gdr_enabled()) {
438  } else {
440  }
441 
442  if (comm_peer2peer_enabled(0,dim)) {
445  } else if (comm_gdr_enabled()) {
447  } else {
449  }
450  } else {
// Sent forwards: receive came from -1, send went to +1.
451  if (comm_peer2peer_enabled(0,dim)) {
454  } else if (comm_gdr_enabled()) {
456  } else {
458  }
459 
460  if (comm_peer2peer_enabled(1,dim)) {
463  } else if (comm_gdr_enabled()) {
465  } else {
467  }
468  }
469  }
470 
// Populate the border/halo region of an extended gauge field: for each
// dimension that is partitioned (or locally filled when no_comms_fill), the
// interior face of width R[dim] is extracted, exchanged in both directions,
// and injected back into the border.
// NOTE(review): several lines are elided in this listing (the synchronization
// at 504, the local-swap qudaMemcpy sources at 519/521, and the closing
// statements 529-530); comments describe only the visible code.
471  void cudaGaugeField::exchangeExtendedGhost(const int *R, bool no_comms_fill)
472  {
473  const int b = bufferIndex;
474  void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
475 
476  createComms(R, no_comms_fill);
477 
// Lay out per-dimension staging regions; each dimension holds both directions.
478  size_t offset = 0;
479  for (int dim=0; dim<nDim; dim++) {
480  if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
481  send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
482  recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;
483  offset += 2*ghost_face_bytes[dim]; // factor of two from fwd/back
484  }
485 
486  for (int dim=0; dim<nDim; dim++) {
487  if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
488 
489  //extract into a contiguous buffer
490  extractExtendedGaugeGhost(*this, dim, R, send_d, true);
491 
492  if (comm_dim_partitioned(dim)) {
493  for (int dir=0; dir<2; dir++) recvStart(dim, dir);
494 
495  for (int dir=0; dir<2; dir++) {
496  // issue host-to-device copies if needed
497  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
498  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
499  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[dir]);
500  }
501  }
502 
503  // if either direction is not peer-to-peer then we need to synchronize
505 
506  // if we pass a stream to sendStart then we must ensure that stream is synchronized
507  for (int dir=0; dir<2; dir++) sendStart(dim, dir, &streams[dir]);
508  for (int dir=0; dir<2; dir++) commsComplete(dim, dir);
509 
510  for (int dir=0; dir<2; dir++) {
511  // issue host-to-device copies if needed
512  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
513  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
514  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[dir]);
515  }
516  }
517 
518  } else { // if just doing a local exchange to fill halo then need to swap faces
520  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
522  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
523  }
524 
525  // inject back into the gauge field
526  extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
527  }
528 
531  }
532 
533  void cudaGaugeField::exchangeExtendedGhost(const int *R, TimeProfile &profile, bool no_comms_fill) {
534  profile.TPSTART(QUDA_PROFILE_COMMS);
535  exchangeExtendedGhost(R, no_comms_fill);
536  profile.TPSTOP(QUDA_PROFILE_COMMS);
537  }
538 
// Replace the device gauge pointer. Only legal for reference-create fields
// (this object must not own its storage).
// NOTE(review): line 541 — presumably the `if (create != QUDA_REFERENCE_FIELD_CREATE) {`
// guard — is elided from this listing; the error message below implies it.
539  void cudaGaugeField::setGauge(void *gauge_)
540  {
542  errorQuda("Setting gauge pointer is only allowed when create="
543  "QUDA_REFERENCE_FIELD_CREATE type\n");
544  }
545  gauge = gauge_;
546  }
547 
// Allocate a device staging buffer for a gauge field (signature elided by this
// listing — per the doxygen index: create_gauge_buffer(size_t bytes,
// QudaGaugeFieldOrder order, QudaFieldGeometry geometry)). QDP ordering gets
// one allocation per geometry component behind a heap-allocated pointer array;
// everything else is a single contiguous allocation. Pair with
// free_gauge_buffer below.
549  if (order == QUDA_QDP_GAUGE_ORDER) {
550  void **buffer = new void*[geometry];
551  for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
552  return ((void*)buffer);
553  } else {
554  return pool_device_malloc(bytes);
555  }
556 
557  }
558 
// Allocate per-dimension device ghost staging buffers (signature elided by
// this listing — per the doxygen index: create_ghost_buffer(size_t bytes[],
// QudaGaugeFieldOrder order, QudaFieldGeometry geometry)). Only orderings with
// enum value > 4 use separate ghost buffers; others get a null result.
// Pair with free_ghost_buffer below.
560 
561  if (order > 4) {
562  void **buffer = new void*[geometry];
563  for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
564  return buffer;
565  } else {
566  return 0;
567  }
568 
569  }
570 
571  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
572  if (order == QUDA_QDP_GAUGE_ORDER) {
573  for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
574  delete []((void**)buffer);
575  } else {
576  pool_device_free(buffer);
577  }
578  }
579 
580  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
581  if (order > 4) {
582  for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
583  delete []buffer;
584  }
585  }
586 
// Copy another gauge field (GPU- or CPU-resident) into this one, reordering
// into this field's layout (signature elided by this listing — per the
// doxygen index this is copy(const GaugeField &src)).
// NOTE(review): several lines are elided (the fat-link guard at 592, the
// coarse/extended branch headers at 602/606/617/621/640/665/669, and the
// ghost-exchange call near 681-682); comments describe only the visible code.
588  if (this == &src) return;
589 
590  checkField(src);
591 
// Fat links carry a normalization maximum needed for half precision.
593  fat_link_max = src.LinkMax();
594  if (precision == QUDA_HALF_PRECISION && fat_link_max == 0.0)
595  errorQuda("fat_link_max has not been computed");
596  } else {
597  fat_link_max = 1.0;
598  }
599 
// Device-to-device copy path.
600  if (typeid(src) == typeid(cudaGaugeField)) {
601 
603  // copy field and ghost zone into this field
604  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
605 
// type=3 copies the coarse-geometry (forwards) ghost links as well.
607  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
608  } else {
609  copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
610  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
611  }
612 
// Host-to-device copy path: reorder on whichever side reorder_location() says.
613  } else if (typeid(src) == typeid(cpuGaugeField)) {
614  if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
615  void *buffer = pool_pinned_malloc(bytes);
616 
618  // copy field and ghost zone into buffer
619  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
620 
622  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge, 0, 0, 3);
623  } else {
624  copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
625  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
626  }
627 
628  // this copies over both even and odd
629  qudaMemcpy(gauge, buffer, bytes, cudaMemcpyHostToDevice);
630  pool_pinned_free(buffer);
631  } else { // else on the GPU
632 
633  if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || src.Order() == QUDA_BQCD_GAUGE_ORDER) {
634  // special case where we use zero-copy memory to read/write directly from application's array
// Requires the application array to be host-registered for zero-copy access.
635  void *src_d;
636  cudaError_t error = cudaHostGetDevicePointer(&src_d, const_cast<void*>(src.Gauge_p()), 0);
637  if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");
638 
639  if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
641  } else {
642  errorQuda("Ghost copy not supported here");
643  }
644 
645  } else {
// General path: stage the host field (and its ghosts) into device buffers,
// then reorder on the GPU.
646  void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
647  size_t ghost_bytes[8];
648  int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
649  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
650  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
651 
652  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
653  for (int d=0; d<geometry; d++) {
654  qudaMemcpy(((void**)buffer)[d], ((void**)src.Gauge_p())[d], src.Bytes()/geometry, cudaMemcpyHostToDevice);
655  }
656  } else {
657  qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), cudaMemcpyHostToDevice);
658  }
659 
660  if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
661  src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
662  for (int d=0; d<geometry; d++)
663  qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], cudaMemcpyHostToDevice);
664 
666  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
667  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
668  } else {
670  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
671  }
672  free_gauge_buffer(buffer, src.Order(), src.Geometry());
673  if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
674  }
675  } // reorder_location
676  } else {
677  errorQuda("Invalid gauge field type");
678  }
679 
680  // if we have copied from a source without a pad then we need to exchange
683 
// Propagate staggered-phase metadata from the source.
684  staggeredPhaseApplied = src.StaggeredPhaseApplied();
685  staggeredPhaseType = src.StaggeredPhase();
686 
687  checkCudaError();
688  }
689 
// Download a CPU gauge field into this device field (signature elided by this
// listing — per the doxygen index: loadCPUField(const cpuGaugeField &cpu)).
// Delegates the layout conversion and transfer to copy().
691  copy(cpu);
693  checkCudaError();
694  }
695 
// Profiled variant of loadCPUField (signature elided by this listing): times
// the host-to-device transfer under the H2D profile bucket.
697  profile.TPSTART(QUDA_PROFILE_H2D);
698  loadCPUField(cpu);
699  profile.TPSTOP(QUDA_PROFILE_H2D);
700  }
701 
// Upload this device field into a CPU gauge field, reordering into the CPU
// field's layout (signature elided by this listing — per the doxygen index:
// saveCPUField(cpuGaugeField &cpu) const).
// NOTE(review): several lines are elided (the reorder-location branch header
// at 706/727, the ghost condition continuation at 741, the extended branch at
// 753, and the trailing sync near 764-767); comments describe visible code only.
703  {
704  static_cast<LatticeField&>(cpu).checkField(*this);
705 
707 
708  if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER || cpu.Order() == QUDA_BQCD_GAUGE_ORDER) {
709  // special case where we use zero-copy memory to read/write directly from application's array
710  void *cpu_d;
711  cudaError_t error = cudaHostGetDevicePointer(&cpu_d, const_cast<void*>(cpu.Gauge_p()), 0);
712  if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");
713  if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
714  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
715  } else {
716  errorQuda("Ghost copy not supported here");
717  }
718  } else {
// General path: reorder on the GPU into staging buffers, then copy to host.
719  void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());
720 
721  // Allocate space for ghost zone if required
722  size_t ghost_bytes[8];
723  int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
724  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
725  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;
726 
728  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
729  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
730  } else {
731  copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
732  }
733 
734  if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
735  for (int d=0; d<geometry; d++) qudaMemcpy(((void**)cpu.gauge)[d], ((void**)buffer)[d], cpu.Bytes()/geometry, cudaMemcpyDeviceToHost);
736  } else {
737  qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), cudaMemcpyDeviceToHost);
738  }
739 
740  if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
742  for (int d=0; d<geometry; d++)
743  qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], cudaMemcpyDeviceToHost);
744 
745  free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
746  if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
747  }
748  } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
749 
750  void *buffer = pool_pinned_malloc(bytes);
751  qudaMemcpy(buffer, gauge, bytes, cudaMemcpyDeviceToHost);
752 
754  copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
755  } else {
756  copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
757  }
758  pool_pinned_free(buffer);
759 
760  } else {
761  errorQuda("Invalid pack location %d", reorder_location());
762  }
763 
766 
768  checkCudaError();
769  }
770 
// Profiled variant of saveCPUField (signature elided by this listing): times
// the device-to-host transfer under the D2H profile bucket.
772  profile.TPSTART(QUDA_PROFILE_D2H);
773  saveCPUField(cpu);
774  profile.TPSTOP(QUDA_PROFILE_D2H);
775  }
776 
777  void cudaGaugeField::backup() const {
778  if (backed_up) errorQuda("Gauge field already backed up");
779  backup_h = new char[bytes];
780  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
781  checkCudaError();
782  backed_up = true;
783  }
784 
// Restore the gauge field from the host snapshot taken by backup() and
// release the snapshot (signature elided by this listing — per the doxygen
// index this is restore()).
786  if (!backed_up) errorQuda("Cannot restore since not backed up");
787  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
788  delete []backup_h;
789  checkCudaError();
790  backed_up = false;
791  }
792 
// Zero the entire device gauge allocation (signature elided by this listing).
794  cudaMemset(gauge, 0, bytes);
795  }
796 
797 
798 } // namespace quda
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the env...
void extractGaugeGhost(const GaugeField &u, void **ghost, bool extract=true, int offset=0)
void allocateGhostBuffer(size_t ghost_bytes) const
Allocate the static ghost buffers.
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:116
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
void * my_face_dim_dir_d[2][QUDA_MAX_DIM][2]
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void copyGenericGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0, void **ghostOut=0, void **ghostIn=0, int type=0)
Definition: copy_gauge.cu:38
cudaDeviceProp deviceProp
void createComms(bool no_comms_fill=false)
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
enum QudaLinkDirection_s QudaLinkDirection
cudaStream_t * streams
void injectGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
The opposite of exchangeGhost: take the ghost zone on x, send to node x-1, and inject back into the f...
void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
QudaReconstructType reconstruct
Definition: gauge_field.h:135
bool staggeredPhaseApplied
Definition: gauge_field.h:161
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static int R[4]
static MsgHandle * mh_send_p2p_back[2][QUDA_MAX_DIM]
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
void restore()
Restores the cudaGaugeField to CUDA memory.
MsgHandle * mh_send_rdma_fwd[2][QUDA_MAX_DIM]
QudaStaggeredPhase staggeredPhaseType
Definition: gauge_field.h:156
cudaGaugeField(const GaugeFieldParam &)
size_t size_t offset
QudaFieldGeometry geometry
Definition: gauge_field.h:133
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
static void * ghost_pinned_buffer_h[2]
const int * R() const
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
void checkField(const LatticeField &) const
static bool ghost_field_reset
void commsComplete(int dim, int dir)
Wait for communication to complete.
void allocateGhostBuffer(const int *R, bool no_comms_fill) const
Allocate the ghost buffers.
static int bufferIndex
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
size_t Bytes() const
Definition: gauge_field.h:242
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
void sendStart(int dim, int dir, cudaStream_t *stream_p=nullptr)
Start the sending communicators.
void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R, void **ghost, bool extract)
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:260
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
QudaGhostExchange ghostExchange
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
void * create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
const void ** Ghost() const
Definition: gauge_field.h:254
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void backup() const
Backs up the cudaGaugeField to CPU memory.
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
size_t ghost_face_bytes[QUDA_MAX_DIM]
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static void destroyIPCComms()
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
bool comm_peer2peer_enabled(int dir, int dim)
static void * ghost_send_buffer_d[2]
void * memset(void *__b, int __c, size_t __len)
int surface[QUDA_MAX_DIM]
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:115
void recvStart(int dim, int dir)
Start the receive communicators.
void createGhostZone(const int *R, bool no_comms_fill) const
Definition: gauge_field.cpp:90
int ghostOffset[QUDA_MAX_DIM][2]
void ** create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
QudaLinkType link_type
Definition: gauge_field.h:139
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
void setGauge(void *_gauge)
static __inline__ dim3 dim3 void size_t cudaStream_t int enum cudaTextureReadMode readMode static __inline__ const struct texture< T, dim, readMode > & tex
void createComms(const int *R, bool no_comms_fill)
Create the communication handlers and buffers.
QudaFieldCreate create
Definition: gauge_field.h:147
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
int surfaceCB[QUDA_MAX_DIM]
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
enum QudaFieldGeometry_s QudaFieldGeometry
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
const struct cudaChannelFormatDesc * desc
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:266
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
QudaGaugeFieldOrder order
Definition: gauge_field.h:137
QudaGhostExchange GhostExchange() const
void copy(const GaugeField &src)
static __inline__ size_t size_t d
QudaPrecision Precision() const
bool isNative() const
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
void * ghost[2 *QUDA_MAX_DIM]
Definition: gauge_field.h:149
QudaPrecision precision
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)