QUDA  v1.1.0
A library for QCD on GPUs
cuda_color_spinor_field.cpp
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <typeinfo>
4 #include <string.h>
5 #include <iostream>
6 #include <limits>
7 
8 #include <color_spinor_field.h>
9 #include <blas_quda.h>
10 #include <dslash_quda.h>
11 
12 static bool zeroCopy = false;
13 
14 namespace quda {
15 
16  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorParam &param) :
17  ColorSpinorField(param),
18  alloc(false),
19  init(true)
20  {
21  // this must come before create
22  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
23  v = param.v;
24  norm = param.norm;
25  }
26 
27  create(param.create);
28 
29  switch (param.create) {
30  case QUDA_NULL_FIELD_CREATE:
31  case QUDA_REFERENCE_FIELD_CREATE: break; // do nothing;
32  case QUDA_ZERO_FIELD_CREATE: zero(); break;
33  case QUDA_COPY_FIELD_CREATE: errorQuda("Copy field create not implemented for this constructor");
34  default: errorQuda("Unexpected create type %d", param.create);
35  }
36  }
37 
38  cudaColorSpinorField::cudaColorSpinorField(const cudaColorSpinorField &src) :
39  ColorSpinorField(src),
40  alloc(false),
41  init(true)
42  {
43  create(QUDA_COPY_FIELD_CREATE);
44  copySpinorField(src);
45  }
46 
47  // creates a copy of src, any differences defined in param
48  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src, const ColorSpinorParam &param) :
49  ColorSpinorField(src),
50  alloc(false),
51  init(true)
52  {
53  // can only override if we are not using a reference or parity special case
54  if (param.create != QUDA_REFERENCE_FIELD_CREATE ||
55  (param.create == QUDA_REFERENCE_FIELD_CREATE &&
56  src.SiteSubset() == QUDA_FULL_SITE_SUBSET &&
57  param.siteSubset == QUDA_PARITY_SITE_SUBSET &&
58  typeid(src) == typeid(cudaColorSpinorField) ) ||
59  (param.create == QUDA_REFERENCE_FIELD_CREATE && (param.is_composite || param.is_component))) {
60  reset(param);
61  } else {
62  errorQuda("Undefined behaviour"); // else silent bug possible?
63  }
64 
65  // This must be set before create is called
66  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
67  if (typeid(src) == typeid(cudaColorSpinorField)) {
68  v = (void*)src.V();
69  norm = (void*)src.Norm();
70  } else {
71  errorQuda("Cannot reference a non-cuda field");
72  }
73 
74  if (composite_descr.is_component && !(src.SiteSubset() == QUDA_FULL_SITE_SUBSET && this->SiteSubset() == QUDA_PARITY_SITE_SUBSET))
75  {//setup eigenvector from the set
76  v = (void*)((char*)v + composite_descr.id*bytes);
77  norm = (void*)((char*)norm + composite_descr.id*norm_bytes);
78  }
79  }
80 
81  create(param.create);
82 
83  if (param.create == QUDA_NULL_FIELD_CREATE) {
84  // do nothing
85  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
86  zero();
87  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
88  copySpinorField(src);
89  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
90  // do nothing
91  } else {
92  errorQuda("CreateType %d not implemented", param.create);
93  }
94 
95  }
96 
97  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src) :
98  ColorSpinorField(src),
99  alloc(false),
100  init(true)
101  {
102  create(QUDA_COPY_FIELD_CREATE);
103  copySpinorField(src);
104  }
105 
106  ColorSpinorField& cudaColorSpinorField::operator=(const ColorSpinorField &src) {
107  if (typeid(src) == typeid(cudaColorSpinorField)) {
108  *this = (dynamic_cast<const cudaColorSpinorField&>(src));
109  } else if (typeid(src) == typeid(cpuColorSpinorField)) {
110  *this = (dynamic_cast<const cpuColorSpinorField&>(src));
111  } else {
112  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
113  }
114  return *this;
115  }
116 
117  cudaColorSpinorField& cudaColorSpinorField::operator=(const cudaColorSpinorField &src) {
118  if (&src != this) {
119  // keep current attributes unless unset
120  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
121  destroy();
122  destroyComms(); // not sure if this is necessary
123  ColorSpinorField::operator=(src);
124  create(QUDA_COPY_FIELD_CREATE);
125  }
126  copySpinorField(src);
127  }
128  return *this;
129  }
130 
131  cudaColorSpinorField& cudaColorSpinorField::operator=(const cpuColorSpinorField &src) {
132  // keep current attributes unless unset
133  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
134  destroy();
135  ColorSpinorField::operator=(src);
136  create(QUDA_COPY_FIELD_CREATE);
137  }
138  loadSpinorField(src);
139  return *this;
140  }
141 
142  cudaColorSpinorField::~cudaColorSpinorField() {
143  destroyComms();
144  destroy();
145  }
146 
147  void cudaColorSpinorField::create(const QudaFieldCreate create) {
148 
150  errorQuda("Subset not implemented");
151  }
152 
153  if (create != QUDA_REFERENCE_FIELD_CREATE) {
154  switch(mem_type) {
155  case QUDA_MEMORY_DEVICE:
156  v = device_malloc(bytes);
157  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) norm = device_malloc(norm_bytes);
158  break;
159  case QUDA_MEMORY_MAPPED:
160  v_h = mapped_malloc(bytes);
161  v = get_mapped_device_pointer(v_h); // set the matching device pointer
162  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) {
163  norm_h = mapped_malloc(norm_bytes);
164  norm = get_mapped_device_pointer(norm_h); // set the matching device pointer
165  }
166  break;
167  default:
168  errorQuda("Unsupported memory type %d", mem_type);
169  }
170  alloc = true;
171  }
172 
173  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
174  if (composite_descr.is_composite && (create != QUDA_REFERENCE_FIELD_CREATE)) {
175  if(composite_descr.dim <= 0) errorQuda("\nComposite size is not defined\n");
176 
177  ColorSpinorParam param;
178  param.siteSubset = QUDA_FULL_SITE_SUBSET;
179  param.nDim = nDim;
180  memcpy(param.x, x, nDim*sizeof(int));
181  param.create = QUDA_REFERENCE_FIELD_CREATE;
182  param.v = v;
183  param.norm = norm;
184  param.is_composite = false;
185  param.composite_dim = 0;
186  param.is_component = true;
187  param.mem_type = mem_type;
188 
189  components.reserve(composite_descr.dim);
190  for(int cid = 0; cid < composite_descr.dim; cid++) {
191  param.component_id = cid;
192  components.push_back(new cudaColorSpinorField(*this, param));
193  }
194  } else {
195  // create the associated even and odd subsets
196  ColorSpinorParam param;
197  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
198  param.nDim = nDim;
199  memcpy(param.x, x, nDim*sizeof(int));
200  param.x[0] /= 2; // set single parity dimensions
201  param.create = QUDA_REFERENCE_FIELD_CREATE;
202  param.v = v;
203  param.norm = norm;
204  param.is_composite = false;
205  param.composite_dim = 0;
206  param.is_component = composite_descr.is_component;
207  param.component_id = composite_descr.id;
208  param.mem_type = mem_type;
209 
210  even = new cudaColorSpinorField(*this, param);
211  odd = new cudaColorSpinorField(*this, param);
212 
213  // need this hackery for the moment (need to locate the odd pointers half way into the full field)
214  // check for special metadata wrapper (look at reference comments in
215  // createTexObject() below)
216  if (!((uint64_t)v == (uint64_t)(void *)std::numeric_limits<uint64_t>::max()
217  || (precision == QUDA_HALF_PRECISION
218  && (uint64_t)norm == (uint64_t)(void *)std::numeric_limits<uint64_t>::max()))) {
219  (dynamic_cast<cudaColorSpinorField *>(odd))->v = (void *)((char *)v + bytes / 2);
220  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION)
221  (dynamic_cast<cudaColorSpinorField *>(odd))->norm = (void *)((char *)norm + norm_bytes / 2);
222  }
223  }
224  } else { //siteSubset == QUDA_PARITY_SITE_SUBSET
225 
227  if (composite_descr.is_composite && (create != QUDA_REFERENCE_FIELD_CREATE))
228  {
229  if(composite_descr.dim <= 0) errorQuda("\nComposite size is not defined\n");
230  //if(bytes > 1811939328) warningQuda("\nCUDA API probably won't be able to create texture object for the eigenvector set... Object size is : %u bytes\n", bytes);
231  // create the associated even and odd subsets
232  ColorSpinorParam param;
233  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
234  param.nDim = nDim;
235  memcpy(param.x, x, nDim*sizeof(int));
236  param.create = QUDA_REFERENCE_FIELD_CREATE;
237  param.v = v;
238  param.norm = norm;
239  param.is_composite = false;
240  param.composite_dim = 0;
241  param.is_component = true;
242  param.mem_type = mem_type;
243 
244  //reserve eigenvector set
245  components.reserve(composite_descr.dim);
246  //setup volume, [real_]length and stride for a single eigenvector
247  for(int cid = 0; cid < composite_descr.dim; cid++)
248  {
249  param.component_id = cid;
250  components.push_back(new cudaColorSpinorField(*this, param));
251  }
252  }
253  }
254 
255  if (create != QUDA_REFERENCE_FIELD_CREATE) {
256  if (siteSubset != QUDA_FULL_SITE_SUBSET) {
257  zeroPad();
258  } else { //temporary hack for the full spinor field sets, manual zeroPad for each component:
259  for(int cid = 0; cid < composite_descr.dim; cid++) {
260  (dynamic_cast<cudaColorSpinorField&>(components[cid]->Even())).zeroPad();
261  (dynamic_cast<cudaColorSpinorField&>(components[cid]->Odd())).zeroPad();
262  }
263  }
264  }
265  }
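
// Illustrative sketch (not part of the QUDA sources): the even/odd split above simply
// places the odd-parity data half way into the full-field allocation. The helper below
// restates that arithmetic; its name and signature are hypothetical.
[[maybe_unused]] static void *parity_offset_sketch(void *base, size_t full_bytes, int parity)
{
  // parity 0 (even) starts at the base pointer, parity 1 (odd) at base + full_bytes/2
  return static_cast<char *>(base) + parity * (full_bytes / 2);
}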
266 
267  void cudaColorSpinorField::destroy()
268  {
269  if (alloc) {
270  switch(mem_type) {
271  case QUDA_MEMORY_DEVICE:
272  device_free(v);
273  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) device_free(norm);
274  break;
275  case QUDA_MEMORY_MAPPED:
276  host_free(v_h);
277  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) host_free(norm_h);
278  break;
279  default:
280  errorQuda("Unsupported memory type %d", mem_type);
281  }
282  }
283 
284 
285  if (composite_descr.is_composite)
286  {
287  CompositeColorSpinorField::iterator vec;
288  for (vec = components.begin(); vec != components.end(); vec++) delete *vec;
289  }
290 
291  if ((siteSubset == QUDA_FULL_SITE_SUBSET) && !composite_descr.is_component) {
292  delete even;
293  delete odd;
294  }
295  }
296 
297  void cudaColorSpinorField::backup() const {
298  if (backed_up) errorQuda("ColorSpinorField already backed up");
299  backup_h = new char[bytes];
300  qudaMemcpy(backup_h, v, bytes, cudaMemcpyDefault);
301  if (norm_bytes) {
302  backup_norm_h = new char[norm_bytes];
303  qudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDefault);
304  }
305  backed_up = true;
306  }
307 
308  void cudaColorSpinorField::restore() const
309  {
310  if (!backed_up) errorQuda("Cannot restore since not backed up");
311  qudaMemcpy(v, backup_h, bytes, cudaMemcpyDefault);
312  delete []backup_h;
313  if (norm_bytes) {
314  qudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyDefault);
315  delete []backup_norm_h;
316  }
317  backed_up = false;
318  }
319 
320  void cudaColorSpinorField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
321  {
322  // conditionals based on destructor
323  if (is_prefetch_enabled() && alloc && mem_type == QUDA_MEMORY_DEVICE) {
324  qudaMemPrefetchAsync(v, bytes, mem_space, stream);
325  if ((precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) && norm_bytes > 0)
326  qudaMemPrefetchAsync(norm, norm_bytes, mem_space, stream);
327  }
328  }
329 
330  // cuda's floating point format, IEEE-754, represents the floating point
331  // zero as 4 zero bytes
332  void cudaColorSpinorField::zero() {
333  qudaMemsetAsync(v, 0, bytes, 0);
334  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION)
335  qudaMemsetAsync(norm, 0, norm_bytes, 0);
336  }
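
// Illustrative check (not part of the QUDA sources): clearing a field with memset relies
// on IEEE-754, where +0.0f is the all-zero bit pattern. A compile-time statement of that
// assumption could look like this.
static_assert(std::numeric_limits<float>::is_iec559,
              "memset-based zeroing assumes IEEE-754 floating point");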
337 
338  void cudaColorSpinorField::zeroPad() {
339 
340  { // zero initialize the field pads
341  size_t pad_bytes = (stride - volumeCB) * precision * fieldOrder;
342  int Npad = nColor * nSpin * 2 / fieldOrder;
343 
344  if (composite_descr.is_composite && !composite_descr.is_component){//we consider the whole eigenvector set:
345  Npad *= composite_descr.dim;
346  pad_bytes /= composite_descr.dim;
347  }
348 
349  size_t pitch = stride * fieldOrder * precision;
350  char *dst = (char *)v + volumeCB * fieldOrder * precision;
351  if (pad_bytes)
352  for (int subset=0; subset<siteSubset; subset++) {
353  qudaMemset2DAsync(dst + subset * bytes / siteSubset, pitch, 0, pad_bytes, Npad, 0);
354  }
355  }
356 
357  if (norm_bytes > 0) { // zero initialize the norm pad
358  size_t pad_bytes = (stride - volumeCB) * sizeof(float);
359  if (pad_bytes)
360  for (int subset=0; subset<siteSubset; subset++) {
361  qudaMemsetAsync((char *)norm + volumeCB * sizeof(float), 0, (stride - volumeCB) * sizeof(float), 0);
362  }
363  }
364 
365  // zero the region added for alignment reasons
366  if (bytes != (size_t)length*precision) {
367  size_t subset_bytes = bytes/siteSubset;
368  size_t subset_length = length/siteSubset;
369  for (int subset=0; subset < siteSubset; subset++) {
370  qudaMemsetAsync((char *)v + subset_length * precision + subset_bytes * subset, 0,
371  subset_bytes - subset_length * precision, 0);
372  }
373  }
374 
375  // zero the region added for alignment reasons (norm)
376  if (norm_bytes && norm_bytes != siteSubset*stride*sizeof(float)) {
377  size_t subset_bytes = norm_bytes/siteSubset;
378  for (int subset=0; subset < siteSubset; subset++) {
379  qudaMemsetAsync((char *)norm + (size_t)stride * sizeof(float) + subset_bytes * subset, 0,
380  subset_bytes - (size_t)stride * sizeof(float), 0);
381  }
382  }
383  }
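
// Illustrative sketch (not part of the QUDA sources): worked numbers for the pad geometry
// cleared in zeroPad() above. The values are hypothetical (a single-precision field with
// nColor = 3, nSpin = 4, float4 ordering, volumeCB = 32768 and stride = 32832); QUDA
// derives the real values from the field's metadata.
namespace
{
  constexpr size_t ex_stride = 32832, ex_volumeCB = 32768, ex_precision = 4;
  constexpr int ex_fieldOrder = 4, ex_nColor = 3, ex_nSpin = 4;
  constexpr int ex_Npad = ex_nColor * ex_nSpin * 2 / ex_fieldOrder;                         // 6 planes of float4
  constexpr size_t ex_pad_bytes = (ex_stride - ex_volumeCB) * ex_precision * ex_fieldOrder; // 1024 bytes per plane
  // each qudaMemset2DAsync call above clears ex_Npad rows of ex_pad_bytes bytes,
  // with a row pitch of stride * precision * fieldOrder bytes
  static_assert(ex_Npad == 6 && ex_pad_bytes == 1024, "pad geometry of the worked example");
}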
384 
385  void cudaColorSpinorField::copy(const cudaColorSpinorField &src)
386  {
387  checkField(*this, src);
388  blas::copy(*this, src);
389  }
390 
391  void cudaColorSpinorField::copySpinorField(const ColorSpinorField &src)
392  {
393  if (typeid(src) == typeid(cudaColorSpinorField)) { // src is on the device
394  copy(dynamic_cast<const cudaColorSpinorField&>(src));
395  } else if (typeid(src) == typeid(cpuColorSpinorField)) { // src is on the host
396  loadSpinorField(src);
397  } else {
398  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
399  }
400  }
401 
402  void cudaColorSpinorField::loadSpinorField(const ColorSpinorField &src) {
403 
404  if ( reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(src) == typeid(cpuColorSpinorField)) {
405  void *buffer = pool_pinned_malloc(bytes + norm_bytes);
406  memset(buffer, 0, bytes+norm_bytes); // FIXME (temporary?) bug fix for padding
407 
408  copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, 0, static_cast<char*>(buffer)+bytes, 0);
409 
410  qudaMemcpy(v, buffer, bytes, cudaMemcpyDefault);
411  qudaMemcpy(norm, static_cast<char *>(buffer) + bytes, norm_bytes, cudaMemcpyDefault);
412 
413  pool_pinned_free(buffer);
414  } else if (typeid(src) == typeid(cudaColorSpinorField)) {
415  copy(dynamic_cast<const cudaColorSpinorField&>(src));
416  } else {
417 
418  if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
419  // special case where we use mapped memory to read/write directly from application's array
420  void *src_d = get_mapped_device_pointer(src.V());
421  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v, src_d);
422  } else {
423  void *Src=nullptr, *srcNorm=nullptr, *buffer=nullptr;
424  if (!zeroCopy) {
425  buffer = pool_device_malloc(src.Bytes()+src.NormBytes());
426  Src = buffer;
427  srcNorm = static_cast<char*>(Src) + src.Bytes();
428  qudaMemcpy(Src, src.V(), src.Bytes(), cudaMemcpyDefault);
429  qudaMemcpy(srcNorm, src.Norm(), src.NormBytes(), cudaMemcpyDefault);
430  } else {
431  buffer = pool_pinned_malloc(src.Bytes()+src.NormBytes());
432  memcpy(buffer, src.V(), src.Bytes());
433  memcpy(static_cast<char*>(buffer)+src.Bytes(), src.Norm(), src.NormBytes());
434  Src = get_mapped_device_pointer(buffer);
435  srcNorm = static_cast<char*>(Src) + src.Bytes();
436  }
437 
438  qudaMemsetAsync(v, 0, bytes, 0); // FIXME (temporary?) bug fix for padding
439  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, 0, Src, 0, srcNorm);
440 
441  if (zeroCopy) pool_pinned_free(buffer);
442  else pool_device_free(buffer);
443  }
444  }
445 
446  qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
447  }
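
// Illustrative usage (not part of the QUDA sources): loading a host field into a device
// field goes through loadSpinorField() via the assignment operator; the field names and
// parameters below are hypothetical.
//
//   cudaColorSpinorField gpu_field(gpu_param);   // e.g. QUDA_NULL_FIELD_CREATE
//   cpuColorSpinorField  cpu_field(cpu_param);
//   gpu_field = cpu_field;   // reorders on the CPU or the GPU depending on reorder_location()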
448 
449 
450  void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest) const {
451 
452  if ( reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(dest) == typeid(cpuColorSpinorField)) {
453  void *buffer = pool_pinned_malloc(bytes+norm_bytes);
454  qudaMemcpy(buffer, v, bytes, cudaMemcpyDefault);
455  qudaMemcpy(static_cast<char *>(buffer) + bytes, norm, norm_bytes, cudaMemcpyDefault);
456 
457  copyGenericColorSpinor(dest, *this, QUDA_CPU_FIELD_LOCATION, 0, buffer, 0, static_cast<char*>(buffer)+bytes);
458  pool_pinned_free(buffer);
459  } else if (typeid(dest) == typeid(cudaColorSpinorField)) {
460  dynamic_cast<cudaColorSpinorField &>(dest).copy(*this);
461  } else {
462 
463  if (dest.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
464  // special case where we use zero-copy memory to read/write directly from application's array
465  void *dest_d = get_mapped_device_pointer(dest.V());
466  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION, dest_d, v);
467  } else {
468  void *dst = nullptr, *dstNorm = nullptr, *buffer = nullptr;
469  if (!zeroCopy) {
470  buffer = pool_device_malloc(dest.Bytes()+dest.NormBytes());
471  dst = buffer;
472  dstNorm = static_cast<char*>(dst) + dest.Bytes();
473  } else {
474  buffer = pool_pinned_malloc(dest.Bytes()+dest.NormBytes());
475  dst = get_mapped_device_pointer(buffer);
476  dstNorm = static_cast<char*>(dst)+dest.Bytes();
477  }
478 
479  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION, dst, 0, dstNorm, 0);
480 
481  if (!zeroCopy) {
482  qudaMemcpy(dest.V(), dst, dest.Bytes(), cudaMemcpyDefault);
483  qudaMemcpy(dest.Norm(), dstNorm, dest.NormBytes(), cudaMemcpyDefault);
484  } else {
485  qudaDeviceSynchronize(); // make sure the copy to the mapped buffer has completed before the host reads it
486  memcpy(dest.V(), buffer, dest.Bytes());
487  memcpy(dest.Norm(), static_cast<char*>(buffer) + dest.Bytes(), dest.NormBytes());
488  }
489 
490  if (zeroCopy) pool_pinned_free(buffer);
491  else pool_device_free(buffer);
492  }
493  }
494 
495  qudaDeviceSynchronize(); // need to sync before data can be used on CPU
496  }
497 
498  void cudaColorSpinorField::allocateGhostBuffer(int nFace, bool spin_project) const
499  {
500  createGhostZone(nFace, spin_project);
501  LatticeField::allocateGhostBuffer(ghost_bytes);
502  }
503 
504  // pack the ghost zone into a contiguous buffer for communications
505  void cudaColorSpinorField::packGhost(const int nFace, const QudaParity parity, const int dim, const QudaDirection dir,
506  const int dagger, qudaStream_t *stream,
507  MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label,
508  bool spin_project, double a, double b, double c, int shmem)
509  {
510 #ifdef MULTI_GPU
511  void *packBuffer[4 * QUDA_MAX_DIM] = {};
512 
513  for (int dim=0; dim<4; dim++) {
514  for (int dir=0; dir<2; dir++) {
515  switch (location[2 * dim + dir]) {
516 
517  case Device: // pack to local device buffer
518  packBuffer[2 * dim + dir] = my_face_dim_dir_d[bufferIndex][dim][dir];
519  packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir] = nullptr;
520  break;
521  case Shmem:
522  // this is the remote buffer when using shmem ...
523  // if the ghost_remote_send_buffer_d exists we can directly use it
524  // - else we need pack locally and send data to the recv buffer
525  packBuffer[2 * dim + dir] = ghost_remote_send_buffer_d[bufferIndex][dim][dir] != nullptr ?
526  static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][1 - dir] :
527  my_face_dim_dir_d[bufferIndex][dim][dir];
528  packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir] = ghost_remote_send_buffer_d[bufferIndex][dim][dir] != nullptr ?
529  nullptr :
530  static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + ghost_offset[dim][1 - dir];
531  break;
532  case Host: // pack to zero-copy memory
533  packBuffer[2*dim+dir] = my_face_dim_dir_hd[bufferIndex][dim][dir];
534  break;
535  case Remote: // pack to remote peer memory
536  packBuffer[2 * dim + dir]
537  = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][1 - dir];
538  break;
539  default: errorQuda("Undefined location %d", location[2*dim+dir]);
540  }
541  }
542  }
543  PackGhost(packBuffer, *this, location_label, nFace, dagger, parity, spin_project, a, b, c, shmem, *stream);
544 #else
545  errorQuda("packGhost not built on single-GPU build");
546 #endif
547  }
548 
549  // send the ghost zone to the host
550  void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int nFace, const int dim, const QudaDirection dir,
551  const int dagger, qudaStream_t *stream)
552  {
553 #ifdef MULTI_GPU
554  if (precision != ghost_precision) { pushKernelPackT(true); }
555 
556  if (dim !=3 || getKernelPackT()) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
557 
558  void* gpu_buf = (dir == QUDA_BACKWARDS) ? my_face_dim_dir_d[bufferIndex][dim][0] : my_face_dim_dir_d[bufferIndex][dim][1];
559  qudaMemcpyAsync(ghost_spinor, gpu_buf, ghost_face_bytes[dim], cudaMemcpyDeviceToHost, *stream);
560 
561  } else {
562 
563  const int Nvec = (nSpin == 1 || ghost_precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
564  const int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
565  const int Npad = Nint / Nvec; // number Nvec buffers we have
566  const int nParity = siteSubset;
567  const int x4 = nDim==5 ? x[4] : 1;
568  const int Nt_minus1_offset = (volumeCB - nFace * ghostFaceCB[3]) / x4; // N_t-1 = Vh-Vsh
569 
570  int offset = 0;
571  if (nSpin == 1) {
572  offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
573  } else if (nSpin == 4) {
574  // !dagger: send lower components backwards, send upper components forwards
575  // dagger: send upper components backwards, send lower components forwards
576  bool upper = dagger ? true : false; // Fwd is !Back
577  if (dir == QUDA_FORWARDS) upper = !upper;
578  int lower_spin_offset = Npad*stride;
579  if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
580  else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
581  }
582 
583  size_t len = nFace * (ghostFaceCB[3] / x4) * Nvec * ghost_precision;
584  size_t dpitch = x4*len;
585  size_t spitch = stride*Nvec*ghost_precision;
586 
587  // QUDA Memcpy NPad's worth.
588  // -- Dest will point to the right beginning PAD.
589  // -- Each Pad has size Nvec*Vsh Floats.
590  // -- There is Nvec*Stride Floats from the start of one PAD to the start of the next
591 
592  for (int parity = 0; parity < nParity; parity++) {
593  for (int s = 0; s < x4; s++) { // loop over multiple 4-d volumes (if they exist)
594  void *dst = (char *)ghost_spinor + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
595  void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
596  qudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
597 
598  // we can probably issue this as a single cudaMemcpy2d along the fifth dimension
599  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) {
600  size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
601  int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset * sizeof(float);
602  void *dst = (char *)ghost_spinor + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len
603  + parity * nFace * ghostFaceCB[3] * sizeof(float);
604  void *src = (char *)norm + norm_offset + s * (volumeCB / x4) * sizeof(float) + parity * norm_bytes / 2;
605  qudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
606  }
607  } // fifth dimension
608  } // parity
609  }
610 
611  if (precision != ghost_precision) { popKernelPackT(); }
612 
613 #else
614  errorQuda("sendGhost not built on single-GPU build");
615 #endif
616  }
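
// Illustrative sketch (not part of the QUDA sources): worked numbers for the strided copy
// in sendGhost() above, assuming a hypothetical single-precision, spin-projected Wilson
// field (nSpin = 4, nColor = 3, ghost_precision = 4 bytes, nFace = 1, x4 = 1,
// ghostFaceCB[3] = 2048):
//
//   Nvec   = 4                          // reals per float4 word
//   Nint   = 3 * 4 * 2 / 2 = 12         // spin-projected reals per site
//   Npad   = Nint / Nvec  = 3           // float4 planes to gather
//   len    = 1 * 2048 * 4 * 4 = 32768   // bytes per plane
//   spitch = stride * 4 * 4             // bytes between planes in the padded field
//   dpitch = len                        // planes land contiguously in the host buffer
//
// so each qudaMemcpy2DAsync call gathers Npad rows of len bytes into one contiguous
// ghost buffer for the t face.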
617 
618  void cudaColorSpinorField::unpackGhost(const void *ghost_spinor, const int nFace, const int dim,
619  const QudaDirection dir, const int dagger, qudaStream_t *stream)
620  {
621  const void *src = ghost_spinor;
622  auto offset = (dir == QUDA_BACKWARDS) ? ghost_offset[dim][0] : ghost_offset[dim][1];
623  void *ghost_dst = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
624 
625  qudaMemcpyAsync(ghost_dst, src, ghost_face_bytes[dim], cudaMemcpyHostToDevice, *stream);
626  }
627 
628  // pack the ghost zone into a contiguous buffer for communications
629  void cudaColorSpinorField::packGhostExtended(const int nFace, const int R[], const QudaParity parity, const int dim,
630  const QudaDirection dir, const int dagger, qudaStream_t *stream,
631  bool zero_copy)
632  {
633  errorQuda("not implemented");
634  }
635 
636  // copy data from host buffer into boundary region of device field
637  void cudaColorSpinorField::unpackGhostExtended(const void *ghost_spinor, const int nFace, const QudaParity parity,
638  const int dim, const QudaDirection dir, const int dagger,
639  qudaStream_t *stream, bool zero_copy)
640  {
641  errorQuda("not implemented");
642  }
643 
645 
646  void cudaColorSpinorField::createComms(int nFace, bool spin_project) {
647 
648  allocateGhostBuffer(nFace,spin_project); // allocate the ghost buffer if not yet allocated
649 
650  // ascertain if this instance needs its comms buffers to be updated
651  bool comms_reset = ghost_field_reset || // FIXME add send buffer check
652  (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1]) // pinned send buffers
653  || (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1]) // pinned receive buffers
654  || (my_face_d[0] != ghost_send_buffer_d[0]) || (my_face_d[1] != ghost_send_buffer_d[1]) || // send buffers
655  (from_face_d[0] != ghost_recv_buffer_d[0]) || (from_face_d[1] != ghost_recv_buffer_d[1]) || // receive buffers
656  ghost_precision_reset; // ghost_precision has changed
657 
658  if (!initComms || comms_reset) {
659 
660  LatticeField::createComms();
661 
662  // reinitialize the ghost receive pointers
663  for (int i=0; i<nDimComms; ++i) {
664  if (commDimPartitioned(i)) {
665  for (int b=0; b<2; b++) {
666  ghost[b][i] = static_cast<char *>(ghost_recv_buffer_d[b]) + ghost_offset[i][0];
668  ghostNorm[b][i] = static_cast<char *>(ghost[b][i])
669  + nFace * surface[i] * (nSpin / (spin_project ? 2 : 1)) * nColor * 2 * ghost_precision;
670  }
671  }
672  }
673 
674  ghost_precision_reset = false;
675  }
676 
677  if (ghost_field_reset) destroyIPCComms();
678  createIPCComms();
679  }
680 
681  void cudaColorSpinorField::streamInit(qudaStream_t *stream_p) { stream = stream_p; }
682 
683  void cudaColorSpinorField::pack(int nFace, int parity, int dagger, int stream_idx,
684  MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label,
685  bool spin_project, double a, double b, double c, int shmem)
686  {
687  createComms(nFace, spin_project); // must call this first
688 
689  const int dim=-1; // pack all partitioned dimensions
690 
691  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[stream_idx], location, location_label,
692  spin_project, a, b, c, shmem);
693  }
694 
695  void cudaColorSpinorField::packExtended(const int nFace, const int R[], const int parity, const int dagger,
696  const int dim, qudaStream_t *stream_p, const bool zero_copy)
697  {
698  createComms(nFace); // must call this first
699 
700  stream = stream_p;
701 
702  packGhostExtended(nFace, R, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[zero_copy ? 0 : (Nstream-1)], zero_copy);
703  }
704 
705  void cudaColorSpinorField::gather(int nFace, int dagger, int dir, qudaStream_t *stream_p)
706  {
707  int dim = dir/2;
708 
709  // If stream_p != 0, use pack_stream, else use the stream array
710  qudaStream_t *pack_stream = (stream_p) ? stream_p : stream + dir;
711 
712  if (dir%2 == 0) {
713  // backwards copy to host
714  if (comm_peer2peer_enabled(0,dim)) return;
715 
716  sendGhost(my_face_dim_dir_h[bufferIndex][dim][0], nFace, dim, QUDA_BACKWARDS, dagger, pack_stream);
717  } else {
718  // forwards copy to host
719  if (comm_peer2peer_enabled(1,dim)) return;
720 
721  sendGhost(my_face_dim_dir_h[bufferIndex][dim][1], nFace, dim, QUDA_FORWARDS, dagger, pack_stream);
722  }
723  }
724 
725  void cudaColorSpinorField::recvStart(int nFace, int d, int dagger, qudaStream_t *stream_p, bool gdr)
726  {
727 
728  // note this is scatter centric, so dir=0 (1) is send backwards
729  // (forwards) and receive from forwards (backwards)
730 
731  int dim = d/2;
732  int dir = d%2;
733  if (!commDimPartitioned(dim)) return;
734  if (gdr && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
735 
736  if (dir == 0) { // receive from forwards
737  // receive from the processor in the +1 direction
738  if (comm_peer2peer_enabled(1,dim)) {
739  comm_start(mh_recv_p2p_fwd[bufferIndex][dim]);
740  } else if (gdr) {
741  comm_start(mh_recv_rdma_fwd[bufferIndex][dim]);
742  } else {
743  comm_start(mh_recv_fwd[bufferIndex][dim]);
744  }
745  } else { // receive from backwards
746  // receive from the processor in the -1 direction
747  if (comm_peer2peer_enabled(0,dim)) {
748  comm_start(mh_recv_p2p_back[bufferIndex][dim]);
749  } else if (gdr) {
750  comm_start(mh_recv_rdma_back[bufferIndex][dim]);
751  } else {
752  comm_start(mh_recv_back[bufferIndex][dim]);
753  }
754  }
755  }
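
// Illustrative note (not part of the QUDA sources): the scatter-centric convention used
// by recvStart/sendStart/commsQuery/commsWait. For a packed index d in [0, 2*nDimComms):
//
//   dim = d / 2;   // communication dimension
//   dir = d % 2;   // 0: send backwards, receive from forwards
//                  // 1: send forwards,  receive from backwards
//
// e.g. d = 7 addresses dimension 3 (t), posting a receive from the backwards neighbour
// while the matching send goes forwards.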
756 
757  void cudaColorSpinorField::sendStart(int nFace, int d, int dagger, qudaStream_t *stream_p, bool gdr, bool remote_write)
758  {
759  // note this is scatter centric, so dir=0 (1) is send backwards
760  // (forwards) and receive from forwards (backwards)
761 
762  int dim = d/2;
763  int dir = d%2;
764  if (!commDimPartitioned(dim)) return;
765  if (gdr && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
766 
767  int Nvec = (nSpin == 1 || ghost_precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
768  int Nint = (nColor * nSpin * 2)/(nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
769  int Npad = Nint/Nvec;
770 
771  if (!comm_peer2peer_enabled(dir,dim)) {
772  if (dir == 0)
773  if (gdr) comm_start(mh_send_rdma_back[bufferIndex][dim]);
774  else comm_start(mh_send_back[bufferIndex][dim]);
775  else
776  if (gdr) comm_start(mh_send_rdma_fwd[bufferIndex][dim]);
777  else comm_start(mh_send_fwd[bufferIndex][dim]);
778  } else { // doing peer-to-peer
779  qudaStream_t *copy_stream = (stream_p) ? stream_p : stream + d;
780 
781  // if not using copy engine then the packing kernel will remotely write the halos
782  if (!remote_write) {
783  // all goes here
784  void *ghost_dst
785  = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
786 
788 
789  if (dim != 3 || getKernelPackT()) {
790 
791  void *ghost_dst
792  = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
793  cudaMemcpyAsync(ghost_dst,
794  my_face_dim_dir_d[bufferIndex][dim][dir],
795  ghost_face_bytes[dim],
796  cudaMemcpyDeviceToDevice,
797  *copy_stream); // copy to forward processor
798 
799  } else {
800 
801  const int nParity = siteSubset;
802  const int x4 = nDim==5 ? x[4] : 1;
803  const int Nt_minus_offset = (volumeCB - nFace * ghostFaceCB[3]) / x4;
804 
805  int offset = 0;
806  if (nSpin == 1) {
807  offset = (dir == 0) ? 0 : Nt_minus_offset;
808  } else if (nSpin == 4) {
809  // !dagger: send lower components backwards, send upper components forwards
810  // dagger: send upper components backwards, send lower components forwards
811  bool upper = dagger ? true : false;
812  if (dir == 1) upper = !upper;
813  int lower_spin_offset = Npad*stride;
814  if (upper)
815  offset = (dir == 0 ? 0 : Nt_minus_offset);
816  else
817  offset = lower_spin_offset + (dir == 0 ? 0 : Nt_minus_offset);
818  }
819 
820  size_t len = nFace * (ghostFaceCB[3] / x4) * Nvec * ghost_precision;
821  size_t dpitch = x4*len;
822  size_t spitch = stride*Nvec*ghost_precision;
823 
824  for (int parity = 0; parity < nParity; parity++) {
825  for (int s = 0; s < x4; s++) {
826  void *dst = (char *)ghost_dst + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
827  void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
828  // start the copy
829  cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToDevice, *copy_stream);
830 
831  // we can probably issue this as a single cudaMemcpy2d along the fifth dimension
832  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) {
833  size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
834  int norm_offset = (dir == 0) ? 0 : Nt_minus_offset * sizeof(float);
835  void *dst = (char *)ghost_dst + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len
836  + parity * nFace * ghostFaceCB[3] * sizeof(float);
837  void *src = (char *)norm + norm_offset + s * (volumeCB / x4) * sizeof(float) + parity * norm_bytes / 2;
838  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToDevice, *copy_stream);
839  }
840  }
841  } // fifth dimension
842  } // parity
843  } // remote_write
844 
846 
847  if (dir == 0) {
848  // record the event
849  qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], *copy_stream);
850  // send to the processor in the -1 direction
851  comm_start(mh_send_p2p_back[bufferIndex][dim]);
852  } else {
853  qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], *copy_stream);
854  // send to the processor in the +1 direction
855  comm_start(mh_send_p2p_fwd[bufferIndex][dim]);
856  }
857  }
858  }
859 
860  void cudaColorSpinorField::commsStart(int nFace, int dir, int dagger, qudaStream_t *stream_p, bool gdr_send,
861  bool gdr_recv)
862  {
863  recvStart(nFace, dir, dagger, stream_p, gdr_recv);
864  sendStart(nFace, dir, dagger, stream_p, gdr_send);
865  }
866 
867  static bool complete_recv_fwd[QUDA_MAX_DIM] = { };
868  static bool complete_recv_back[QUDA_MAX_DIM] = { };
869  static bool complete_send_fwd[QUDA_MAX_DIM] = { };
870  static bool complete_send_back[QUDA_MAX_DIM] = { };
871 
872  int cudaColorSpinorField::commsQuery(int nFace, int d, int dagger, qudaStream_t *stream_p, bool gdr_send, bool gdr_recv)
873  {
874 
875  // note this is scatter centric, so dir=0 (1) is send backwards
876  // (forwards) and receive from forwards (backwards)
877 
878  int dim = d/2;
879  int dir = d%2;
880 
881  if (!commDimPartitioned(dim)) return 1;
882  if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
883 
884  if (dir==0) {
885 
886  // first query send to backwards
887  if (comm_peer2peer_enabled(0,dim)) {
888  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_p2p_back[bufferIndex][dim]);
889  } else if (gdr_send) {
890  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_rdma_back[bufferIndex][dim]);
891  } else {
892  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_back[bufferIndex][dim]);
893  }
894 
895  // second query receive from forwards
896  if (comm_peer2peer_enabled(1,dim)) {
897  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_p2p_fwd[bufferIndex][dim]);
898  } else if (gdr_recv) {
899  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_rdma_fwd[bufferIndex][dim]);
900  } else {
901  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_fwd[bufferIndex][dim]);
902  }
903 
904  if (complete_recv_fwd[dim] && complete_send_back[dim]) {
905  complete_send_back[dim] = false;
906  complete_recv_fwd[dim] = false;
907  return 1;
908  }
909 
910  } else { // dir == 1
911 
912  // first query send to forwards
913  if (comm_peer2peer_enabled(1,dim)) {
914  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_p2p_fwd[bufferIndex][dim]);
915  } else if (gdr_send) {
916  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_rdma_fwd[bufferIndex][dim]);
917  } else {
918  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_fwd[bufferIndex][dim]);
919  }
920 
921  // second query receive from backwards
922  if (comm_peer2peer_enabled(0,dim)) {
923  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_p2p_back[bufferIndex][dim]);
924  } else if (gdr_recv) {
925  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_rdma_back[bufferIndex][dim]);
926  } else {
927  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_back[bufferIndex][dim]);
928  }
929 
930  if (complete_recv_back[dim] && complete_send_fwd[dim]) {
931  complete_send_fwd[dim] = false;
932  complete_recv_back[dim] = false;
933  return 1;
934  }
935 
936  }
937 
938  return 0;
939  }
940 
941  void cudaColorSpinorField::commsWait(int nFace, int d, int dagger, qudaStream_t *stream_p, bool gdr_send, bool gdr_recv)
942  {
943 
944  // note this is scatter centric, so dir=0 (1) is send backwards
945  // (forwards) and receive from forwards (backwards)
946 
947  int dim = d/2;
948  int dir = d%2;
949 
950  if (!commDimPartitioned(dim)) return;
951  if ( (gdr_send && gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
952 
953  if (dir==0) {
954 
955  // first wait on send to backwards
956  if (comm_peer2peer_enabled(0,dim)) {
957  comm_wait(mh_send_p2p_back[bufferIndex][dim]);
958  cudaEventSynchronize(ipcCopyEvent[bufferIndex][0][dim]);
959  } else if (gdr_send) {
960  comm_wait(mh_send_rdma_back[bufferIndex][dim]);
961  } else {
962  comm_wait(mh_send_back[bufferIndex][dim]);
963  }
964 
965  // second wait on receive from forwards
966  if (comm_peer2peer_enabled(1,dim)) {
967  comm_wait(mh_recv_p2p_fwd[bufferIndex][dim]);
968  cudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][1][dim]);
969  } else if (gdr_recv) {
970  comm_wait(mh_recv_rdma_fwd[bufferIndex][dim]);
971  } else {
972  comm_wait(mh_recv_fwd[bufferIndex][dim]);
973  }
974 
975  } else {
976 
977  // first wait on send to forwards
978  if (comm_peer2peer_enabled(1,dim)) {
979  comm_wait(mh_send_p2p_fwd[bufferIndex][dim]);
980  cudaEventSynchronize(ipcCopyEvent[bufferIndex][1][dim]);
981  } else if (gdr_send) {
982  comm_wait(mh_send_rdma_fwd[bufferIndex][dim]);
983  } else {
984  comm_wait(mh_send_fwd[bufferIndex][dim]);
985  }
986 
987  // second wait on receive from backwards
988  if (comm_peer2peer_enabled(0,dim)) {
989  comm_wait(mh_recv_p2p_back[bufferIndex][dim]);
990  cudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][0][dim]);
991  } else if (gdr_recv) {
992  comm_wait(mh_recv_rdma_back[bufferIndex][dim]);
993  } else {
994  comm_wait(mh_recv_back[bufferIndex][dim]);
995  }
996 
997  }
998 
999  return;
1000  }
1001 
1002  void cudaColorSpinorField::scatter(int nFace, int dagger, int dim_dir, qudaStream_t *stream_p)
1003  {
1004  // note this is scatter centric, so input expects dir=0 (1) is send backwards
1005  // (forwards) and receive from forwards (backwards), so here we need flip to receive centric
1006 
1007  int dim = dim_dir/2;
1008  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 - receive from backwards
1009  if (!commDimPartitioned(dim)) return;
1010 
1011  if (comm_peer2peer_enabled(dir,dim)) return;
1012  unpackGhost(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, stream_p);
1013  }
1014 
1015  void cudaColorSpinorField::scatter(int nFace, int dagger, int dim_dir)
1016  {
1017  // note this is scatter centric, so dir=0 (1) is send backwards
1018  // (forwards) and receive from forwards (backwards), so here we need flip to receive centric
1019 
1020  int dim = dim_dir/2;
1021  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 receive from backwards
1022  if (!commDimPartitioned(dim)) return;
1023 
1024  if (comm_peer2peer_enabled(dir,dim)) return;
1025  unpackGhost(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, &stream[dim_dir]);
1026  }
1027 
1028  void cudaColorSpinorField::scatterExtended(int nFace, int parity, int dagger, int dim_dir)
1029  {
1030  bool zero_copy = false;
1031  int dim = dim_dir/2;
1032  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 receive from backwards
1033  if (!commDimPartitioned(dim)) return;
1034  unpackGhostExtended(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, static_cast<QudaParity>(parity), dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, &stream[2*dim/*+0*/], zero_copy);
1035  }
1036 
1037  void cudaColorSpinorField::exchangeGhost(QudaParity parity, int nFace, int dagger,
1038  const MemoryLocation *pack_destination_, const MemoryLocation *halo_location_,
1039  bool gdr_send, bool gdr_recv, QudaPrecision ghost_precision_) const
1040  {
1041 
1042  // we are overriding the ghost precision, and it doesn't match what has already been allocated
1043  if (ghost_precision_ != QUDA_INVALID_PRECISION && ghost_precision != ghost_precision_) {
1044  ghost_precision_reset = true;
1045  ghost_precision = ghost_precision_;
1046  }
1047 
1048  // not overriding the ghost precision, but we did previously so need to update
1049  if (ghost_precision_ == QUDA_INVALID_PRECISION && ghost_precision != precision) {
1050  ghost_precision_reset = true;
1051  ghost_precision = precision;
1052  }
1053 
1054  if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
1055  pushKernelPackT(true); // ensure kernel packing is enabled for all dimensions
1056  const_cast<cudaColorSpinorField&>(*this).streamInit(streams); // ensures streams are set (needed for p2p)
1057  const_cast<cudaColorSpinorField&>(*this).createComms(nFace, false);
1058 
1059  // first set default values to device if needed
1060  MemoryLocation pack_destination[2*QUDA_MAX_DIM], halo_location[2*QUDA_MAX_DIM];
1061  for (int i=0; i<2*nDimComms; i++) {
1062  pack_destination[i] = pack_destination_ ? pack_destination_[i] : Device;
1063  halo_location[i] = halo_location_ ? halo_location_[i] : Device;
1064  }
1065 
1066  // Contiguous send buffers and we aggregate copies to reduce
1067  // latency. Only if all locations are "Device" and no p2p
1068  bool fused_pack_memcpy = true;
1069 
1070  // Contiguous recv buffers and we aggregate copies to reduce
1071  // latency. Only if all locations are "Device" and no p2p
1072  bool fused_halo_memcpy = true;
1073 
1074  bool pack_host = false; // set to true if any of the ghost packing is being done to Host memory
1075  bool halo_host = false; // set to true if the final halos will be left in Host memory
1076 
1077  void *send[2*QUDA_MAX_DIM];
1078  for (int d=0; d<nDimComms; d++) {
1079  for (int dir=0; dir<2; dir++) {
1080  send[2*d+dir] = pack_destination[2*d+dir] == Host ? my_face_dim_dir_hd[bufferIndex][d][dir] : my_face_dim_dir_d[bufferIndex][d][dir];
1081  ghost_buf[2*d+dir] = halo_location[2*d+dir] == Host ? from_face_dim_dir_hd[bufferIndex][d][dir] : from_face_dim_dir_d[bufferIndex][d][dir];
1082  }
1083 
1084  // if doing p2p, then we must pack to and load the halo from device memory
1085  for (int dir=0; dir<2; dir++) {
1086  if (comm_peer2peer_enabled(dir,d)) { pack_destination[2*d+dir] = Device; halo_location[2*d+1-dir] = Device; }
1087  }
1088 
1089  // if zero-copy packing or p2p is enabled then we cannot do fused memcpy
1090  if (pack_destination[2*d+0] != Device || pack_destination[2*d+1] != Device || comm_peer2peer_enabled_global()) fused_pack_memcpy = false;
1091  // if zero-copy halo read or p2p is enabled then we cannot do fused memcpy
1092  if (halo_location[2*d+0] != Device || halo_location[2*d+1] != Device || comm_peer2peer_enabled_global()) fused_halo_memcpy = false;
1093 
1094  if (pack_destination[2*d+0] == Host || pack_destination[2*d+1] == Host) pack_host = true;
1095  if (halo_location[2*d+0] == Host || halo_location[2*d+1] == Host) halo_host = true;
1096  }
1097 
1098  // Error if zero-copy and p2p for now
1099  if ( (pack_host || halo_host) && comm_peer2peer_enabled_global()) errorQuda("Cannot use zero-copy memory with peer-to-peer comms yet");
1100 
1101  genericPackGhost(send, *this, parity, nFace, dagger, pack_destination); // FIXME - need support for asymmetric topologies
1102 
1103  size_t total_bytes = 0;
1104  for (int i = 0; i < nDimComms; i++)
1105  if (comm_dim_partitioned(i)) total_bytes += 2 * ghost_face_bytes_aligned[i]; // 2 for fwd/bwd
1106 
1107  if (!gdr_send) {
1108  if (!fused_pack_memcpy) {
1109  for (int i=0; i<nDimComms; i++) {
1110  if (comm_dim_partitioned(i)) {
1111  if (pack_destination[2*i+0] == Device && !comm_peer2peer_enabled(0,i) && // fuse forwards and backwards if possible
1112  pack_destination[2*i+1] == Device && !comm_peer2peer_enabled(1,i)) {
1113  qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][0], my_face_dim_dir_d[bufferIndex][i][0],
1114  2 * ghost_face_bytes_aligned[i], cudaMemcpyDeviceToHost, 0);
1115  } else {
1116  if (pack_destination[2 * i + 0] == Device && !comm_peer2peer_enabled(0, i))
1117  qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][0], my_face_dim_dir_d[bufferIndex][i][0],
1118  ghost_face_bytes[i], cudaMemcpyDeviceToHost, 0);
1119  if (pack_destination[2 * i + 1] == Device && !comm_peer2peer_enabled(1, i))
1120  qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][1], my_face_dim_dir_d[bufferIndex][i][1],
1121  ghost_face_bytes[i], cudaMemcpyDeviceToHost, 0);
1122  }
1123  }
1124  }
1125  } else if (total_bytes && !pack_host) {
1126  qudaMemcpyAsync(my_face_h[bufferIndex], my_face_d[bufferIndex], total_bytes, cudaMemcpyDeviceToHost, 0);
1127  }
1128  }
1129 
1130  // prepost receive
1131  for (int i=0; i<2*nDimComms; i++) const_cast<cudaColorSpinorField*>(this)->recvStart(nFace, i, dagger, 0, gdr_recv);
1132 
1133  bool sync = pack_host ? true : false; // no p2p if pack_host so we need to synchronize
1134  // if not p2p in any direction then need to synchronize before MPI
1135  for (int i=0; i<nDimComms; i++) if (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i)) sync = true;
1136  if (sync) qudaDeviceSynchronize(); // need to make sure packing and/or memcpy has finished before kicking off MPI
1137 
1138  for (int p2p=0; p2p<2; p2p++) {
1139  for (int dim=0; dim<nDimComms; dim++) {
1140  for (int dir=0; dir<2; dir++) {
1141  if ( (comm_peer2peer_enabled(dir,dim) + p2p) % 2 == 0 ) { // issue non-p2p transfers first
1142  const_cast<cudaColorSpinorField*>(this)->sendStart(nFace, 2*dim+dir, dagger, 0, gdr_send);
1143  }
1144  }
1145  }
1146  }
1147 
1148  bool comms_complete[2*QUDA_MAX_DIM] = { };
1149  int comms_done = 0;
1150  while (comms_done < 2*nDimComms) { // non-blocking query of each exchange and exit once all have completed
1151  for (int dim=0; dim<nDimComms; dim++) {
1152  for (int dir=0; dir<2; dir++) {
1153  if (!comms_complete[dim*2+dir]) {
1154  comms_complete[2*dim+dir] = const_cast<cudaColorSpinorField*>(this)->commsQuery(nFace, 2*dim+dir, dagger, 0, gdr_send, gdr_recv);
1155  if (comms_complete[2*dim+dir]) {
1156  comms_done++;
1158  }
1159  }
1160  }
1161  }
1162  }
1163 
1164  if (!gdr_recv) {
1165  if (!fused_halo_memcpy) {
1166  for (int i=0; i<nDimComms; i++) {
1167  if (comm_dim_partitioned(i)) {
1168  if (halo_location[2*i+0] == Device && !comm_peer2peer_enabled(0,i) && // fuse forwards and backwards if possible
1169  halo_location[2*i+1] == Device && !comm_peer2peer_enabled(1,i)) {
1170  qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][0], from_face_dim_dir_h[bufferIndex][i][0],
1171  2 * ghost_face_bytes_aligned[i], cudaMemcpyHostToDevice, 0);
1172  } else {
1173  if (halo_location[2 * i + 0] == Device && !comm_peer2peer_enabled(0, i))
1174  qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][0], from_face_dim_dir_h[bufferIndex][i][0],
1175  ghost_face_bytes[i], cudaMemcpyHostToDevice, 0);
1176  if (halo_location[2 * i + 1] == Device && !comm_peer2peer_enabled(1, i))
1177  qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][1], from_face_dim_dir_h[bufferIndex][i][1],
1178  ghost_face_bytes[i], cudaMemcpyHostToDevice, 0);
1179  }
1180  }
1181  }
1182  } else if (total_bytes && !halo_host) {
1183  qudaMemcpyAsync(from_face_d[bufferIndex], from_face_h[bufferIndex], total_bytes, cudaMemcpyHostToDevice,
1184  0);
1185  }
1186  }
1187 
1188  // ensure that the p2p sending is completed before returning
1189  for (int dim = 0; dim < nDimComms; dim++) {
1190  if (!comm_dim_partitioned(dim)) continue;
1191  for (int dir = 0; dir < 2; dir++) {
1193  }
1194  }
1195 
1196  popKernelPackT(); // restore kernel packing
1197  }
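
// Illustrative usage (not part of the QUDA sources): a minimal call to the unified halo
// exchange above, letting pack destinations and halo locations default to Device and
// keeping the current ghost precision; `field` is a hypothetical cudaColorSpinorField.
//
//   field.exchangeGhost(QUDA_EVEN_PARITY, /*nFace=*/1, /*dagger=*/0);
//   // ghost_buf[2*dim+dir] now holds the received halo for each partitioned dim/dir.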
1198 
1199  std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
1200  out << (const ColorSpinorField&)a;
1201  out << "v = " << a.v << std::endl;
1202  out << "norm = " << a.norm << std::endl;
1203  out << "alloc = " << a.alloc << std::endl;
1204  out << "init = " << a.init << std::endl;
1205  return out;
1206  }
1207 
1208  cudaColorSpinorField& cudaColorSpinorField::Component(const int idx) const {
1210 
1211  if (this->IsComposite()) {
1212  if (idx < this->CompositeDim()) {//setup eigenvector from the set
1213  return *(dynamic_cast<cudaColorSpinorField*>(components[idx]));
1214  }
1215  else{
1216  errorQuda("Incorrect component index...");
1217  }
1218  }
1219  errorQuda("Cannot get requested component");
1220  exit(-1);
1221  }
1222 
1223 //copyCuda currently cannot work with a set of spinor fields..
1224  void cudaColorSpinorField::CopySubset(cudaColorSpinorField &dst, const int range, const int first_element) const{
1225 #if 0
1226  if (first_element < 0) errorQuda("\nError: trying to set negative first element.\n");
1227  if (siteSubset == QUDA_PARITY_SITE_SUBSET && this->EigvId() == -1) {
1228  if (first_element == 0 && range == this->EigvDim())
1229  {
1230  if (range != dst.EigvDim())errorQuda("\nError: eigenvector range to big.\n");
1231  checkField(dst, *this);
1232  copyCuda(dst, *this);
1233  }
1234  else if ((first_element+range) < this->EigvDim())
1235  {//setup eigenvector subset
1236 
1237  cudaColorSpinorField *eigv_subset;
1238 
1239  ColorSpinorParam param;
1240 
1241  param.nColor = nColor;
1242  param.nSpin = nSpin;
1243  param.twistFlavor = twistFlavor;
1244  param.precision = precision;
1245  param.nDim = nDim;
1246  param.pad = pad;
1247  param.siteSubset = siteSubset;
1248  param.siteOrder = siteOrder;
1249  param.fieldOrder = fieldOrder;
1250  param.gammaBasis = gammaBasis;
1251  memcpy(param.x, x, nDim*sizeof(int));
1253 
1254  param.eigv_dim = range;
1255  param.eigv_id = -1;
1256  param.v = (void*)((char*)v + first_element*eigv_bytes);
1257  param.norm = (void*)((char*)norm + first_element*eigv_norm_bytes);
1258 
1259  eigv_subset = new cudaColorSpinorField(param);
1260 
1261  //Not really needed:
1262  eigv_subset->eigenvectors.reserve(param.eigv_dim);
1263  for (int id = first_element; id < (first_element+range); id++)
1264  {
1265  param.eigv_id = id;
1266  eigv_subset->eigenvectors.push_back(new cudaColorSpinorField(*this, param));
1267  }
1268  checkField(dst, *eigv_subset);
1269  copyCuda(dst, *eigv_subset);
1270 
1271  delete eigv_subset;
1272  } else {
1273  errorQuda("Incorrect eigenvector dimension...");
1274  }
1275  } else{
1276  errorQuda("Eigenvector must be a parity spinor");
1277  exit(-1);
1278  }
1279 #endif
1280  }
1281 
1282  void cudaColorSpinorField::Source(const QudaSourceType sourceType, const int st, const int s, const int c) {
1283  ColorSpinorParam param(*this);
1286  param.setPrecision((param.Precision() == QUDA_HALF_PRECISION || param.Precision() == QUDA_QUARTER_PRECISION) ?
1287  QUDA_SINGLE_PRECISION :
1288  param.Precision());
1290 
1291  // since CPU fields cannot be low precision, use single precision instead
1293 
1294  cpuColorSpinorField tmp(param);
1295  tmp.Source(sourceType, st, s, c);
1296  *this = tmp;
1297  }
1298 
1299  void cudaColorSpinorField::PrintVector(unsigned int i) const { genericCudaPrintVector(*this, i); }
1300 
1301  void cudaColorSpinorField::copy_to_buffer(void *buffer) const
1302  {
1303  qudaMemcpy(buffer, v, bytes, cudaMemcpyDeviceToHost);
1304  if (norm_bytes > 0) {
1305  qudaMemcpy(static_cast<char *>(buffer) + bytes, norm, norm_bytes, cudaMemcpyDeviceToHost);
1306  }
1307  }
1308 
1309  void cudaColorSpinorField::copy_from_buffer(void *buffer)
1310  {
1311  qudaMemcpy(v, buffer, bytes, cudaMemcpyHostToDevice);
1312  if (norm_bytes > 0) {
1313  qudaMemcpy(norm, static_cast<char *>(buffer) + bytes, norm_bytes, cudaMemcpyHostToDevice);
1314  }
1315  }
1316 
1317 } // namespace quda