QUDA  1.0.0
cuda_color_spinor_field.cpp
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <typeinfo>
4 #include <string.h>
5 #include <iostream>
6 
7 #include <color_spinor_field.h>
8 #include <blas_quda.h>
9 #include <dslash_quda.h>
10 
11 static bool zeroCopy = false;
12 
13 namespace quda {
14 
15  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorParam &param) :
16  ColorSpinorField(param),
17  alloc(false),
18  init(true),
19  texInit(false),
20  ghostTexInit(false),
21  ghost_precision_tex(QUDA_INVALID_PRECISION),
22  ghost_field_tex {nullptr, nullptr, nullptr, nullptr}
23  {
24  // this must come before create
25  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
26  v = param.v;
27  norm = param.norm;
28  }
29 
30  create(param.create);
31 
32  if (param.create == QUDA_NULL_FIELD_CREATE) {
33  // do nothing
34  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
35  zero();
36  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
37  // do nothing
38  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
39  errorQuda("not implemented");
40  }
41  }
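  // --- Illustrative usage sketch (not part of the original file): typical
  // construction of a device field through the constructor above.  The lattice
  // extents, precision and orders below are arbitrary example values.
#if 0
  void example_create_device_field()
  {
    ColorSpinorParam param;
    param.nColor = 3;
    param.nSpin = 4;
    param.nDim = 4;
    int X[4] = {16, 16, 16, 32};
    for (int d = 0; d < 4; d++) param.x[d] = X[d];
    param.setPrecision(QUDA_SINGLE_PRECISION);
    param.siteSubset = QUDA_FULL_SITE_SUBSET;
    param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
    param.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;  // native order for a 4-spin single-precision field
    param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    param.pad = 0;
    param.location = QUDA_CUDA_FIELD_LOCATION;
    param.create = QUDA_ZERO_FIELD_CREATE;       // zeroed via the QUDA_ZERO_FIELD_CREATE branch above
    cudaColorSpinorField spinor(param);
  }
#endif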
42 
43  cudaColorSpinorField::cudaColorSpinorField(const cudaColorSpinorField &src) :
44  ColorSpinorField(src),
45  alloc(false),
46  init(true),
47  texInit(false),
48  ghostTexInit(false),
49  ghost_precision_tex(QUDA_INVALID_PRECISION),
50  ghost_field_tex {nullptr, nullptr, nullptr, nullptr}
51  {
52  create(QUDA_COPY_FIELD_CREATE);
53  copySpinorField(src);
54  }
55 
56  // creates a copy of src, any differences defined in param
57  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src, const ColorSpinorParam &param) :
58  ColorSpinorField(src),
59  alloc(false),
60  init(true),
61  texInit(false),
62  ghostTexInit(false),
63  ghost_precision_tex(QUDA_INVALID_PRECISION),
64  ghost_field_tex {nullptr, nullptr, nullptr, nullptr}
65  {
66  // can only override if we are not using a reference or parity special case
67  if (param.create != QUDA_REFERENCE_FIELD_CREATE ||
68  (param.create == QUDA_REFERENCE_FIELD_CREATE &&
69  src.SiteSubset() == QUDA_FULL_SITE_SUBSET &&
70  param.siteSubset == QUDA_PARITY_SITE_SUBSET &&
71  typeid(src) == typeid(cudaColorSpinorField) ) ||
72  (param.create == QUDA_REFERENCE_FIELD_CREATE && (param.is_composite || param.is_component))) {
73  reset(param);
74  } else {
75  errorQuda("Undefined behaviour"); // else silent bug possible?
76  }
77 
78  // This must be set before create is called
79  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
80  if (typeid(src) == typeid(cudaColorSpinorField)) {
81  v = (void*)src.V();
82  norm = (void*)src.Norm();
83  } else {
84  errorQuda("Cannot reference a non-cuda field");
85  }
86 
87  if (composite_descr.is_component && !(src.SiteSubset() == QUDA_FULL_SITE_SUBSET && this->SiteSubset() == QUDA_PARITY_SITE_SUBSET))
88  { // setup eigenvector from the set
89  v = (void*)((char*)v + composite_descr.id*bytes);
90  norm = (void*)((char*)norm + composite_descr.id*norm_bytes);
91  }
92  }
93 
94  create(param.create);
95 
96  if (param.create == QUDA_NULL_FIELD_CREATE) {
97  // do nothing
98  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
99  zero();
100  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
101  copySpinorField(src);
102  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
103  // do nothing
104  } else {
105  errorQuda("CreateType %d not implemented", param.create);
106  }
107 
108  }
109 
110  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src) :
111  ColorSpinorField(src),
112  alloc(false),
113  init(true),
114  texInit(false),
115  ghostTexInit(false),
116  ghost_precision_tex(QUDA_INVALID_PRECISION),
117  ghost_field_tex {nullptr, nullptr, nullptr, nullptr}
118  {
119  create(QUDA_COPY_FIELD_CREATE);
120  copySpinorField(src);
121  }
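  // --- Illustrative sketch (assumption, not original code): the (src, param)
  // constructor above is commonly used to make a device copy of a host field
  // while overriding selected attributes such as precision and field order.
#if 0
  void example_copy_host_field_to_device(const cpuColorSpinorField &host_spinor)
  {
    ColorSpinorParam param(host_spinor);         // inherit geometry from the source
    param.location = QUDA_CUDA_FIELD_LOCATION;
    param.setPrecision(QUDA_SINGLE_PRECISION);   // may differ from the source precision
    param.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;  // a native GPU ordering
    param.create = QUDA_COPY_FIELD_CREATE;       // triggers copySpinorField(src) above
    cudaColorSpinorField device_spinor(host_spinor, param);
  }
#endif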
122 
123  ColorSpinorField& cudaColorSpinorField::operator=(const ColorSpinorField &src) {
124  if (typeid(src) == typeid(cudaColorSpinorField)) {
125  *this = (dynamic_cast<const cudaColorSpinorField&>(src));
126  } else if (typeid(src) == typeid(cpuColorSpinorField)) {
127  *this = (dynamic_cast<const cpuColorSpinorField&>(src));
128  } else {
129  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
130  }
131  return *this;
132  }
133 
134  cudaColorSpinorField& cudaColorSpinorField::operator=(const cudaColorSpinorField &src) {
135  if (&src != this) {
136  // keep current attributes unless unset
137  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
138  destroy();
139  destroyComms(); // not sure if this is necessary
140  ColorSpinorField::operator=(src);
141  create(QUDA_COPY_FIELD_CREATE);
142  }
143  copySpinorField(src);
144  }
145  return *this;
146  }
147 
148  cudaColorSpinorField& cudaColorSpinorField::operator=(const cpuColorSpinorField &src) {
149  // keep current attributes unless unset
150  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
151  destroy();
152  ColorSpinorField::operator=(src);
153  create(QUDA_COPY_FIELD_CREATE);
154  }
155  loadSpinorField(src);
156  return *this;
157  }
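  // --- Illustrative sketch (not original code): the assignment operators above
  // dispatch on the source type, so device-to-device copies and host-to-device
  // uploads share the same syntax.
#if 0
  void example_assignment(cudaColorSpinorField &device_a, const cudaColorSpinorField &device_b,
                          const cpuColorSpinorField &host_c)
  {
    device_a = device_b; // device-to-device: copySpinorField() -> on-device copy kernels
    device_a = host_c;   // host-to-device:   loadSpinorField() -> reorder and copy to the device
  }
#endif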
158 
159  cudaColorSpinorField::~cudaColorSpinorField() {
160  destroyComms();
161  destroy();
162  }
163 
164  void cudaColorSpinorField::create(const QudaFieldCreate create) {
165 
166  if (siteSubset == QUDA_FULL_SITE_SUBSET && siteOrder != QUDA_EVEN_ODD_SITE_ORDER) {
167  errorQuda("Subset not implemented");
168  }
169 
170  if (create != QUDA_REFERENCE_FIELD_CREATE) {
171  switch(mem_type) {
172  case QUDA_MEMORY_DEVICE:
175  break;
176  case QUDA_MEMORY_MAPPED:
178  cudaHostGetDevicePointer(&v, v_h, 0); // set the matching device pointer
181  cudaHostGetDevicePointer(&norm, norm_h, 0); // set the matching device pointer
182  }
183  break;
184  default:
185  errorQuda("Unsupported memory type %d", mem_type);
186  }
187  alloc = true;
188  }
189 
192  if(composite_descr.dim <= 0) errorQuda("\nComposite size is not defined\n");
193 
196  param.nDim = nDim;
197  memcpy(param.x, x, nDim*sizeof(int));
199  param.v = v;
200  param.norm = norm;
201  param.is_composite = false;
202  param.composite_dim = 0;
203  param.is_component = true;
204  param.mem_type = mem_type;
205 
206  components.reserve(composite_descr.dim);
207  for(int cid = 0; cid < composite_descr.dim; cid++) {
208  param.component_id = cid;
209  components.push_back(new cudaColorSpinorField(*this, param));
210  }
211  } else {
212  // create the associated even and odd subsets
215  param.nDim = nDim;
216  memcpy(param.x, x, nDim*sizeof(int));
217  param.x[0] /= 2; // set single parity dimensions
219  param.v = v;
220  param.norm = norm;
221  param.is_composite = false;
222  param.composite_dim = 0;
225  param.mem_type = mem_type;
226 
227  even = new cudaColorSpinorField(*this, param);
228  odd = new cudaColorSpinorField(*this, param);
229 
230  // need this hackery for the moment (need to locate the odd pointers half way into the full field)
231  (dynamic_cast<cudaColorSpinorField*>(odd))->v = (void*)((char*)v + bytes/2);
233  (dynamic_cast<cudaColorSpinorField*>(odd))->norm = (void*)((char*)norm + norm_bytes/2);
234 
235 #ifdef USE_TEXTURE_OBJECTS
236  dynamic_cast<cudaColorSpinorField*>(even)->destroyTexObject();
237  dynamic_cast<cudaColorSpinorField*>(even)->createTexObject();
238  dynamic_cast<cudaColorSpinorField*>(odd)->destroyTexObject();
239  dynamic_cast<cudaColorSpinorField*>(odd)->createTexObject();
240 #endif
241  }
242  } else { //siteSubset == QUDA_PARITY_SITE_SUBSET
243 
246  {
247  if(composite_descr.dim <= 0) errorQuda("\nComposite size is not defined\n");
248  //if(bytes > 1811939328) warningQuda("\nCUDA API probably won't be able to create texture object for the eigenvector set... Object size is : %u bytes\n", bytes);
249  if (getVerbosity() == QUDA_DEBUG_VERBOSE) printfQuda("\nEigenvector set constructor...\n");
250  // create the associated even and odd subsets
253  param.nDim = nDim;
254  memcpy(param.x, x, nDim*sizeof(int));
256  param.v = v;
257  param.norm = norm;
258  param.is_composite = false;
259  param.composite_dim = 0;
260  param.is_component = true;
261  param.mem_type = mem_type;
262 
263  // reserve eigenvector set
264  components.reserve(composite_descr.dim);
265  //setup volume, [real_]length and stride for a single eigenvector
266  for(int cid = 0; cid < composite_descr.dim; cid++)
267  {
268  param.component_id = cid;
269  components.push_back(new cudaColorSpinorField(*this, param));
270 
271 #ifdef USE_TEXTURE_OBJECTS //(a lot of texture objects...)
272  dynamic_cast<cudaColorSpinorField*>(components[cid])->destroyTexObject();
273  dynamic_cast<cudaColorSpinorField*>(components[cid])->createTexObject();
274 #endif
275  }
276  }
277  }
278 
279  if (create != QUDA_REFERENCE_FIELD_CREATE) {
281  zeroPad();
282  } else { //temporary hack for the full spinor field sets, manual zeroPad for each component:
283  for(int cid = 0; cid < composite_descr.dim; cid++) {
284  (dynamic_cast<cudaColorSpinorField&>(components[cid]->Even())).zeroPad();
285  (dynamic_cast<cudaColorSpinorField&>(components[cid]->Odd())).zeroPad();
286  }
287  }
288  }
289 
290 #ifdef USE_TEXTURE_OBJECTS
292  createTexObject();
293 #endif
294  }
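  // --- Illustrative note (assumption based on create() above): for a full field
  // the even and odd subsets are not separate allocations; they alias the first
  // and second halves of the parent storage.
#if 0
  void example_even_odd_aliasing(cudaColorSpinorField &full)
  {
    char *base = static_cast<char*>(full.V());
    char *odd  = static_cast<char*>(full.Odd().V());
    bool aliases = (odd == base + full.Bytes() / 2); // expected to hold for a full-site-subset field
    (void)aliases;
  }
#endif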
295 
296 #ifdef USE_TEXTURE_OBJECTS
297  void cudaColorSpinorField::createTexObject() {
298 
299  if ( (isNative() || fieldOrder == QUDA_FLOAT2_FIELD_ORDER) && nVec == 1 ) {
300  if (texInit) errorQuda("Already bound textures");
301 
302  // create the texture for the field components
303 
304  cudaChannelFormatDesc desc;
305  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
306  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
307  else desc.f = cudaChannelFormatKindSigned; // quarter is char, half is short, double is int2
308 
309  // staggered and coarse fields in half and single are always two component
310  int texel_size = 1;
311  // all FLOAT2-ordered fields that are not double precision
312  if (precision != QUDA_DOUBLE_PRECISION && fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {
313  desc.x = 8*precision;
314  desc.y = 8*precision;
315  desc.z = 0;
316  desc.w = 0;
317  texel_size = 2*precision;
318  } else { // all others are four component (double2 is spread across int4)
319  desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
320  desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
321  desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
322  desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
323  texel_size = 4 * (precision == QUDA_DOUBLE_PRECISION ? sizeof(int) : precision);
324  }
325 
326  cudaResourceDesc resDesc;
327  memset(&resDesc, 0, sizeof(resDesc));
328  resDesc.resType = cudaResourceTypeLinear;
329  resDesc.res.linear.devPtr = v;
330  resDesc.res.linear.desc = desc;
331  resDesc.res.linear.sizeInBytes = bytes;
332 
333  cudaTextureDesc texDesc;
334  memset(&texDesc, 0, sizeof(texDesc));
335  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
336  else texDesc.readMode = cudaReadModeElementType;
337 
338  if (resDesc.res.linear.sizeInBytes % deviceProp.textureAlignment != 0
339  || !is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
340  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
341  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
342  }
343 
344  unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
345  if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
346  errorQuda("Attempting to bind too large a texture %lu > %d", texels, deviceProp.maxTexture1DLinear);
347  }
348 
349  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
350 
351  checkCudaError();
352 
353  // create the texture for the norm components
354  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) {
355  cudaChannelFormatDesc desc;
356  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
357  desc.f = cudaChannelFormatKindFloat;
358  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
359 
360  cudaResourceDesc resDesc;
361  memset(&resDesc, 0, sizeof(resDesc));
362  resDesc.resType = cudaResourceTypeLinear;
363  resDesc.res.linear.devPtr = norm;
364  resDesc.res.linear.desc = desc;
365  resDesc.res.linear.sizeInBytes = norm_bytes;
366 
367  if (resDesc.res.linear.sizeInBytes % deviceProp.textureAlignment != 0
368  || !is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
369  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
370  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
371  }
372 
373  cudaTextureDesc texDesc;
374  memset(&texDesc, 0, sizeof(texDesc));
375  texDesc.readMode = cudaReadModeElementType;
376 
377  cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
378 
379  checkCudaError();
380  }
381 
382  texInit = true;
383 
384  checkCudaError();
385  }
386  }
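  // --- Illustrative sketch (not original code): double-precision fields are
  // bound above as int4 texels, so a kernel-side read reassembles each double
  // from its two 32-bit halves.
#if 0
  __device__ double2 example_fetch_double2(cudaTextureObject_t t, int i)
  {
    int4 u = tex1Dfetch<int4>(t, i); // one int4 texel holds one double2
    return make_double2(__hiloint2double(u.y, u.x), __hiloint2double(u.w, u.z));
  }
#endif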
387 
388  void cudaColorSpinorField::createGhostTexObject() const {
389  // create the ghost texture object
390  if ( (isNative() || fieldOrder == QUDA_FLOAT2_FIELD_ORDER) && nVec == 1 && ghost_bytes) {
391  if (ghostTexInit) errorQuda("Already bound ghost texture");
392 
393  for (int b=0; b<2; b++) {
394  cudaChannelFormatDesc desc;
395  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
396  if (ghost_precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
397  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
398 
399  // all FLOAT2-ordered fields that are not double precision
400  if (ghost_precision != QUDA_DOUBLE_PRECISION && fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {
401  desc.x = 8*ghost_precision;
402  desc.y = 8*ghost_precision;
403  desc.z = 0;
404  desc.w = 0;
405  } else { // all others are four component (double2 is spread across int4)
410  }
411 
412  cudaResourceDesc resDesc;
413  memset(&resDesc, 0, sizeof(resDesc));
414  resDesc.resType = cudaResourceTypeLinear;
415  resDesc.res.linear.devPtr = ghost_recv_buffer_d[b];
416  resDesc.res.linear.desc = desc;
417  resDesc.res.linear.sizeInBytes = ghost_bytes;
418 
419  if (!is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
420  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
421  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
422  }
423 
424  cudaTextureDesc texDesc;
425  memset(&texDesc, 0, sizeof(texDesc));
426  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
427  else texDesc.readMode = cudaReadModeElementType;
428 
429  cudaCreateTextureObject(&ghostTex[b], &resDesc, &texDesc, NULL);
430 
431  // second set of ghost texture map to the host-mapped pinned receive buffers
432  resDesc.res.linear.devPtr = ghost_pinned_recv_buffer_hd[b];
433  if (!is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
434  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
435  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
436  }
437  cudaCreateTextureObject(&ghostTex[2 + b], &resDesc, &texDesc, NULL);
438 
439  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) {
440  cudaChannelFormatDesc desc;
441  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
442  desc.f = cudaChannelFormatKindFloat;
443  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
444 
445  cudaResourceDesc resDesc;
446  memset(&resDesc, 0, sizeof(resDesc));
447  resDesc.resType = cudaResourceTypeLinear;
448  resDesc.res.linear.devPtr = ghost_recv_buffer_d[b];
449  resDesc.res.linear.desc = desc;
450  resDesc.res.linear.sizeInBytes = ghost_bytes;
451 
452  if (!is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
453  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
454  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
455  }
456 
457  cudaTextureDesc texDesc;
458  memset(&texDesc, 0, sizeof(texDesc));
459  texDesc.readMode = cudaReadModeElementType;
460 
461  cudaCreateTextureObject(&ghostTexNorm[b], &resDesc, &texDesc, NULL);
462 
463  resDesc.res.linear.devPtr = ghost_pinned_recv_buffer_hd[b];
464  if (!is_aligned(resDesc.res.linear.devPtr, deviceProp.textureAlignment)) {
465  errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
466  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);
467  }
468  cudaCreateTextureObject(&ghostTexNorm[2 + b], &resDesc, &texDesc, NULL);
469  }
470 
473  } // buffer index
474 
475  ghostTexInit = true;
477 
478  checkCudaError();
479  }
480 
481  }
482 
483  void cudaColorSpinorField::destroyTexObject() {
484  if ( (isNative() || fieldOrder == QUDA_FLOAT2_FIELD_ORDER) && nVec == 1 && texInit) {
485  cudaDestroyTextureObject(tex);
486  if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) cudaDestroyTextureObject(texNorm);
487  texInit = false;
488  }
489  }
490 
491  void cudaColorSpinorField::destroyGhostTexObject() const {
492  if ( (isNative() || fieldOrder == QUDA_FLOAT2_FIELD_ORDER) && nVec == 1 && ghostTexInit) {
493  for (int i=0; i<4; i++) cudaDestroyTextureObject(ghostTex[i]);
495  for (int i=0; i<4; i++) cudaDestroyTextureObject(ghostTexNorm[i]);
496  ghostTexInit = false;
498  }
499  }
500 #endif
501 
502  void cudaColorSpinorField::destroy() {
503 
504  if (alloc) {
505  switch(mem_type) {
506  case QUDA_MEMORY_DEVICE:
509  break;
510  case QUDA_MEMORY_MAPPED:
511  host_free(v_h);
513  break;
514  default:
515  errorQuda("Unsupported memory type %d", mem_type);
516  }
517  }
518 
519 
520  if (composite_descr.is_composite)
521  {
522  CompositeColorSpinorField::iterator vec;
523  for (vec = components.begin(); vec != components.end(); vec++) delete *vec;
524  }
525 
526  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
527  delete even;
528  delete odd;
529  }
530 
531 #ifdef USE_TEXTURE_OBJECTS
533  destroyTexObject();
534  destroyGhostTexObject();
535  }
536 #endif
537 
538  }
539 
540  void cudaColorSpinorField::backup() const {
541  if (backed_up) errorQuda("ColorSpinorField already backed up");
542  backup_h = new char[bytes];
543  cudaMemcpy(backup_h, v, bytes, cudaMemcpyDeviceToHost);
544  if (norm_bytes) {
545  backup_norm_h = new char[norm_bytes];
546  cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
547  }
548  checkCudaError();
549  backed_up = true;
550  }
551 
552  void cudaColorSpinorField::restore() const
553  {
554  if (!backed_up) errorQuda("Cannot restore since not backed up");
555  cudaMemcpy(v, backup_h, bytes, cudaMemcpyHostToDevice);
556  delete []backup_h;
557  if (norm_bytes) {
558  cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
559  delete []backup_norm_h;
560  }
561  checkCudaError();
562  backed_up = false;
563  }
564 
565  // CUDA's floating-point format, IEEE-754, represents the floating-point
566  // zero as 4 zero bytes
567  void cudaColorSpinorField::zero() {
568  cudaMemsetAsync(v, 0, bytes);
570  }
571 
572  void cudaColorSpinorField::zeroPad() {
573 
574  { // zero initialize the field pads
575  size_t pad_bytes = (stride - volumeCB) * precision * fieldOrder;
576  int Npad = nColor * nSpin * 2 / fieldOrder;
577 
578  if (composite_descr.is_composite && !composite_descr.is_component){//we consider the whole eigenvector set:
579  Npad *= composite_descr.dim;
580  pad_bytes /= composite_descr.dim;
581  }
582 
585  if (pad_bytes)
586  for (int subset=0; subset<siteSubset; subset++) {
587  cudaMemset2DAsync(dst + subset*bytes/siteSubset, pitch, 0, pad_bytes, Npad);
588  }
589  }
590 
591  if (norm_bytes > 0) { // zero initialize the norm pad
592  size_t pad_bytes = (stride - volumeCB) * sizeof(float);
593  if (pad_bytes)
594  for (int subset=0; subset<siteSubset; subset++) {
595  cudaMemsetAsync((char*)norm + subset*(norm_bytes/siteSubset) + volumeCB*sizeof(float), 0, pad_bytes);
596  }
597  }
598 
599  // zero the region added for alignment reasons
600  if (bytes != (size_t)length*precision) {
601  size_t subset_bytes = bytes/siteSubset;
602  size_t subset_length = length/siteSubset;
603  for (int subset=0; subset < siteSubset; subset++) {
604  cudaMemsetAsync((char*)v + subset_length*precision + subset_bytes*subset, 0,
605  subset_bytes-subset_length*precision);
606  }
607  }
608 
609  // zero the region added for alignment reasons (norm)
610  if (norm_bytes && norm_bytes != siteSubset*stride*sizeof(float)) {
611  size_t subset_bytes = norm_bytes/siteSubset;
612  for (int subset=0; subset < siteSubset; subset++) {
613  cudaMemsetAsync((char*)norm + (size_t)stride*sizeof(float) + subset_bytes*subset, 0,
614  subset_bytes-(size_t)stride*sizeof(float));
615  }
616  }
617 
618  checkCudaError();
619  }
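  // --- Illustrative note (example numbers, not part of the original file): for a
  // single-parity Wilson field in float4 order (nColor = 3, nSpin = 4, fieldOrder = 4)
  // the body-pad clearing above issues
  //   Npad      = nColor*nSpin*2 / fieldOrder = 3*4*2/4 = 6 rows,
  //   pad_bytes = (stride - volumeCB) * precision * fieldOrder bytes per row,
  // so a single cudaMemset2DAsync zeroes the tail of each of the 6 float4 planes,
  // with consecutive rows separated by the field's row pitch.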
620 
621  void cudaColorSpinorField::copy(const cudaColorSpinorField &src) {
622  checkField(*this, src);
623  if (this->GammaBasis() != src.GammaBasis()) errorQuda("cannot call this copy with different basis");
624  blas::copy(*this, src);
625  }
626 
627  void cudaColorSpinorField::copySpinorField(const ColorSpinorField &src) {
628 
629  // src is on the device and is native
630  if (typeid(src) == typeid(cudaColorSpinorField) &&
631  isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative() &&
632  this->GammaBasis() == src.GammaBasis()) {
633  copy(dynamic_cast<const cudaColorSpinorField&>(src));
634  } else if (typeid(src) == typeid(cudaColorSpinorField)) {
636  } else if (typeid(src) == typeid(cpuColorSpinorField)) { // src is on the host
637  loadSpinorField(src);
638  } else {
639  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
640  }
641  }
642 
643  void cudaColorSpinorField::loadSpinorField(const ColorSpinorField &src) {
644 
645  if ( reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(src) == typeid(cpuColorSpinorField)) {
646  void *buffer = pool_pinned_malloc(bytes + norm_bytes);
647  memset(buffer, 0, bytes+norm_bytes); // FIXME (temporary?) bug fix for padding
648 
649  copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, 0, static_cast<char*>(buffer)+bytes, 0);
650 
651  qudaMemcpy(v, buffer, bytes, cudaMemcpyHostToDevice);
652  qudaMemcpy(norm, static_cast<char*>(buffer)+bytes, norm_bytes, cudaMemcpyHostToDevice);
653 
654  pool_pinned_free(buffer);
655  } else if (typeid(src) == typeid(cudaColorSpinorField)) {
657  } else {
658 
660  // special case where we use mapped memory to read/write directly from application's array
661  void *src_d;
662  cudaError_t error = cudaHostGetDevicePointer(&src_d, const_cast<void*>(src.V()), 0);
663  if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
665  } else {
666  void *Src=nullptr, *srcNorm=nullptr, *buffer=nullptr;
667  if (!zeroCopy) {
668  buffer = pool_device_malloc(src.Bytes()+src.NormBytes());
669  Src = buffer;
670  srcNorm = static_cast<char*>(Src) + src.Bytes();
671  qudaMemcpy(Src, src.V(), src.Bytes(), cudaMemcpyHostToDevice);
672  qudaMemcpy(srcNorm, src.Norm(), src.NormBytes(), cudaMemcpyHostToDevice);
673  } else {
674  buffer = pool_pinned_malloc(src.Bytes()+src.NormBytes());
675  memcpy(buffer, src.V(), src.Bytes());
676  memcpy(static_cast<char*>(buffer)+src.Bytes(), src.Norm(), src.NormBytes());
677  cudaError_t error = cudaHostGetDevicePointer(&Src, buffer, 0);
678  if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
679  srcNorm = static_cast<char*>(Src) + src.Bytes();
680  }
681 
682  cudaMemset(v, 0, bytes); // FIXME (temporary?) bug fix for padding
683  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, 0, Src, 0, srcNorm);
684 
685  if (zeroCopy) pool_pinned_free(buffer);
686  else pool_device_free(buffer);
687  }
688  }
689 
690  qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
691  checkCudaError();
692  }
693 
694 
695  void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest) const {
696 
697  if ( reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(dest) == typeid(cpuColorSpinorField)) {
698  void *buffer = pool_pinned_malloc(bytes+norm_bytes);
699  qudaMemcpy(buffer, v, bytes, cudaMemcpyDeviceToHost);
700  qudaMemcpy(static_cast<char*>(buffer)+bytes, norm, norm_bytes, cudaMemcpyDeviceToHost);
701 
702  copyGenericColorSpinor(dest, *this, QUDA_CPU_FIELD_LOCATION, 0, buffer, 0, static_cast<char*>(buffer)+bytes);
703  pool_pinned_free(buffer);
704  } else if (typeid(dest) == typeid(cudaColorSpinorField)) {
706  } else {
707 
709  // special case where we use zero-copy memory to read/write directly from application's array
710  void *dest_d;
711  cudaError_t error = cudaHostGetDevicePointer(&dest_d, const_cast<void*>(dest.V()), 0);
712  if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
713  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION, dest_d, v);
714  } else {
715  void *dst = nullptr, *dstNorm = nullptr, *buffer = nullptr;
716  if (!zeroCopy) {
717  buffer = pool_device_malloc(dest.Bytes()+dest.NormBytes());
718  dst = buffer;
719  dstNorm = static_cast<char*>(dst) + dest.Bytes();
720  } else {
721  buffer = pool_pinned_malloc(dest.Bytes()+dest.NormBytes());
722  cudaError_t error = cudaHostGetDevicePointer(&dst, buffer, 0);
723  if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField");
724  dstNorm = static_cast<char*>(dst)+dest.Bytes();
725  }
726 
727  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION, dst, 0, dstNorm, 0);
728 
729  if (!zeroCopy) {
730  qudaMemcpy(dest.V(), dst, dest.Bytes(), cudaMemcpyDeviceToHost);
731  qudaMemcpy(dest.Norm(), dstNorm, dest.NormBytes(), cudaMemcpyDeviceToHost);
732  } else {
734  memcpy(dest.V(), buffer, dest.Bytes());
735  memcpy(dest.Norm(), static_cast<char*>(buffer) + dest.Bytes(), dest.NormBytes());
736  }
737 
738  if (zeroCopy) pool_pinned_free(buffer);
739  else pool_device_free(buffer);
740  }
741  }
742 
743  qudaDeviceSynchronize(); // need to sync before data can be used on CPU
744  checkCudaError();
745  }
746 
747  void cudaColorSpinorField::allocateGhostBuffer(int nFace, bool spin_project) const {
748 
749  createGhostZone(nFace, spin_project);
751 
752 #ifdef USE_TEXTURE_OBJECTS
753  // ghost texture is per object
757  destroyGhostTexObject();
758  if (!ghostTexInit) createGhostTexObject();
759 #endif
760  }
761 
762  // pack the ghost zone into a contiguous buffer for communications
763  void cudaColorSpinorField::packGhost(const int nFace, const QudaParity parity, const int dim, const QudaDirection dir,
764  const int dagger, cudaStream_t *stream, MemoryLocation location[2 * QUDA_MAX_DIM],
765  MemoryLocation location_label, bool spin_project, double a, double b, double c)
766  {
767 #ifdef MULTI_GPU
768  void *packBuffer[2 * QUDA_MAX_DIM] = {};
769 
770  for (int dim=0; dim<4; dim++) {
771  for (int dir=0; dir<2; dir++) {
772  switch(location[2*dim+dir]) {
773  case Device: // pack to local device buffer
774  packBuffer[2*dim+dir] = my_face_dim_dir_d[bufferIndex][dim][dir];
775  break;
776  case Host: // pack to zero-copy memory
777  packBuffer[2*dim+dir] = my_face_dim_dir_hd[bufferIndex][dim][dir];
778  break;
779  case Remote: // pack to remote peer memory
780  packBuffer[2*dim+dir] = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + precision*ghostOffset[dim][1-dir];
781  break;
782  default: errorQuda("Undefined location %d", location[2*dim+dir]);
783  }
784  }
785  }
786 
787  PackGhost(packBuffer, *this, location_label, nFace, dagger, parity, spin_project, a, b, c, *stream);
788 
789 #else
790  errorQuda("packGhost not built on single-GPU build");
791 #endif
792  }
793 
794  // send the ghost zone to the host
795  void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int nFace, const int dim,
796  const QudaDirection dir, const int dagger,
797  cudaStream_t *stream) {
798 
799 #ifdef MULTI_GPU
800  if (precision != ghost_precision) { pushKernelPackT(true); }
801 
802  if (dim !=3 || getKernelPackT()) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
803 
804  void* gpu_buf = (dir == QUDA_BACKWARDS) ? my_face_dim_dir_d[bufferIndex][dim][0] : my_face_dim_dir_d[bufferIndex][dim][1];
805  qudaMemcpyAsync(ghost_spinor, gpu_buf, ghost_face_bytes[dim], cudaMemcpyDeviceToHost, *stream);
806 
807  } else {
808 
809  const int Nvec = (nSpin == 1 || ghost_precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
810  const int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
811  const int Npad = Nint / Nvec; // number Nvec buffers we have
812  const int nParity = siteSubset;
813  const int x4 = nDim==5 ? x[4] : 1;
814  const int Nt_minus1_offset = (volumeCB - nFace * ghostFaceCB[3]) / x4; // N_t-1 = Vh-Vsh
815 
816  int offset = 0;
817  if (nSpin == 1) {
818  offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
819  } else if (nSpin == 4) {
820  // !dagger: send lower components backwards, send upper components forwards
821  // dagger: send upper components backwards, send lower components forwards
822  bool upper = dagger ? true : false; // Fwd is !Back
823  if (dir == QUDA_FORWARDS) upper = !upper;
824  int lower_spin_offset = Npad*stride;
825  if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
826  else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
827  }
828 
829  size_t len = nFace * (ghostFaceCB[3] / x4) * Nvec * ghost_precision;
830  size_t dpitch = x4*len;
831  size_t spitch = stride*Nvec*ghost_precision;
832 
833  // QUDA Memcpy NPad's worth.
834  // -- Dest will point to the right beginning PAD.
835  // -- Each Pad has size Nvec*Vsh Floats.
836  // -- There is Nvec*Stride Floats from the start of one PAD to the start of the next
837 
838  for (int parity = 0; parity < nParity; parity++) {
839  for (int s = 0; s < x4; s++) { // loop over multiple 4-d volumes (if they exist)
840  void *dst = (char *)ghost_spinor + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
841  void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
842  qudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
843 
844  // we can probably issue this as a single cudaMemcpy2d along the fifth dimension
845  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) {
846  size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
847  int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset * sizeof(float);
848  void *dst = (char *)ghost_spinor + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len
849  + parity * nFace * ghostFaceCB[3] * sizeof(float);
850  void *src = (char *)norm + norm_offset + s * (volumeCB / x4) * sizeof(float) + parity * norm_bytes / 2;
851  qudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
852  }
853  } // fifth dimension
854  } // parity
855  }
856 
857  if (precision != ghost_precision) { popKernelPackT(); }
858 
859 #else
860  errorQuda("sendGhost not built on single-GPU build");
861 #endif
862 
863  }
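  // --- Illustrative note (example numbers, not part of the original file): for a
  // 4-d single-precision Wilson field (nSpin = 4, nColor = 3, x4 = 1) the strided
  // path above uses
  //   Nvec   = 4 floats per vector,
  //   Nint   = 3*4*2/2 = 12 floats per site after spin projection,
  //   Npad   = Nint/Nvec = 3 planes per face,
  //   len    = nFace * ghostFaceCB[3] * Nvec * ghost_precision bytes per plane,
  //   spitch = stride * Nvec * ghost_precision (plane-to-plane distance in the body),
  // so each qudaMemcpy2DAsync gathers the Npad planes of one face into a
  // contiguous host buffer.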
864 
865 
866  void cudaColorSpinorField::unpackGhost(const void* ghost_spinor, const int nFace,
867  const int dim, const QudaDirection dir,
868  const int dagger, cudaStream_t* stream)
869  {
870  const void *src = ghost_spinor;
871 
872  int ghost_offset = (dir == QUDA_BACKWARDS) ? ghostOffset[dim][0] : ghostOffset[dim][1];
873  void *ghost_dst = (char*)ghost_recv_buffer_d[bufferIndex] + ghost_precision*ghost_offset;
874 
875  qudaMemcpyAsync(ghost_dst, src, ghost_face_bytes[dim], cudaMemcpyHostToDevice, *stream);
876  }
877 
878 
879  // pack the ghost zone into a contiguous buffer for communications
880  void cudaColorSpinorField::packGhostExtended(const int nFace, const int R[], const QudaParity parity,
881  const int dim, const QudaDirection dir,
882  const int dagger, cudaStream_t *stream, bool zero_copy)
883  {
884  errorQuda("not implemented");
885  }
886 
887 
888  // copy data from host buffer into boundary region of device field
889  void cudaColorSpinorField::unpackGhostExtended(const void* ghost_spinor, const int nFace, const QudaParity parity,
890  const int dim, const QudaDirection dir,
891  const int dagger, cudaStream_t* stream, bool zero_copy)
892  {
893  errorQuda("not implemented");
894  }
895 
896 
897  cudaStream_t *stream;
898 
899  void cudaColorSpinorField::createComms(int nFace, bool spin_project) {
900 
901  allocateGhostBuffer(nFace,spin_project); // allocate the ghost buffer if not yet allocated
902 
903  // ascertain if this instance needs its comms buffers to be updated
904  bool comms_reset = ghost_field_reset || // FIXME add send buffer check
907  || (my_face_d[0] != ghost_send_buffer_d[0]) || (my_face_d[1] != ghost_send_buffer_d[1]) || // send buffers
908  (from_face_d[0] != ghost_recv_buffer_d[0]) || (from_face_d[1] != ghost_recv_buffer_d[1]) || // receive buffers
909  ghost_precision_reset; // ghost_precision has changed
910 
911  if (!initComms || comms_reset) {
912 
914 
915  // reinitialize the ghost receive pointers
916  for (int i=0; i<nDimComms; ++i) {
917  if (commDimPartitioned(i)) {
918  for (int b=0; b<2; b++) {
919  ghost[b][i] = static_cast<char*>(ghost_recv_buffer_d[b]) + ghostOffset[i][0]*ghost_precision;
921  ghostNorm[b][i] = static_cast<char*>(ghost_recv_buffer_d[b]) + ghostNormOffset[i][0]*QUDA_SINGLE_PRECISION;
922  }
923  }
924  }
925 
926  ghost_precision_reset = false;
927  }
928 
930  createIPCComms();
931  }
932 
933  void cudaColorSpinorField::streamInit(cudaStream_t *stream_p) {
934  stream = stream_p;
935  }
936 
937  void cudaColorSpinorField::pack(int nFace, int parity, int dagger, int stream_idx,
938  MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label,
939  bool spin_project, double a, double b, double c)
940  {
941  createComms(nFace, spin_project); // must call this first
942 
943  const int dim=-1; // pack all partitioned dimensions
944 
945  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[stream_idx], location, location_label,
946  spin_project, a, b, c);
947  }
948 
949  void cudaColorSpinorField::packExtended(const int nFace, const int R[], const int parity,
950  const int dagger, const int dim,
951  cudaStream_t *stream_p, const bool zero_copy)
952  {
953  createComms(nFace); // must call this first
954 
955  stream = stream_p;
956 
957  packGhostExtended(nFace, R, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[zero_copy ? 0 : (Nstream-1)], zero_copy);
958  }
959 
960  void cudaColorSpinorField::gather(int nFace, int dagger, int dir, cudaStream_t* stream_p)
961  {
962  int dim = dir/2;
963 
964  // If stream_p != 0, use pack_stream, else use the stream array
965  cudaStream_t *pack_stream = (stream_p) ? stream_p : stream+dir;
966 
967  if (dir%2 == 0) {
968  // backwards copy to host
969  if (comm_peer2peer_enabled(0,dim)) return;
970 
971  sendGhost(my_face_dim_dir_h[bufferIndex][dim][0], nFace, dim, QUDA_BACKWARDS, dagger, pack_stream);
972  } else {
973  // forwards copy to host
974  if (comm_peer2peer_enabled(1,dim)) return;
975 
976  sendGhost(my_face_dim_dir_h[bufferIndex][dim][1], nFace, dim, QUDA_FORWARDS, dagger, pack_stream);
977  }
978  }
979 
980 
981  void cudaColorSpinorField::recvStart(int nFace, int d, int dagger, cudaStream_t* stream_p, bool gdr) {
982 
983  // note this is scatter centric, so dir=0 (1) is send backwards
984  // (forwards) and receive from forwards (backwards)
985 
986  int dim = d/2;
987  int dir = d%2;
988  if (!commDimPartitioned(dim)) return;
989  if (gdr && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
990 
991  if (dir == 0) { // receive from forwards
992  // receive from the processor in the +1 direction
993  if (comm_peer2peer_enabled(1,dim)) {
994  comm_start(mh_recv_p2p_fwd[bufferIndex][dim]);
995  } else if (gdr) {
996  comm_start(mh_recv_rdma_fwd[bufferIndex][dim]);
997  } else {
998  comm_start(mh_recv_fwd[bufferIndex][dim]);
999  }
1000  } else { // receive from backwards
1001  // receive from the processor in the -1 direction
1002  if (comm_peer2peer_enabled(0,dim)) {
1003  comm_start(mh_recv_p2p_back[bufferIndex][dim]);
1004  } else if (gdr) {
1005  comm_start(mh_recv_rdma_back[bufferIndex][dim]);
1006  } else {
1007  comm_start(mh_recv_back[bufferIndex][dim]);
1008  }
1009  }
1010  }
1011 
1012 
1013  void cudaColorSpinorField::sendStart(int nFace, int d, int dagger, cudaStream_t* stream_p, bool gdr, bool remote_write) {
1014 
1015  // note this is scatter centric, so dir=0 (1) is send backwards
1016  // (forwards) and receive from forwards (backwards)
1017 
1018  int dim = d/2;
1019  int dir = d%2;
1020  if (!commDimPartitioned(dim)) return;
1021  if (gdr && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
1022 
1023  int Nvec = (nSpin == 1 || ghost_precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
1024  int Nint = (nColor * nSpin * 2)/(nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
1025  int Npad = Nint/Nvec;
1026 
1027  if (!comm_peer2peer_enabled(dir,dim)) {
1028  if (dir == 0)
1029  if (gdr) comm_start(mh_send_rdma_back[bufferIndex][dim]);
1030  else comm_start(mh_send_back[bufferIndex][dim]);
1031  else
1032  if (gdr) comm_start(mh_send_rdma_fwd[bufferIndex][dim]);
1033  else comm_start(mh_send_fwd[bufferIndex][dim]);
1034  } else { // doing peer-to-peer
1035  cudaStream_t *copy_stream = (stream_p) ? stream_p : stream + d;
1036 
1037  // if not using copy engine then the packing kernel will remotely write the halos
1038  if (!remote_write) {
1039  // all goes here
1040  void* ghost_dst = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir])
1041  + ghost_precision*ghostOffset[dim][(dir+1)%2];
1042 
1044 
1045  if (dim != 3 || getKernelPackT()) {
1046 
1047  void* ghost_dst = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir])
1048  + ghost_precision*ghostOffset[dim][(dir+1)%2];
1049  cudaMemcpyAsync(ghost_dst,
1050  my_face_dim_dir_d[bufferIndex][dim][dir],
1051  ghost_face_bytes[dim],
1052  cudaMemcpyDeviceToDevice,
1053  *copy_stream); // copy to forward processor
1054 
1055  } else {
1056 
1057  const int nParity = siteSubset;
1058  const int x4 = nDim==5 ? x[4] : 1;
1059  const int Nt_minus_offset = (volumeCB - nFace * ghostFaceCB[3]) / x4;
1060 
1061  int offset = 0;
1062  if (nSpin == 1) {
1063  offset = (dir == 0) ? 0 : Nt_minus_offset;
1064  } else if (nSpin == 4) {
1065  // !dagger: send lower components backwards, send upper components forwards
1066  // dagger: send upper components backwards, send lower components forwards
1067  bool upper = dagger ? true : false;
1068  if (dir == 1) upper = !upper;
1069  int lower_spin_offset = Npad*stride;
1070  if (upper)
1071  offset = (dir == 0 ? 0 : Nt_minus_offset);
1072  else
1073  offset = lower_spin_offset + (dir == 0 ? 0 : Nt_minus_offset);
1074  }
1075 
1076  size_t len = nFace * (ghostFaceCB[3] / x4) * Nvec * ghost_precision;
1077  size_t dpitch = x4*len;
1078  size_t spitch = stride*Nvec*ghost_precision;
1079 
1080  for (int parity = 0; parity < nParity; parity++) {
1081  for (int s = 0; s < x4; s++) {
1082  void *dst = (char *)ghost_dst + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
1083  void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
1084  // start the copy
1085  cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToDevice, *copy_stream);
1086 
1087  // we can probably issue this as a single cudaMemcpy2d along the fifth dimension
1088  if (ghost_precision == QUDA_HALF_PRECISION || ghost_precision == QUDA_QUARTER_PRECISION) {
1089  size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
1090  int norm_offset = (dir == 0) ? 0 : Nt_minus_offset * sizeof(float);
1091  void *dst = (char *)ghost_dst + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len
1092  + parity * nFace * ghostFaceCB[3] * sizeof(float);
1093  void *src = (char *)norm + norm_offset + s * (volumeCB / x4) * sizeof(float) + parity * norm_bytes / 2;
1094  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToDevice, *copy_stream);
1095  }
1096  }
1097  } // fifth dimension
1098  } // parity
1099  } // remote_write
1100 
1102 
1103  if (dir == 0) {
1104  // record the event
1105  qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], *copy_stream);
1106  // send to the processor in the -1 direction
1107  comm_start(mh_send_p2p_back[bufferIndex][dim]);
1108  } else {
1109  qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], *copy_stream);
1110  // send to the processor in the +1 direction
1111  comm_start(mh_send_p2p_fwd[bufferIndex][dim]);
1112  }
1113  }
1114  }
1115 
1116  void cudaColorSpinorField::commsStart(int nFace, int dir, int dagger, cudaStream_t* stream_p, bool gdr_send, bool gdr_recv) {
1117  recvStart(nFace, dir, dagger, stream_p, gdr_recv);
1118  sendStart(nFace, dir, dagger, stream_p, gdr_send);
1119  }
1120 
1121 
1122  static bool complete_recv_fwd[QUDA_MAX_DIM] = { };
1123  static bool complete_recv_back[QUDA_MAX_DIM] = { };
1124  static bool complete_send_fwd[QUDA_MAX_DIM] = { };
1125  static bool complete_send_back[QUDA_MAX_DIM] = { };
1126 
1127  int cudaColorSpinorField::commsQuery(int nFace, int d, int dagger, cudaStream_t *stream_p, bool gdr_send, bool gdr_recv) {
1128 
1129  // note this is scatter centric, so dir=0 (1) is send backwards
1130  // (forwards) and receive from forwards (backwards)
1131 
1132  int dim = d/2;
1133  int dir = d%2;
1134 
1135  if (!commDimPartitioned(dim)) return 1;
1136  if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
1137 
1138  if (dir==0) {
1139 
1140  // first query send to backwards
1141  if (comm_peer2peer_enabled(0,dim)) {
1142  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_p2p_back[bufferIndex][dim]);
1143  } else if (gdr_send) {
1144  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_rdma_back[bufferIndex][dim]);
1145  } else {
1146  if (!complete_send_back[dim]) complete_send_back[dim] = comm_query(mh_send_back[bufferIndex][dim]);
1147  }
1148 
1149  // second query receive from forwards
1150  if (comm_peer2peer_enabled(1,dim)) {
1151  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_p2p_fwd[bufferIndex][dim]);
1152  } else if (gdr_recv) {
1153  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_rdma_fwd[bufferIndex][dim]);
1154  } else {
1155  if (!complete_recv_fwd[dim]) complete_recv_fwd[dim] = comm_query(mh_recv_fwd[bufferIndex][dim]);
1156  }
1157 
1158  if (complete_recv_fwd[dim] && complete_send_back[dim]) {
1159  complete_send_back[dim] = false;
1160  complete_recv_fwd[dim] = false;
1161  return 1;
1162  }
1163 
1164  } else { // dir == 1
1165 
1166  // first query send to forwards
1167  if (comm_peer2peer_enabled(1,dim)) {
1168  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_p2p_fwd[bufferIndex][dim]);
1169  } else if (gdr_send) {
1170  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_rdma_fwd[bufferIndex][dim]);
1171  } else {
1172  if (!complete_send_fwd[dim]) complete_send_fwd[dim] = comm_query(mh_send_fwd[bufferIndex][dim]);
1173  }
1174 
1175  // second query receive from backwards
1176  if (comm_peer2peer_enabled(0,dim)) {
1177  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_p2p_back[bufferIndex][dim]);
1178  } else if (gdr_recv) {
1179  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_rdma_back[bufferIndex][dim]);
1180  } else {
1181  if (!complete_recv_back[dim]) complete_recv_back[dim] = comm_query(mh_recv_back[bufferIndex][dim]);
1182  }
1183 
1184  if (complete_recv_back[dim] && complete_send_fwd[dim]) {
1185  complete_send_fwd[dim] = false;
1186  complete_recv_back[dim] = false;
1187  return 1;
1188  }
1189 
1190  }
1191 
1192  return 0;
1193  }
1194 
1195  void cudaColorSpinorField::commsWait(int nFace, int d, int dagger, cudaStream_t *stream_p, bool gdr_send, bool gdr_recv) {
1196 
1197  // note this is scatter centric, so dir=0 (1) is send backwards
1198  // (forwards) and receive from forwards (backwards)
1199 
1200  int dim = d/2;
1201  int dir = d%2;
1202 
1203  if (!commDimPartitioned(dim)) return;
1204  if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
1205 
1206  if (dir==0) {
1207 
1208  // first wait on send to backwards
1209  if (comm_peer2peer_enabled(0,dim)) {
1210  comm_wait(mh_send_p2p_back[bufferIndex][dim]);
1211  cudaEventSynchronize(ipcCopyEvent[bufferIndex][0][dim]);
1212  } else if (gdr_send) {
1213  comm_wait(mh_send_rdma_back[bufferIndex][dim]);
1214  } else {
1215  comm_wait(mh_send_back[bufferIndex][dim]);
1216  }
1217 
1218  // second wait on receive from forwards
1219  if (comm_peer2peer_enabled(1,dim)) {
1220  comm_wait(mh_recv_p2p_fwd[bufferIndex][dim]);
1221  cudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][1][dim]);
1222  } else if (gdr_recv) {
1223  comm_wait(mh_recv_rdma_fwd[bufferIndex][dim]);
1224  } else {
1225  comm_wait(mh_recv_fwd[bufferIndex][dim]);
1226  }
1227 
1228  } else {
1229 
1230  // first wait on send to forwards
1231  if (comm_peer2peer_enabled(1,dim)) {
1232  comm_wait(mh_send_p2p_fwd[bufferIndex][dim]);
1233  cudaEventSynchronize(ipcCopyEvent[bufferIndex][1][dim]);
1234  } else if (gdr_send) {
1235  comm_wait(mh_send_rdma_fwd[bufferIndex][dim]);
1236  } else {
1237  comm_wait(mh_send_fwd[bufferIndex][dim]);
1238  }
1239 
1240  // second wait on receive from backwards
1241  if (comm_peer2peer_enabled(0,dim)) {
1242  comm_wait(mh_recv_p2p_back[bufferIndex][dim]);
1243  cudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][0][dim]);
1244  } else if (gdr_recv) {
1245  comm_wait(mh_recv_rdma_back[bufferIndex][dim]);
1246  } else {
1247  comm_wait(mh_recv_back[bufferIndex][dim]);
1248  }
1249 
1250  }
1251 
1252  return;
1253  }
1254 
1255  void cudaColorSpinorField::scatter(int nFace, int dagger, int dim_dir, cudaStream_t* stream_p)
1256  {
1257  // note this is scatter centric, so input expects dir=0 (1) is send backwards
1258  // (forwards) and receive from forwards (backwards), so here we need to flip to receive centric
1259 
1260  int dim = dim_dir/2;
1261  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 receive from backwards
1262  if (!commDimPartitioned(dim)) return;
1263 
1264  if (comm_peer2peer_enabled(dir,dim)) return;
1265  unpackGhost(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, stream_p);
1266  }
1267 
1268  void cudaColorSpinorField::scatter(int nFace, int dagger, int dim_dir)
1269  {
1270  // note this is scatter centric, so dir=0 (1) is send backwards
1271  // (forwards) and receive from forwards (backwards), so here we need to flip to receive centric
1272 
1273  int dim = dim_dir/2;
1274  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 receive from backwards
1275  if (!commDimPartitioned(dim)) return;
1276 
1277  if (comm_peer2peer_enabled(dir,dim)) return;
1278  unpackGhost(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, &stream[dim_dir]);
1279  }
1280 
1281  void cudaColorSpinorField::scatterExtended(int nFace, int parity, int dagger, int dim_dir)
1282  {
1283  bool zero_copy = false;
1284  int dim = dim_dir/2;
1285  int dir = (dim_dir+1)%2; // dir = 1 - receive from forwards, dir == 0 receive from backwards
1286  if (!commDimPartitioned(dim)) return;
1287  unpackGhostExtended(from_face_dim_dir_h[bufferIndex][dim][dir], nFace, static_cast<QudaParity>(parity), dim, dir == 0 ? QUDA_BACKWARDS : QUDA_FORWARDS, dagger, &stream[2*dim/*+0*/], zero_copy);
1288  }
1289 
1290  void cudaColorSpinorField::exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination_,
1291  const MemoryLocation *halo_location_, bool gdr_send, bool gdr_recv,
1292  QudaPrecision ghost_precision_) const {
1293 
1294  // we are overriding the ghost precision, and it doesn't match what has already been allocated
1295  if (ghost_precision_ != QUDA_INVALID_PRECISION && ghost_precision != ghost_precision_) {
1296  ghost_precision_reset = true;
1297  ghost_precision = ghost_precision_;
1298  }
1299 
1300  // not overriding the ghost precision, but we did previously so need to update
1302  ghost_precision_reset = true;
1304  }
1305 
1306  if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
1307  pushKernelPackT(true); // ensure kernel packing is enabled for all dimensions
1308  const_cast<cudaColorSpinorField&>(*this).streamInit(streams); // ensures streams are set (needed for p2p)
1309  const_cast<cudaColorSpinorField&>(*this).createComms(nFace, false);
1310 
1311  // first set default values to device if needed
1312  MemoryLocation pack_destination[2*QUDA_MAX_DIM], halo_location[2*QUDA_MAX_DIM];
1313  for (int i=0; i<2*nDimComms; i++) {
1314  pack_destination[i] = pack_destination_ ? pack_destination_[i] : Device;
1315  halo_location[i] = halo_location_ ? halo_location_[i] : Device;
1316  }
1317 
1318  // Contiguous send buffers and we aggregate copies to reduce
1319  // latency. Only if all locations are "Device" and no p2p
1320  bool fused_pack_memcpy = true;
1321 
1322  // Contiguous recv buffers and we aggregate copies to reduce
1323  // latency. Only if all locations are "Device" and no p2p
1324  bool fused_halo_memcpy = true;
1325 
1326  bool pack_host = false; // set to true if any of the ghost packing is being done to Host memory
1327  bool halo_host = false; // set to true if the final halos will be left in Host memory
1328 
1329  void *send[2*QUDA_MAX_DIM];
1330  for (int d=0; d<nDimComms; d++) {
1331  for (int dir=0; dir<2; dir++) {
1332  send[2*d+dir] = pack_destination[2*d+dir] == Host ? my_face_dim_dir_hd[bufferIndex][d][dir] : my_face_dim_dir_d[bufferIndex][d][dir];
1333  ghost_buf[2*d+dir] = halo_location[2*d+dir] == Host ? from_face_dim_dir_hd[bufferIndex][d][dir] : from_face_dim_dir_d[bufferIndex][d][dir];
1334  }
1335 
1336  // if doing p2p, then we must pack to and load the halo from device memory
1337  for (int dir=0; dir<2; dir++) {
1338  if (comm_peer2peer_enabled(dir,d)) { pack_destination[2*d+dir] = Device; halo_location[2*d+1-dir] = Device; }
1339  }
1340 
1341  // if zero-copy packing or p2p is enabled then we cannot do fused memcpy
1342  if (pack_destination[2*d+0] != Device || pack_destination[2*d+1] != Device || comm_peer2peer_enabled_global()) fused_pack_memcpy = false;
1343  // if zero-copy halo read or p2p is enabled then we cannot do fused memcpy
1344  if (halo_location[2*d+0] != Device || halo_location[2*d+1] != Device || comm_peer2peer_enabled_global()) fused_halo_memcpy = false;
1345 
1346  if (pack_destination[2*d+0] == Host || pack_destination[2*d+1] == Host) pack_host = true;
1347  if (halo_location[2*d+0] == Host || halo_location[2*d+1] == Host) halo_host = true;
1348  }
1349 
1350  // Error if zero-copy and p2p for now
1351  if ( (pack_host || halo_host) && comm_peer2peer_enabled_global()) errorQuda("Cannot use zero-copy memory with peer-to-peer comms yet");
1352 
1353  genericPackGhost(send, *this, parity, nFace, dagger, pack_destination); // FIXME - need support for asymmetric topologies
1354 
1355  size_t total_bytes = 0;
1356  for (int i=0; i<nDimComms; i++) if (comm_dim_partitioned(i)) total_bytes += 2*ghost_face_bytes[i]; // 2 for fwd/bwd
1357 
1358  if (!gdr_send) {
1359  if (!fused_pack_memcpy) {
1360  for (int i=0; i<nDimComms; i++) {
1361  if (comm_dim_partitioned(i)) {
1362  if (pack_destination[2*i+0] == Device && !comm_peer2peer_enabled(0,i) && // fuse forwards and backwards if possible
1363  pack_destination[2*i+1] == Device && !comm_peer2peer_enabled(1,i)) {
1364  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][0], my_face_dim_dir_d[bufferIndex][i][0], 2*ghost_face_bytes[i], cudaMemcpyDeviceToHost, 0);
1365  } else {
1366  if (pack_destination[2*i+0] == Device && !comm_peer2peer_enabled(0,i))
1367  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][0], my_face_dim_dir_d[bufferIndex][i][0], ghost_face_bytes[i], cudaMemcpyDeviceToHost, 0);
1368  if (pack_destination[2*i+1] == Device && !comm_peer2peer_enabled(1,i))
1369  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][i][1], my_face_dim_dir_d[bufferIndex][i][1], ghost_face_bytes[i], cudaMemcpyDeviceToHost, 0);
1370  }
1371  }
1372  }
1373  } else if (total_bytes && !pack_host) {
1374  cudaMemcpyAsync(my_face_h[bufferIndex], ghost_send_buffer_d[bufferIndex], total_bytes, cudaMemcpyDeviceToHost, 0);
1375  }
1376  }
1377 
1378  // prepost receive
1379  for (int i=0; i<2*nDimComms; i++) const_cast<cudaColorSpinorField*>(this)->recvStart(nFace, i, dagger, 0, gdr_recv);
1380 
1381  bool sync = pack_host ? true : false; // no p2p if pack_host so we need to synchronize
1382  // if not p2p in any direction then need to synchronize before MPI
1383  for (int i=0; i<nDimComms; i++) if (!comm_peer2peer_enabled(0,i) || !comm_peer2peer_enabled(1,i)) sync = true;
1384  if (sync) qudaDeviceSynchronize(); // need to make sure packing and/or memcpy has finished before kicking off MPI
1385 
1386  for (int p2p=0; p2p<2; p2p++) {
1387  for (int dim=0; dim<nDimComms; dim++) {
1388  for (int dir=0; dir<2; dir++) {
1389  if ( (comm_peer2peer_enabled(dir,dim) + p2p) % 2 == 0 ) { // issue non-p2p transfers first
1390  const_cast<cudaColorSpinorField*>(this)->sendStart(nFace, 2*dim+dir, dagger, 0, gdr_send);
1391  }
1392  }
1393  }
1394  }
1395 
1396  bool comms_complete[2*QUDA_MAX_DIM] = { };
1397  int comms_done = 0;
1398  while (comms_done < 2*nDimComms) { // non-blocking query of each exchange and exit once all have completed
1399  for (int dim=0; dim<nDimComms; dim++) {
1400  for (int dir=0; dir<2; dir++) {
1401  if (!comms_complete[dim*2+dir]) {
1402  comms_complete[2*dim+dir] = const_cast<cudaColorSpinorField*>(this)->commsQuery(nFace, 2*dim+dir, dagger, 0, gdr_send, gdr_recv);
1403  if (comms_complete[2*dim+dir]) {
1404  comms_done++;
1405  if (comm_peer2peer_enabled(1-dir,dim)) qudaStreamWaitEvent(0, ipcRemoteCopyEvent[bufferIndex][1-dir][dim], 0);
1406  }
1407  }
1408  }
1409  }
1410  }
1411 
1412  if (!gdr_recv) {
1413  if (!fused_halo_memcpy) {
1414  for (int i=0; i<nDimComms; i++) {
1415  if (comm_dim_partitioned(i)) {
1416  if (halo_location[2*i+0] == Device && !comm_peer2peer_enabled(0,i) && // fuse forwards and backwards if possible
1417  halo_location[2*i+1] == Device && !comm_peer2peer_enabled(1,i)) {
1418  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][0], from_face_dim_dir_h[bufferIndex][i][0], 2*ghost_face_bytes[i], cudaMemcpyHostToDevice, 0);
1419  } else {
1420  if (halo_location[2*i+0] == Device && !comm_peer2peer_enabled(0,i))
1421  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][0], from_face_dim_dir_h[bufferIndex][i][0], ghost_face_bytes[i], cudaMemcpyHostToDevice, 0);
1422  if (halo_location[2*i+1] == Device && !comm_peer2peer_enabled(1,i))
1423  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][i][1], from_face_dim_dir_h[bufferIndex][i][1], ghost_face_bytes[i], cudaMemcpyHostToDevice, 0);
1424  }
1425  }
1426  }
1427  } else if (total_bytes && !halo_host) {
1428  cudaMemcpyAsync(ghost_recv_buffer_d[bufferIndex], from_face_h[bufferIndex], total_bytes, cudaMemcpyHostToDevice, 0);
1429  }
1430  }
1431 
1432  popKernelPackT(); // restore kernel packing
1433  }
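  // --- Illustrative sketch (not original code): a typical call to the unified
  // halo exchange above, using the default device-resident pack and halo buffers.
  // nFace = 1 corresponds to a nearest-neighbour (Wilson-like) stencil.
#if 0
  void example_exchange_ghost(const cudaColorSpinorField &spinor)
  {
    const int nFace = 1;
    const int dagger = 0;
    spinor.exchangeGhost(QUDA_EVEN_PARITY, nFace, dagger); // pack, MPI/p2p exchange, unpack
  }
#endif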
1434 
1435  std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
1436  out << (const ColorSpinorField&)a;
1437  out << "v = " << a.v << std::endl;
1438  out << "norm = " << a.norm << std::endl;
1439  out << "alloc = " << a.alloc << std::endl;
1440  out << "init = " << a.init << std::endl;
1441  return out;
1442  }
1443 
1445  cudaColorSpinorField& cudaColorSpinorField::Component(const int idx) const {
1446 
1447  if (this->IsComposite()) {
1448  if (idx < this->CompositeDim()) { // setup eigenvector from the set
1449  return *(dynamic_cast<cudaColorSpinorField*>(components[idx]));
1450  }
1451  else{
1452  errorQuda("Incorrect component index...");
1453  }
1454  }
1455  errorQuda("Cannot get requested component");
1456  exit(-1);
1457  }
1458 
1459 // copyCuda currently cannot work with a set of spinor fields.
1460  void cudaColorSpinorField::CopySubset(cudaColorSpinorField &dst, const int range, const int first_element) const{
1461 #if 0
1462  if (first_element < 0) errorQuda("\nError: trying to set negative first element.\n");
1463  if (siteSubset == QUDA_PARITY_SITE_SUBSET && this->EigvId() == -1) {
1464  if (first_element == 0 && range == this->EigvDim())
1465  {
1466  if (range != dst.EigvDim()) errorQuda("\nError: eigenvector range too big.\n");
1467  checkField(dst, *this);
1468  copyCuda(dst, *this);
1469  }
1470  else if ((first_element+range) < this->EigvDim())
1471  {//setup eigenvector subset
1472 
1473  cudaColorSpinorField *eigv_subset;
1474 
1476 
1477  param.nColor = nColor;
1478  param.nSpin = nSpin;
1479  param.twistFlavor = twistFlavor;
1480  param.precision = precision;
1481  param.nDim = nDim;
1482  param.pad = pad;
1483  param.siteSubset = siteSubset;
1484  param.siteOrder = siteOrder;
1485  param.fieldOrder = fieldOrder;
1486  param.gammaBasis = gammaBasis;
1487  memcpy(param.x, x, nDim*sizeof(int));
1489 
1490  param.eigv_dim = range;
1491  param.eigv_id = -1;
1492  param.v = (void*)((char*)v + first_element*eigv_bytes);
1493  param.norm = (void*)((char*)norm + first_element*eigv_norm_bytes);
1494 
1495  eigv_subset = new cudaColorSpinorField(param);
1496 
1497  //Not really needed:
1498  eigv_subset->eigenvectors.reserve(param.eigv_dim);
1499  for (int id = first_element; id < (first_element+range); id++)
1500  {
1501  param.eigv_id = id;
1502  eigv_subset->eigenvectors.push_back(new cudaColorSpinorField(*this, param));
1503  }
1504  checkField(dst, *eigv_subset);
1505  copyCuda(dst, *eigv_subset);
1506 
1507  delete eigv_subset;
1508  } else {
1509  errorQuda("Incorrect eigenvector dimension...");
1510  }
1511  } else{
1512  errorQuda("Eigenvector must be a parity spinor");
1513  exit(-1);
1514  }
1515 #endif
1516  }
1517 
1518  void cudaColorSpinorField::getTexObjectInfo() const
1519  {
1520 #ifdef USE_TEXTURE_OBJECTS
1521  printfQuda("\nPrint texture info for the field:\n");
1522  std::cout << *this;
1523  cudaResourceDesc resDesc;
1524  //memset(&resDesc, 0, sizeof(resDesc));
1525  cudaGetTextureObjectResourceDesc(&resDesc, this->Tex());
1526  printfQuda("\nDevice pointer: %p\n", resDesc.res.linear.devPtr);
1527  printfQuda("\nVolume (in bytes): %lu\n", resDesc.res.linear.sizeInBytes);
1528  if (resDesc.resType == cudaResourceTypeLinear) printfQuda("\nResource type: linear \n");
1529 #endif
1530  }
1531 
1532  void cudaColorSpinorField::Source(const QudaSourceType sourceType, const int st, const int s, const int c) {
1533  ColorSpinorParam param(*this);
1537  // since CPU fields cannot be low precision, use single precision instead
1539 
1540  cpuColorSpinorField tmp(param);
1541  tmp.Source(sourceType, st, s, c);
1542  *this = tmp;
1543  }
1544 
1545  void cudaColorSpinorField::PrintVector(unsigned int i) const { genericCudaPrintVector(*this, i); }
1546 
1547 } // namespace quda
Definition: lattice_field.h:67
const int * R() const
bool is_composite
for deflation solvers:
int ghostFaceCB[QUDA_MAX_DIM]
enum QudaDirection_s QudaDirection
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
static bool complete_send_back[QUDA_MAX_DIM]
#define qudaDeviceSynchronize()
ColorSpinorField * odd
static bool ghost_field_reset
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
QudaFieldLocation location
static int bufferIndex
static void checkField(const ColorSpinorField &, const ColorSpinorField &)
void PrintVector(unsigned int x) const
void createComms(int nFace, bool spin_project=true)
Create the communication handlers and buffers.
void reset(const ColorSpinorParam &)
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
friend std::ostream & operator<<(std::ostream &out, const cudaColorSpinorField &)
QudaSiteSubset SiteSubset() const
void saveSpinorField(ColorSpinorField &src) const
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:216
#define pool_device_malloc(size)
Definition: malloc_quda.h:125
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
void packExtended(const int nFace, const int R[], const int parity, const int dagger, const int dim, cudaStream_t *stream_p, const bool zeroCopyPack=false)
void * ghost[2][QUDA_MAX_DIM]
ColorSpinorField & operator=(const ColorSpinorField &)
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
enum QudaParity_s QudaParity
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31
size_t ghost_face_bytes[QUDA_MAX_DIM]
void sendStart(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false, bool remote_write=false)
Initiate halo communication sending.
cudaColorSpinorField & Component(const int idx) const
for composite fields:
void PackGhost(void *ghost[2 *QUDA_MAX_DIM], const ColorSpinorField &field, MemoryLocation location, int nFace, bool dagger, int parity, bool spin_project, double a, double b, double c, const cudaStream_t &stream)
Dslash face packing routine.
QudaTwistFlavorType twistFlavor
static void destroyIPCComms()
void zeroPad()
Zero the padded regions added on to the field. Ensures correct reductions and silences false positive...
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
void create(const QudaFieldCreate)
void * memset(void *s, int c, size_t n)
virtual ColorSpinorField & operator=(const ColorSpinorField &)
void packGhost(const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, MemoryLocation location[2 *QUDA_MAX_DIM], MemoryLocation location_label, bool spin_project, double a=0, double b=0, double c=0)
Packs the cudaColorSpinorField&#39;s ghost zone.
bool comm_peer2peer_enabled(int dir, int dim)
void unpackGhostExtended(const void *ghost_spinor, const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, bool zero_copy)
static void * ghost_send_buffer_d[2]
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:127
void restore() const
Restores the cudaColorSpinorField.
#define qudaMemcpyAsync(dst, src, count, kind, stream)
Definition: quda_cuda_api.h:38
cpuColorSpinorField * out
int ghostOffset[QUDA_MAX_DIM][2]
void createGhostZone(int nFace, bool spin_project=true) const
QudaPrecision ghost_precision
void pack(int nFace, int parity, int dagger, int stream_idx, MemoryLocation location[], MemoryLocation location_label, bool spin_project=true, double a=0, double b=0, double c=0)
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
__shared__ float s[]
int comm_query(MsgHandle *mh)
Definition: comm_mpi.cpp:228
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
#define printfQuda(...)
Definition: util_quda.h:115
QudaMemoryType mem_type
Definition: lattice_field.h:73
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
void backup() const
Backs up the cudaColorSpinorField.
QudaTwistFlavorType twistFlavor
static bool complete_recv_fwd[QUDA_MAX_DIM]
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
enum QudaFieldCreate_s QudaFieldCreate
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
void packGhostExtended(const int nFace, const int R[], const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, bool zero_copy=false)
#define pool_device_free(ptr)
Definition: malloc_quda.h:126
cudaColorSpinorField(const cudaColorSpinorField &)
void pushKernelPackT(bool pack)
Definition: dslash_quda.cu:30
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:161
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void createComms(bool no_comms_fill=false, bool bidir=true)
#define mapped_malloc(size)
Definition: malloc_quda.h:68
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:222
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
void * ghost_buf[2 *QUDA_MAX_DIM]
void scatter(int nFace, int dagger, int dir, cudaStream_t *stream_p)
QudaDagType dagger
Definition: test_util.cpp:1620
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
void recvStart(int nFace, int dir, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false)
Initiate halo communication receive.
QudaParity parity
Definition: covdev_test.cpp:54
QudaMemoryType mem_type
QudaPrecision precision
int comm_peer2peer_enabled_global()
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
QudaFieldOrder FieldOrder() const
ColorSpinorField * even
static void * ghost_pinned_recv_buffer_h[2]
int comm_dim_partitioned(int dim)
void genericCudaPrintVector(const cudaColorSpinorField &a, unsigned x)
#define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream)
Definition: quda_cuda_api.h:43