QUDA  v0.7.0
A library for QCD on GPUs
cuda_color_spinor_field.cu
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <typeinfo>
4 
5 #include <color_spinor_field.h>
6 #include <blas_quda.h>
7 
8 #include <string.h>
9 #include <iostream>
10 #include <misc_helpers.h>
11 #include <face_quda.h>
12 #include <dslash_quda.h>
13 
14 #ifdef DEVICE_PACK
15 #define REORDER_LOCATION QUDA_CUDA_FIELD_LOCATION
16 #else
17 #define REORDER_LOCATION QUDA_CPU_FIELD_LOCATION
18 #endif
19 
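// When non-zero, zeroCopy selects zero-copy (mapped pinned host) transfers in
// loadSpinorField/saveSpinorField instead of staging through bufferDevice.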
20 int zeroCopy = 0;
21 
22 namespace quda {
23 
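 // Static ghost-zone buffers shared by all instances: ghostFaceBuffer[2] is a pair of
 // device allocations (indexed like the double-buffered pinned host buffers via bufferIndex),
 // and fwd/backGhostFaceBuffer[2][dim] are per-dimension pointers into those allocations,
 // set up in allocateGhostBuffer().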
25  int cudaColorSpinorField::initGhostFaceBuffer = 0;
26  void* cudaColorSpinorField::ghostFaceBuffer[2]; //gpu memory
27  void* cudaColorSpinorField::fwdGhostFaceBuffer[2][QUDA_MAX_DIM]; //pointers to ghostFaceBuffer
28  void* cudaColorSpinorField::backGhostFaceBuffer[2][QUDA_MAX_DIM]; //pointers to ghostFaceBuffer
29  size_t cudaColorSpinorField::ghostFaceBytes = 0;
30 
31  /*cudaColorSpinorField::cudaColorSpinorField() :
32  ColorSpinorField(), v(0), norm(0), alloc(false), init(false) {
33 
34  }*/
35 
36  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorParam &param) :
37  ColorSpinorField(param), alloc(false), init(true), texInit(false),
38  initComms(false), bufferMessageHandler(0), nFaceComms(0) {
39 
40  // this must come before create
41  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
42  v = param.v;
43  norm = param.norm;
44  }
45 
46  create(param.create);
47 
48  if (param.create == QUDA_NULL_FIELD_CREATE) {
49  // do nothing
50  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
51  zero();
52  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
53  // do nothing
54  } else if (param.create == QUDA_COPY_FIELD_CREATE){
55  errorQuda("not implemented");
56  }
58  }
59 
60  cudaColorSpinorField::cudaColorSpinorField(const cudaColorSpinorField &src) :
61  ColorSpinorField(src), alloc(false), init(true), texInit(false),
62  initComms(false), bufferMessageHandler(0), nFaceComms(0) {
63  create(QUDA_COPY_FIELD_CREATE);
64  copySpinorField(src);
65  }
66 
67  // creates a copy of src, any differences defined in param
68  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src,
69  const ColorSpinorParam &param) :
70  ColorSpinorField(src), alloc(false), init(true), texInit(false),
71  initComms(false), bufferMessageHandler(0), nFaceComms(0) {
72 
73  // can only override if we are not using a reference or parity special case
74  if (param.create != QUDA_REFERENCE_FIELD_CREATE ||
78  typeid(src) == typeid(cudaColorSpinorField) ) ||
79  (param.create == QUDA_REFERENCE_FIELD_CREATE && param.eigv_dim > 0)) {
80  reset(param);
81  } else {
82  errorQuda("Undefined behaviour"); // else silent bug possible?
83  }
84 
85  // This must be set before create is called
86  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
87  if (typeid(src) == typeid(cudaColorSpinorField)) {
88  v = (void*)src.V();
89  norm = (void*)src.Norm();
90  } else {
91  errorQuda("Cannot reference a non-cuda field");
92  }
93 
94  if (this->EigvDim() > 0)
95  {//setup eigenvector from the set
96  if(eigv_dim != this->EigvDim()) errorQuda("\nEigenvector set does not match..\n") ;//for debug only.
97  if(eigv_id > -1)
98  {
99  //printfQuda("\nSetting pointers for vector id %d\n", eigv_id); //for debug only.
100  v = (void*)((char*)v + eigv_id*bytes);
101  norm = (void*)((char*)norm + eigv_id*norm_bytes);
102  }
103  //do nothing for the eigenvector subset...
104  }
105  }
106 
107  create(param.create);
108 
109  if (param.create == QUDA_NULL_FIELD_CREATE) {
110  // do nothing
111  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
112  zero();
113  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
114  copySpinorField(src);
115  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
116  // do nothing
117  } else {
118  errorQuda("CreateType %d not implemented", param.create);
119  }
120 
121  }
122 
123  cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src)
124  : ColorSpinorField(src), alloc(false), init(true), texInit(false),
125  initComms(false), bufferMessageHandler(0), nFaceComms(0) {
126  create(QUDA_COPY_FIELD_CREATE);
127  copySpinorField(src);
128  }
129 
130  ColorSpinorField& cudaColorSpinorField::operator=(const ColorSpinorField &src) {
131  if (typeid(src) == typeid(cudaColorSpinorField)) {
132  *this = (dynamic_cast<const cudaColorSpinorField&>(src));
133  } else if (typeid(src) == typeid(cpuColorSpinorField)) {
134  *this = (dynamic_cast<const cpuColorSpinorField&>(src));
135  } else {
136  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
137  }
138  return *this;
139  }
140 
141  cudaColorSpinorField& cudaColorSpinorField::operator=(const cudaColorSpinorField &src) {
142  if (&src != this) {
143  // keep current attributes unless unset
144  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
145  destroy();
146  destroyComms(); // not sure if this is necessary
147  ColorSpinorField::operator=(src);
148  create(QUDA_COPY_FIELD_CREATE);
149  }
150  copySpinorField(src);
151  }
152  return *this;
153  }
154 
155  cudaColorSpinorField& cudaColorSpinorField::operator=(const cpuColorSpinorField &src) {
156  // keep current attributes unless unset
157  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
158  destroy();
159  ColorSpinorField::operator=(src);
160  create(QUDA_COPY_FIELD_CREATE);
161  }
162  loadSpinorField(src);
163  return *this;
164  }
165 
166  cudaColorSpinorField::~cudaColorSpinorField() {
167  destroyComms();
168  destroy();
169  }
170 
171  bool cudaColorSpinorField::isNative() const {
172 
173  if (precision == QUDA_DOUBLE_PRECISION) {
174  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
175  } else if (precision == QUDA_SINGLE_PRECISION) {
176  if (nSpin == 4) {
177  if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) return true;
178  } else if (nSpin == 1) {
179  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
180  }
181  } else if (precision == QUDA_HALF_PRECISION) {
182  if (nSpin == 4) {
183  if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) return true;
184  } else if (nSpin == 1) {
185  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
186  }
187  }
188 
189  return false;
190  }
191 
192  void cudaColorSpinorField::create(const QudaFieldCreate create) {
193 
194  if (siteSubset == QUDA_FULL_SITE_SUBSET && siteOrder != QUDA_EVEN_ODD_SITE_ORDER) {
195  errorQuda("Subset not implemented");
196  }
197 
198  if (create != QUDA_REFERENCE_FIELD_CREATE) {
199  v = device_malloc(bytes);
200  if (precision == QUDA_HALF_PRECISION) {
201  norm = device_malloc(norm_bytes);
202  }
203  alloc = true;
204  }
205 
206  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
207  if(eigv_dim != 0) errorQuda("Eigenvectors must be parity fields!");
208  // create the associated even and odd subsets
209  ColorSpinorParam param;
210  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
211  param.nDim = nDim;
212  memcpy(param.x, x, nDim*sizeof(int));
213  param.x[0] /= 2; // set single parity dimensions
214  param.create = QUDA_REFERENCE_FIELD_CREATE;
215  param.v = v;
216  param.norm = norm;
217  even = new cudaColorSpinorField(*this, param);
218  odd = new cudaColorSpinorField(*this, param);
219 
220  // need this hackery for the moment (need to locate the odd pointer halfway into the full field)
221  (dynamic_cast<cudaColorSpinorField*>(odd))->v = (void*)((char*)v + bytes/2);
222  if (precision == QUDA_HALF_PRECISION)
223  (dynamic_cast<cudaColorSpinorField*>(odd))->norm = (void*)((char*)norm + norm_bytes/2);
224 
225 #ifdef USE_TEXTURE_OBJECTS
226  dynamic_cast<cudaColorSpinorField*>(even)->destroyTexObject();
227  dynamic_cast<cudaColorSpinorField*>(even)->createTexObject();
228  dynamic_cast<cudaColorSpinorField*>(odd)->destroyTexObject();
229  dynamic_cast<cudaColorSpinorField*>(odd)->createTexObject();
230 #endif
231  }
232  else{//siteSubset == QUDA_PARITY_SITE_SUBSET
233 
235  if ((eigv_dim > 0) && (create != QUDA_REFERENCE_FIELD_CREATE) && (eigv_id == -1))
236  {
237  //if(bytes > 1811939328) warningQuda("\nCUDA API probably won't be able to create texture object for the eigenvector set... Object size is : %u bytes\n", bytes);
238  if (getVerbosity() == QUDA_DEBUG_VERBOSE) printfQuda("\nEigenvector set constructor...\n");
239  // create the associated even and odd subsets
240  ColorSpinorParam param;
241  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
242  param.nDim = nDim;
243  memcpy(param.x, x, nDim*sizeof(int));
244  param.create = QUDA_REFERENCE_FIELD_CREATE;
245  param.v = v;
246  param.norm = norm;
247  param.eigv_dim = eigv_dim;
248  //reserve eigvector set
249  eigenvectors.reserve(eigv_dim);
250  //setup volume, [real_]length and stride for a single eigenvector
251  for(int id = 0; id < eigv_dim; id++)
252  {
253  param.eigv_id = id;
254  eigenvectors.push_back(new cudaColorSpinorField(*this, param));
255 
256 #ifdef USE_TEXTURE_OBJECTS //(a lot of texture objects...)
257  dynamic_cast<cudaColorSpinorField*>(eigenvectors[id])->destroyTexObject();
258  dynamic_cast<cudaColorSpinorField*>(eigenvectors[id])->createTexObject();
259 #endif
260  }
261  }
262  }
263 
264  if (create != QUDA_REFERENCE_FIELD_CREATE) {
265  if (siteSubset != QUDA_FULL_SITE_SUBSET) {
266  zeroPad();
267  } else {
268  (dynamic_cast<cudaColorSpinorField*>(even))->zeroPad();
269  (dynamic_cast<cudaColorSpinorField*>(odd))->zeroPad();
270  }
271  }
272 
273 #ifdef USE_TEXTURE_OBJECTS
274  if((eigv_dim == 0) || (eigv_dim > 0 && eigv_id > -1))
275  createTexObject();
276 #endif
277 
278  // initialize the ghost pointers
279  if(siteSubset == QUDA_PARITY_SITE_SUBSET){
280  for(int i=0; i<nDim; ++i){
281  if(commDimPartitioned(i)){
282  ghost[i] = (char*)v + (stride + ghostOffset[i])*nColor*nSpin*2*precision;
283  if(precision == QUDA_HALF_PRECISION)
284  ghostNorm[i] = (char*)norm + (stride + ghostNormOffset[i])*sizeof(float);
285  }
286  }
287  }
288  checkCudaError();
289  }
290 
291 #ifdef USE_TEXTURE_OBJECTS
292  void cudaColorSpinorField::createTexObject() {
293 
294  if (isNative()) {
295  if (texInit) errorQuda("Already bound textures");
296 
297  // create the texture for the field components
298 
299  cudaChannelFormatDesc desc;
300  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
301  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
302  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
303 
304  // staggered fields in half and single are always two component
306  desc.x = 8*precision;
307  desc.y = 8*precision;
308  desc.z = 0;
309  desc.w = 0;
310  } else { // all others are four component
311  desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
312  desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
313  desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
314  desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
315  }
316 
317  cudaResourceDesc resDesc;
318  memset(&resDesc, 0, sizeof(resDesc));
319  resDesc.resType = cudaResourceTypeLinear;
320  resDesc.res.linear.devPtr = v;
321  resDesc.res.linear.desc = desc;
322  resDesc.res.linear.sizeInBytes = bytes;
323 
324  cudaTextureDesc texDesc;
325  memset(&texDesc, 0, sizeof(texDesc));
326  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
327  else texDesc.readMode = cudaReadModeElementType;
328 
329  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
330  checkCudaError();
331 
332  // create the texture for the norm components
333  if (precision == QUDA_HALF_PRECISION) {
334  cudaChannelFormatDesc desc;
335  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
336  desc.f = cudaChannelFormatKindFloat;
337  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
338 
339  cudaResourceDesc resDesc;
340  memset(&resDesc, 0, sizeof(resDesc));
341  resDesc.resType = cudaResourceTypeLinear;
342  resDesc.res.linear.devPtr = norm;
343  resDesc.res.linear.desc = desc;
344  resDesc.res.linear.sizeInBytes = norm_bytes;
345 
346  cudaTextureDesc texDesc;
347  memset(&texDesc, 0, sizeof(texDesc));
348  texDesc.readMode = cudaReadModeElementType;
349 
350  cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
351  checkCudaError();
352  }
353 
354  texInit = true;
355  }
356  }
357 
358  void cudaColorSpinorField::destroyTexObject() {
359  if (isNative() && texInit) {
360  cudaDestroyTextureObject(tex);
361  if (precision == QUDA_HALF_PRECISION) cudaDestroyTextureObject(texNorm);
362  texInit = false;
363  checkCudaError();
364  }
365  }
366 #endif
367 
368  void cudaColorSpinorField::destroy() {
369  if (alloc) {
370  device_free(v);
371  if (precision == QUDA_HALF_PRECISION) device_free(norm);
372  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
373  delete even;
374  delete odd;
375  }
376  else{
378  if (eigv_dim > 0)
379  {
380  std::vector<ColorSpinorField*>::iterator vec;
381  for(vec = eigenvectors.begin(); vec != eigenvectors.end(); vec++) delete *vec;
382  }
383  }
384  alloc = false;
385  }
386 
387 #ifdef USE_TEXTURE_OBJECTS
388  if((eigv_dim == 0) || (eigv_dim > 0 && eigv_id > -1))
389  destroyTexObject();
390 #endif
391 
392  }
393 
394  cudaColorSpinorField& cudaColorSpinorField::Even() const {
395  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
396  return *(dynamic_cast<cudaColorSpinorField*>(even));
397  }
398 
399  errorQuda("Cannot return even subset of %d subset", siteSubset);
400  exit(-1);
401  }
402 
403  cudaColorSpinorField& cudaColorSpinorField::Odd() const {
404  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
405  return *(dynamic_cast<cudaColorSpinorField*>(odd));
406  }
407 
408  errorQuda("Cannot return odd subset of %d subset", siteSubset);
409  exit(-1);
410  }
411 
412  // cuda's floating point format, IEEE-754, represents the floating point
413  // zero as 4 zero bytes
414  void cudaColorSpinorField::zero() {
415  cudaMemsetAsync(v, 0, bytes, streams[Nstream-1]);
416  if (precision == QUDA_HALF_PRECISION) cudaMemsetAsync(norm, 0, norm_bytes, streams[Nstream-1]);
417  }
418 
419 
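 // zeroPad() clears the padding region between the active volume and the stride for each
 // of the Npad field components, so the pad never holds uninitialized device memory.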
420  void cudaColorSpinorField::zeroPad() {
421  size_t pad_bytes = (stride - volume) * precision * fieldOrder;
422  int Npad = nColor * nSpin * 2 / fieldOrder;
423 
424  if (eigv_dim > 0 && eigv_id == -1){//we consider the whole eigenvector set:
425  Npad *= eigv_dim;
426  pad_bytes /= eigv_dim;
427  }
428 
429  size_t pitch = ((eigv_dim == 0 || eigv_id != -1) ? stride : eigv_stride)*fieldOrder*precision;
430  char *dst = (char*)v + ((eigv_dim == 0 || eigv_id != -1) ? volume : eigv_volume)*fieldOrder*precision;
431  if(pad_bytes) cudaMemset2D(dst, pitch, 0, pad_bytes, Npad);
432 
433  //for (int i=0; i<Npad; i++) {
434  // if (pad_bytes) cudaMemset((char*)v + (volume + i*stride)*fieldOrder*precision, 0, pad_bytes);
435  //}
436  }
437 
438  void cudaColorSpinorField::copy(const cudaColorSpinorField &src) {
439  checkField(*this, src);
440  copyCuda(*this, src);
441  }
442 
443  void cudaColorSpinorField::copySpinorField(const ColorSpinorField &src) {
444 
445  // src is on the device and is native
446  if (typeid(src) == typeid(cudaColorSpinorField) &&
447  isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative()) {
448  copy(dynamic_cast<const cudaColorSpinorField&>(src));
449  } else if (typeid(src) == typeid(cudaColorSpinorField)) {
450  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION);
451  } else if (typeid(src) == typeid(cpuColorSpinorField)) { // src is on the host
452  loadSpinorField(src);
453  } else {
454  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
455  }
456  }
457 
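 // loadSpinorField/saveSpinorField move data between a host field and this device field.
 // With REORDER_LOCATION == QUDA_CPU_FIELD_LOCATION the reordering into the device layout is
 // done on the host in pinned buffers followed by a single cudaMemcpy; otherwise the raw host
 // data is staged to the device (bufferDevice, or zero-copy pinned memory when zeroCopy is set)
 // and reordered there by copyGenericColorSpinor().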
458  void cudaColorSpinorField::loadSpinorField(const ColorSpinorField &src) {
459 
460  if (REORDER_LOCATION == QUDA_CPU_FIELD_LOCATION &&
461  typeid(src) == typeid(cpuColorSpinorField)) {
462  for(int b=0; b<2; ++b){
463  resizeBufferPinned(bytes + norm_bytes, b);
464  memset(bufferPinned[b], 0, bytes+norm_bytes); // FIXME (temporary?) bug fix for padding
465  }
466  copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION,
467  bufferPinned[bufferIndex], 0, (char*)bufferPinned[bufferIndex]+bytes, 0);
468 
469  cudaMemcpy(v, bufferPinned[bufferIndex], bytes, cudaMemcpyHostToDevice);
470  cudaMemcpy(norm, (char*)bufferPinned[bufferIndex]+bytes, norm_bytes, cudaMemcpyHostToDevice);
471  } else if (typeid(src) == typeid(cudaColorSpinorField)) {
472  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION);
473  } else {
474  void *Src, *srcNorm;
475  if (!zeroCopy) {
476  resizeBufferDevice(src.Bytes()+src.NormBytes());
477  Src = bufferDevice;
478  srcNorm = (char*)bufferDevice + src.Bytes();
479  cudaMemcpy(Src, src.V(), src.Bytes(), cudaMemcpyHostToDevice);
480  cudaMemcpy(srcNorm, src.Norm(), src.NormBytes(), cudaMemcpyHostToDevice);
481  } else {
482  for(int b=0; b<2; ++b){
483  resizeBufferPinned(src.Bytes()+src.NormBytes(), b);
484  }
485  memcpy(bufferPinned[bufferIndex], src.V(), src.Bytes());
486  memcpy((char*)bufferPinned[bufferIndex]+src.Bytes(), src.Norm(), src.NormBytes());
487 
488  cudaHostGetDevicePointer(&Src, bufferPinned[bufferIndex], 0);
489  srcNorm = (void*)((char*)Src + src.Bytes());
490  }
491 
492  cudaMemset(v, 0, bytes); // FIXME (temporary?) bug fix for padding
493  copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, 0, Src, 0, srcNorm);
494  }
495 
496  checkCudaError();
497  return;
498  }
499 
500 
501  void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest) const {
502 
503  if (REORDER_LOCATION == QUDA_CPU_FIELD_LOCATION &&
504  typeid(dest) == typeid(cpuColorSpinorField)) {
505  for(int b=0; b<2; ++b) resizeBufferPinned(bytes+norm_bytes,b);
506  cudaMemcpy(bufferPinned[bufferIndex], v, bytes, cudaMemcpyDeviceToHost);
507  cudaMemcpy((char*)bufferPinned[bufferIndex]+bytes, norm, norm_bytes, cudaMemcpyDeviceToHost);
508 
509  copyGenericColorSpinor(dest, *this, QUDA_CPU_FIELD_LOCATION,
510  0, bufferPinned[bufferIndex], 0, (char*)bufferPinned[bufferIndex]+bytes);
511  } else if (typeid(dest) == typeid(cudaColorSpinorField)) {
512  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION);
513  } else {
514  void *dst, *dstNorm;
515  if (!zeroCopy) {
516  resizeBufferDevice(dest.Bytes()+dest.NormBytes());
517  dst = bufferDevice;
518  dstNorm = (char*)bufferDevice+dest.Bytes();
519  } else {
520  for(int b=0; b<2; ++b) resizeBufferPinned(dest.Bytes()+dest.NormBytes(),b);
521  cudaHostGetDevicePointer(&dst, bufferPinned[bufferIndex], 0);
522  dstNorm = (char*)dst+dest.Bytes();
523  }
524  copyGenericColorSpinor(dest, *this, QUDA_CUDA_FIELD_LOCATION, dst, v, dstNorm, 0);
525 
526  if (!zeroCopy) {
527  cudaMemcpy(dest.V(), dst, dest.Bytes(), cudaMemcpyDeviceToHost);
528  cudaMemcpy(dest.Norm(), dstNorm, dest.NormBytes(), cudaMemcpyDeviceToHost);
529  } else {
530  memcpy(dest.V(), bufferPinned[bufferIndex], dest.Bytes());
531  memcpy(dest.Norm(), (char*)bufferPinned[bufferIndex]+dest.Bytes(), dest.NormBytes());
532  }
533  }
534 
535  checkCudaError();
536  return;
537  }
538 
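 // Allocate (or grow) the static ghost face buffer. For every partitioned dimension the
 // layout is: backwards face then forwards face, each nFace*ghostFace[i]*Nint*precision bytes,
 // plus nFace*ghostFace[i] floats of norm data per face in half precision.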
539  void cudaColorSpinorField::allocateGhostBuffer(int nFace) {
540  int Nint = nColor * nSpin * 2; // number of internal degrees of freedom
541  if (nSpin == 4) Nint /= 2; // spin projection for Wilson
542 
543  // compute size of buffer required
544  size_t faceBytes = 0;
545  for (int i=0; i<4; i++) {
546  if(!commDimPartitioned(i)) continue;
547  faceBytes += 2*nFace*ghostFace[i]*Nint*precision;
548  // add extra space for the norms for half precision
549  if (precision == QUDA_HALF_PRECISION) faceBytes += 2*nFace*ghostFace[i]*sizeof(float);
550  }
551 
552  // only allocate if not already allocated or buffer required is bigger than previously
553  if(initGhostFaceBuffer == 0 || faceBytes > ghostFaceBytes){
554 
555 
556  if (initGhostFaceBuffer){
557  for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]);
558  }
559 
560  if (faceBytes > 0) {
561  for(int b=0; b<2; ++b) ghostFaceBuffer[b] = device_malloc(faceBytes);
562  initGhostFaceBuffer = 1;
563  ghostFaceBytes = faceBytes;
564  }
565 
566  }
567 
568  size_t offset = 0;
569  for (int i=0; i<4; i++) {
570  if(!commDimPartitioned(i)) continue;
571 
572  for(int b=0; b<2; ++b) backGhostFaceBuffer[b][i] = (void*)(((char*)ghostFaceBuffer[b]) + offset);
573  offset += nFace*ghostFace[i]*Nint*precision;
574  if (precision == QUDA_HALF_PRECISION) offset += nFace*ghostFace[i]*sizeof(float);
575 
576  for(int b=0; b<2; ++b) fwdGhostFaceBuffer[b][i] = (void*)(((char*)ghostFaceBuffer[b]) + offset);
577  offset += nFace*ghostFace[i]*Nint*precision;
578  if (precision == QUDA_HALF_PRECISION) offset += nFace*ghostFace[i]*sizeof(float);
579  }
580 
581  }
582 
583 
584  void cudaColorSpinorField::freeGhostBuffer(void)
585  {
586  if (!initGhostFaceBuffer) return;
587 
588  for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]);
589 
590  for(int i=0;i < 4; i++){
591  if(!commDimPartitioned(i)) continue;
592  for(int b=0; b<2; ++b){
593  backGhostFaceBuffer[b][i] = NULL;
594  fwdGhostFaceBuffer[b][i] = NULL;
595  }
596  }
597  initGhostFaceBuffer = 0;
598  }
599 
600  // pack the ghost zone into a contiguous buffer for communications
601  void cudaColorSpinorField::packGhost(const int nFace, const QudaParity parity,
602  const int dim, const QudaDirection dir,
603  const int dagger, cudaStream_t *stream,
604  void *buffer, double a, double b)
605  {
606 #ifdef MULTI_GPU
607  int face_num;
608  if(dir == QUDA_BACKWARDS){
609  face_num = 0;
610  }else if(dir == QUDA_FORWARDS){
611  face_num = 1;
612  }else{
613  face_num = 2;
614  }
615  void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
616  packFace(packBuffer, *this, nFace, dagger, parity, dim, face_num, *stream, a, b);
617 #else
618  errorQuda("packGhost not built on single-GPU build");
619 #endif
620 
621  }
622 
623  // send the ghost zone to the host
624  void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int nFace, const int dim,
625  const QudaDirection dir, const int dagger,
626  cudaStream_t *stream) {
627 
628 #ifdef MULTI_GPU
629  int Nvec = (nSpin == 1 || precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
630  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
631 
632  if (dim !=3 || getKernelPackT() || getTwistPack()) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
633 
634  size_t bytes = nFace*Nint*ghostFace[dim]*precision;
635  if (precision == QUDA_HALF_PRECISION) bytes += nFace*ghostFace[dim]*sizeof(float);
636  void* gpu_buf =
637  (dir == QUDA_BACKWARDS) ? this->backGhostFaceBuffer[bufferIndex][dim] : this->fwdGhostFaceBuffer[bufferIndex][dim];
638 
639  cudaMemcpyAsync(ghost_spinor, gpu_buf, bytes, cudaMemcpyDeviceToHost, *stream);
640  } else if(this->TwistFlavor() != QUDA_TWIST_NONDEG_DOUBLET){ // do multiple cudaMemcpys
641 
642  int Npad = Nint / Nvec; // number Nvec buffers we have
643  int Nt_minus1_offset = (volume - nFace*ghostFace[3]); // N_t -1 = Vh-Vsh
644  int offset = 0;
645  if (nSpin == 1) {
646  offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
647  } else if (nSpin == 4) {
648  // !dagger: send lower components backwards, send upper components forwards
649  // dagger: send upper components backwards, send lower components forwards
650  bool upper = dagger ? true : false; // Fwd is !Back
651  if (dir == QUDA_FORWARDS) upper = !upper;
652  int lower_spin_offset = Npad*stride;
653  if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
654  else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
655  }
656 
657  // QUDA Memcpy NPad's worth.
658  // -- Dest will point to the right beginning PAD.
659  // -- Each Pad has size Nvec*Vsh Floats.
660  // -- There is Nvec*Stride Floats from the start of one PAD to the start of the next
661 
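 // i.e. cudaMemcpy2DAsync below copies Npad rows of len = nFace*ghostFace[3]*Nvec*precision
 // bytes each, reading with source pitch stride*Nvec*precision and writing them contiguously
 // (destination pitch == len) into the host ghost buffer.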
662  void *dst = (char*)ghost_spinor;
663  void *src = (char*)v + offset*Nvec*precision;
664  size_t len = nFace*ghostFace[3]*Nvec*precision;
665  size_t spitch = stride*Nvec*precision;
666  cudaMemcpy2DAsync(dst, len, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
667 
668  if (precision == QUDA_HALF_PRECISION) {
669  int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*sizeof(float);
670  void *dst = (char*)ghost_spinor + nFace*Nint*ghostFace[3]*precision;
671  void *src = (char*)norm + norm_offset;
672  cudaMemcpyAsync(dst, src, nFace*ghostFace[3]*sizeof(float), cudaMemcpyDeviceToHost, *stream);
673  }
674  }else{
675  int flavorVolume = volume / 2;
676  int flavorTFace = ghostFace[3] / 2;
677  int Npad = Nint / Nvec; // number Nvec buffers we have
678  int flavor1_Nt_minus1_offset = (flavorVolume - flavorTFace);
679  int flavor2_Nt_minus1_offset = (volume - flavorTFace);
680  int flavor1_offset = 0;
681  int flavor2_offset = 0;
682  // !dagger: send lower components backwards, send upper components forwards
683  // dagger: send upper components backwards, send lower components forwards
684  bool upper = dagger ? true : false; // Fwd is !Back
685  if (dir == QUDA_FORWARDS) upper = !upper;
686  int lower_spin_offset = Npad*stride;//ndeg tm: stride=2*flavor_volume+pad
687  if (upper){
688  flavor1_offset = (dir == QUDA_BACKWARDS ? 0 : flavor1_Nt_minus1_offset);
689  flavor2_offset = (dir == QUDA_BACKWARDS ? flavorVolume : flavor2_Nt_minus1_offset);
690  }else{
691  flavor1_offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : flavor1_Nt_minus1_offset);
692  flavor2_offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? flavorVolume : flavor2_Nt_minus1_offset);
693  }
694 
695  // QUDA Memcpy NPad's worth.
696  // -- Dest will point to the right beginning PAD.
697  // -- Each Pad has size Nvec*Vsh Floats.
698  // -- There is Nvec*Stride Floats from the start of one PAD to the start of the next
699 
700  void *dst = (char*)ghost_spinor;
701  void *src = (char*)v + flavor1_offset*Nvec*precision;
702  size_t len = flavorTFace*Nvec*precision;
703  size_t spitch = stride*Nvec*precision;//ndeg tm: stride=2*flavor_volume+pad
704  size_t dpitch = 2*len;
705  cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
706  dst = (char*)ghost_spinor+len;
707  src = (char*)v + flavor2_offset*Nvec*precision;
708  cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
709 
710  if (precision == QUDA_HALF_PRECISION) {
711  int Nt_minus1_offset = (flavorVolume - flavorTFace);
712  int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*sizeof(float);
713  void *dst = (char*)ghost_spinor + Nint*ghostFace[3]*precision;
714  void *src = (char*)norm + norm_offset;
715  size_t dpitch = flavorTFace*sizeof(float);
716  size_t spitch = flavorVolume*sizeof(float);
717  cudaMemcpy2DAsync(dst, dpitch, src, spitch, flavorTFace*sizeof(float), 2, cudaMemcpyDeviceToHost, *stream);
718  }
719  }
720 #else
721  errorQuda("sendGhost not built on single-GPU build");
722 #endif
723 
724  }
725 
726 
727 
728  void cudaColorSpinorField::unpackGhost(const void* ghost_spinor, const int nFace,
729  const int dim, const QudaDirection dir,
730  const int dagger, cudaStream_t* stream)
731  {
732  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
733 
734  int len = nFace*ghostFace[dim]*Nint;
735  int offset = length + ghostOffset[dim]*nColor*nSpin*2;
736  offset += (dir == QUDA_BACKWARDS) ? 0 : len;
737 
738  void *dst = (char*)v + precision*offset;
739  const void *src = ghost_spinor;
740 
741  cudaMemcpyAsync(dst, src, len*precision, cudaMemcpyHostToDevice, *stream);
742 
743  if (precision == QUDA_HALF_PRECISION) {
744  // norm region of host ghost zone is at the end of the ghost_spinor
745 
746  int normlen = nFace*ghostFace[dim];
747  int norm_offset = stride + ghostNormOffset[dim];
748  norm_offset += (dir == QUDA_BACKWARDS) ? 0 : normlen;
749 
750  void *dst = static_cast<char*>(norm) + norm_offset*sizeof(float);
751  const void *src = static_cast<const char*>(ghost_spinor)+nFace*Nint*ghostFace[dim]*precision;
752  cudaMemcpyAsync(dst, src, normlen*sizeof(float), cudaMemcpyHostToDevice, *stream);
753  }
754  }
755 
756 
757 
758 
759  // pack the ghost zone into a contiguous buffer for communications
760  void cudaColorSpinorField::packGhostExtended(const int nFace, const int R[], const QudaParity parity,
761  const int dim, const QudaDirection dir,
762  const int dagger, cudaStream_t *stream,
763  void *buffer)
764  {
765 #ifdef MULTI_GPU
766  int face_num;
767  if(dir == QUDA_BACKWARDS){
768  face_num = 0;
769  }else if(dir == QUDA_FORWARDS){
770  face_num = 1;
771  }else{
772  face_num = 2;
773  }
774  void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
775  packFaceExtended(packBuffer, *this, nFace, R, dagger, parity, dim, face_num, *stream);
776 #else
777  errorQuda("packGhostExtended not built on single-GPU build");
778 #endif
779 
780  }
781 
782 
783 
784 
785  // copy data from host buffer into boundary region of device field
786  void cudaColorSpinorField::unpackGhostExtended(const void* ghost_spinor, const int nFace, const QudaParity parity,
787  const int dim, const QudaDirection dir,
788  const int dagger, cudaStream_t* stream)
789  {
790 
791 
792 
793  // First call the regular unpackGhost routine to copy data into the `usual' ghost-zone region
794  // of the data array
795  unpackGhost(ghost_spinor, nFace, dim, dir, dagger, stream);
796 
797  // Next step is to copy data from the ghost zone back to the interior region
798  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
799 
800  int len = nFace*ghostFace[dim]*Nint;
801  int offset = length + ghostOffset[dim]*nColor*nSpin*2;
802  offset += (dir == QUDA_BACKWARDS) ? 0 : len;
803 
804 #ifdef MULTI_GPU
805  const int face_num = 2;
806  const bool unpack = true;
807  const int R[4] = {0,0,0,0};
808  packFaceExtended(ghostFaceBuffer[bufferIndex], *this, nFace, R, dagger, parity, dim, face_num, *stream, unpack);
809 #else
810  errorQuda("unpackGhostExtended not built on single-GPU build");
811 #endif
812  }
813 
814 
815 
816  cudaStream_t *stream;
817 
818  void cudaColorSpinorField::createComms(int nFace) {
819 
820  if(bufferMessageHandler != bufferPinnedResizeCount) destroyComms();
821 
822  if (!initComms || nFaceComms != nFace) {
823 
824  // if we are requesting a new number of faces destroy and start over
825  if(nFace != nFaceComms) destroyComms();
826 
827  if (siteSubset != QUDA_PARITY_SITE_SUBSET)
828  errorQuda("Only supports single parity fields");
829 
830 #ifdef GPU_COMMS
831  bool comms = false;
832  for (int i=0; i<nDimComms; i++) if (commDimPartitioned(i)) comms = true;
833 #endif
834 
835  if (nFace > maxNface)
836  errorQuda("Requested number of faces %d in communicator is greater than supported %d",
837  nFace, maxNface);
838 
839  // faceBytes is the sum of all face sizes
840  size_t faceBytes = 0;
841 
842  // nbytes is the size in bytes of each face
843  size_t nbytes[QUDA_MAX_DIM];
844 
845  // The number of degrees of freedom per site for the given
846  // field. Currently assumes spin projection of a Wilson-like
847  // field (so half the number of degrees of freedom).
848  int Ndof = (2 * nSpin * nColor) / (nSpin==4 ? 2 : 1);
849 
850  for (int i=0; i<nDimComms; i++) {
851  nbytes[i] = maxNface*surfaceCB[i]*Ndof*precision;
852  if (precision == QUDA_HALF_PRECISION) nbytes[i] += maxNface*surfaceCB[i]*sizeof(float);
853  if (!commDimPartitioned(i)) continue;
854  faceBytes += 2*nbytes[i];
855  }
856 
857 #ifndef GPU_COMMS
858  // use static pinned memory for face buffers
859  for(int b=0; b<2; ++b){
860  resizeBufferPinned(2*faceBytes, b); // oversizes for GPU_COMMS case
861 
862  my_face[b] = bufferPinned[b];
863  from_face[b] = static_cast<char*>(bufferPinned[b]) + faceBytes;
864  }
865 
866  // assign pointers for each face - it's ok to alias for different Nface parameters
867  size_t offset = 0;
868 #endif
869  for (int i=0; i<nDimComms; i++) {
870  if (!commDimPartitioned(i)) continue;
871 
872 #ifdef GPU_COMMS
873  for(int b=0; b<2; ++b){
874  my_back_face[b][i] = backGhostFaceBuffer[b][i];
875  from_back_face[b][i] = ghost[i];
876 
877  if(precision == QUDA_HALF_PRECISION){
878  my_back_norm_face[b][i] = static_cast<char*>(backGhostFaceBuffer[b][i]) + nFace*ghostFace[i]*Ndof*precision;
879  from_back_norm_face[b][i] = ghostNorm[i];
880  }
881  } // loop over b
882 
883 #else
884  for(int b=0; b<2; ++b){
885  my_back_face[b][i] = static_cast<char*>(my_face[b]) + offset;
886  from_back_face[b][i] = static_cast<char*>(from_face[b]) + offset;
887  }
888  offset += nbytes[i];
889 #endif
890 
891 #ifdef GPU_COMMS
892  for(int b=0; b<2; ++b){
893  my_fwd_face[b][i] = fwdGhostFaceBuffer[b][i];
894  from_fwd_face[b][i] = ghost[i] + nFace*ghostFace[i]*Ndof*precision;
895 
896  if(precision == QUDA_HALF_PRECISION){
897  my_fwd_norm_face[b][i] = static_cast<char*>(fwdGhostFaceBuffer[b][i]) + nFace*ghostFace[i]*Ndof*precision;
898  from_fwd_norm_face[b][i] = static_cast<char*>(ghostNorm[i]) + nFace*ghostFace[i]*sizeof(float);
899  }
900  } // loop over b
901 #else
902  for(int b=0; b<2; ++b){
903  my_fwd_face[b][i] = static_cast<char*>(my_face[b]) + offset;
904  from_fwd_face[b][i] = static_cast<char*>(from_face[b]) + offset;
905  }
906  offset += nbytes[i];
907 #endif
908 
909  }
910 
911  // create a different message handler for each direction and Nface
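 // handles are indexed as mh_send_*[bufferIndex][nFace-1][2*dim+dagger] and
 // mh_recv_*[bufferIndex][nFace-1][dim]; the dagger entries usually alias the
 // non-dagger ones, except for the strided GPU_COMMS sends set up below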
912  for(int b=0; b<2; ++b){
913  mh_send_fwd[b] = new MsgHandle**[maxNface];
914  mh_send_back[b] = new MsgHandle**[maxNface];
915  mh_recv_fwd[b] = new MsgHandle**[maxNface];
916  mh_recv_back[b] = new MsgHandle**[maxNface];
917 #ifdef GPU_COMMS
918  if(precision == QUDA_HALF_PRECISION){
919  mh_send_norm_fwd[b] = new MsgHandle**[maxNface];
920  mh_send_norm_back[b] = new MsgHandle**[maxNface];
921  mh_recv_norm_fwd[b] = new MsgHandle**[maxNface];
922  mh_recv_norm_back[b] = new MsgHandle**[maxNface];
923  }
924 #endif
925  } // loop over b
926  for (int j=0; j<maxNface; j++) {
927  for(int b=0; b<2; ++b){
928  mh_send_fwd[b][j] = new MsgHandle*[2*nDimComms];
929  mh_send_back[b][j] = new MsgHandle*[2*nDimComms];
930  mh_recv_fwd[b][j] = new MsgHandle*[nDimComms];
931  mh_recv_back[b][j] = new MsgHandle*[nDimComms];
932 
933 #ifdef GPU_COMMS
934  if(precision == QUDA_HALF_PRECISION){
935  mh_send_norm_fwd[b][j] = new MsgHandle*[2*nDimComms];
936  mh_send_norm_back[b][j] = new MsgHandle*[2*nDimComms];
937  mh_recv_norm_fwd[b][j] = new MsgHandle*[nDimComms];
938  mh_recv_norm_back[b][j] = new MsgHandle*[nDimComms];
939  }
940 #endif
941  } // loop over b
942 
943 
944  for (int i=0; i<nDimComms; i++) {
945  if (!commDimPartitioned(i)) continue;
946 #ifdef GPU_COMMS
947  size_t nbytes_Nface = surfaceCB[i]*Ndof*precision*(j+1);
948  if (i != 3 || getKernelPackT() || getTwistPack()) {
949 #else
950  size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1);
951 #endif
952  for(int b=0; b<2; ++b){
953  mh_send_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface);
954  mh_send_back[b][j][2*i+0] = comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface);
955  mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i]; // alias pointers
956  mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i]; // alias pointers
957  }
958 #ifdef GPU_COMMS
959 
960  if(precision == QUDA_HALF_PRECISION){
961  for(int b=0; b<2; ++b){
962  mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, surfaceCB[i]*(j+1)*sizeof(float));
963  mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, surfaceCB[i]*(j+1)*sizeof(float));
964  mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
965  mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
966  }
967  }
968 
969  } else if (this->TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET) {
970  errorQuda("GPU_COMMS for non-degenerate doublet only supported with time-dimension kernel packing enabled.");
971  } else {
972  /*
973  use a strided communicator, here we can't really use
974  the previously declared my_fwd_face and my_back_face
975  pointers since they don't really map 1-to-1 so let's
976  just compute the required base pointers and pass these
977  directly into the communicator construction
978  */
979 
980  int Nblocks = Ndof / Nvec(); // number of Nvec buffers we have
981  // start of last time slice chunk we are sending forwards
982  int endOffset = (volume - (j+1)*ghostFace[i]);
983 
984  size_t offset[4];
985  void *base[4];
986  if (nSpin == 1) { // staggered is invariant with dagger
987  offset[2*0 + 0] = 0;
988  offset[2*1 + 0] = endOffset;
989  offset[2*0 + 1] = offset[2*0 + 0];
990  offset[2*1 + 1] = offset[2*1 + 0];
991  } else if (nSpin == 4) {
992  // !dagger: send last components backwards, send first components forwards
993  offset[2*0 + 0] = Nblocks*stride;
994  offset[2*1 + 0] = endOffset;
995  // dagger: send first components backwards, send last components forwards
996  offset[2*0 + 1] = 0;
997  offset[2*1 + 1] = Nblocks*stride + endOffset;
998  } else {
999  errorQuda("Unsupported number of spin components");
1000  }
1001 
1002  for (int k=0; k<4; k++) {
1003  base[k] = static_cast<char*>(v) + offset[k]*Nvec()*precision; // total offset in bytes
1004  }
1005 
1006  size_t blksize = (j+1)*ghostFace[i]*Nvec()*precision; // (j+1) is number of faces
1007  size_t Stride = stride*Nvec()*precision;
1008 
1009  if (blksize * Nblocks != nbytes_Nface)
1010  errorQuda("Total strided message size does not match expected size");
1011 
1012  //printf("%d strided sends with Nface=%d Nblocks=%d blksize=%d Stride=%d\n", i, j+1, Nblocks, blksize, Stride);
1013 
1014  for(int b=0; b<2; ++b){
1015  mh_send_fwd[b][j][2*i+0] = comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride);
1016  mh_send_back[b][j][2*i+0] = comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride);
1017  if (nSpin ==4) { // dagger communicators
1018  mh_send_fwd[b][j][2*i+1] = comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride);
1019  mh_send_back[b][j][2*i+1] = comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride);
1020  } else {
1021  mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i+0];
1022  mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i+0];
1023  }
1024 
1025  } // loop over b
1026 
1027 
1028  if(precision == QUDA_HALF_PRECISION){
1029  int Nt_minus1_offset = (volume - nFace*ghostFace[3]); // The space-time coordinate of the start of the last time slice
1030  void *norm_fwd = static_cast<float*>(norm) + Nt_minus1_offset;
1031  void *norm_back = norm; // the first time slice has zero offset
1032  for(int b=0; b<2; ++b){
1033  mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float));
1034  mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float));
1035  mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
1036  mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
1037  }
1038  }
1039 
1040  }
1041 
1042  if(precision == QUDA_HALF_PRECISION){
1043  for(int b=0; b<2; ++b){
1044  mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, surfaceCB[i]*sizeof(float)*(j+1));
1045  mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, surfaceCB[i]*sizeof(float)*(j+1));
1046  }
1047  }
1048 #endif // GPU_COMMS
1049 
1050  for(int b=0; b<2; ++b){
1051  mh_recv_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface);
1052  mh_recv_back[b][j][i] = comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface);
1053  }
1054 
1055 
1056 
1057  } // loop over dimension
1058  }
1059 
1060  bufferMessageHandler = bufferPinnedResizeCount;
1061  initComms = true;
1062  nFaceComms = nFace;
1063  }
1064  checkCudaError();
1065  }
1066 
1068  if (initComms) {
1069  for(int b=0; b<2; ++b){
1070  for (int j=0; j<maxNface; j++) {
1071  for (int i=0; i<nDimComms; i++) {
1072  if (commDimPartitioned(i)) {
1073  comm_free(mh_recv_fwd[b][j][i]);
1074  comm_free(mh_recv_back[b][j][i]);
1075  comm_free(mh_send_fwd[b][j][2*i]);
1076  comm_free(mh_send_back[b][j][2*i]);
1077  // only in a special case are these not aliasing pointers
1078 #ifdef GPU_COMMS
1079  if(precision == QUDA_HALF_PRECISION){
1080  comm_free(mh_recv_norm_fwd[b][j][i]);
1081  comm_free(mh_recv_norm_back[b][j][i]);
1082  comm_free(mh_send_norm_fwd[b][j][2*i]);
1083  comm_free(mh_send_norm_back[b][j][2*i]);
1084  }
1085 
1086  if (i == 3 && !getKernelPackT() && nSpin == 4) {
1087  comm_free(mh_send_fwd[b][j][2*i+1]);
1088  comm_free(mh_send_back[b][j][2*i+1]);
1089  }
1090 #endif // GPU_COMMS
1091  }
1092  }
1093  delete []mh_recv_fwd[b][j];
1094  delete []mh_recv_back[b][j];
1095  delete []mh_send_fwd[b][j];
1096  delete []mh_send_back[b][j];
1097 #ifdef GPU_COMMS
1098  if(precision == QUDA_HALF_PRECISION){
1099  delete []mh_recv_norm_fwd[b][j];
1100  delete []mh_recv_norm_back[b][j];
1101  delete []mh_send_norm_fwd[b][j];
1102  delete []mh_send_norm_back[b][j];
1103  }
1104 #endif
1105  }
1106  delete []mh_recv_fwd[b];
1107  delete []mh_recv_back[b];
1108  delete []mh_send_fwd[b];
1109  delete []mh_send_back[b];
1110 
1111  for (int i=0; i<nDimComms; i++) {
1112  my_fwd_face[b][i] = NULL;
1113  my_back_face[b][i] = NULL;
1114  from_fwd_face[b][i] = NULL;
1115  from_back_face[b][i] = NULL;
1116  }
1117 #ifdef GPU_COMMS
1118  if(precision == QUDA_HALF_PRECISION){
1119  delete []mh_recv_norm_fwd[b];
1120  delete []mh_recv_norm_back[b];
1121  delete []mh_send_norm_fwd[b];
1122  delete []mh_send_norm_back[b];
1123  }
1124 
1125  for(int i=0; i<nDimComms; i++){
1126  my_fwd_norm_face[b][i] = NULL;
1127  my_back_norm_face[b][i] = NULL;
1128  from_fwd_norm_face[b][i] = NULL;
1129  from_back_norm_face[b][i] = NULL;
1130  }
1131 #endif
1132  } // loop over b
1133  initComms = false;
1134  checkCudaError();
1135  }
1136  }
1137 
1138  void cudaColorSpinorField::streamInit(cudaStream_t *stream_p){
1139  stream = stream_p;
1140  }
1141 
1142  void cudaColorSpinorField::pack(int nFace, int parity, int dagger, cudaStream_t *stream_p,
1143  bool zeroCopyPack, double a, double b) {
1144  allocateGhostBuffer(nFace); // allocate the ghost buffer if not yet allocated
1145  createComms(nFace); // must call this first
1146 
1147  stream = stream_p;
1148 
1149  const int dim=-1; // pack all partitioned dimensions
1150 
1151  if (zeroCopyPack) {
1152  void *my_face_d;
1153  cudaHostGetDevicePointer(&my_face_d, my_face[bufferIndex], 0); // set the matching device pointer
1154  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[0], my_face_d, a, b);
1155  } else {
1156  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[Nstream-1], 0, a, b);
1157  }
1158  }
1159 
1160  void cudaColorSpinorField::pack(int nFace, int parity, int dagger, int stream_idx,
1161  bool zeroCopyPack, double a, double b) {
1162  allocateGhostBuffer(nFace); // allocate the ghost buffer if not yet allocated
1163  createComms(nFace); // must call this first
1164 
1165  const int dim=-1; // pack all partitioned dimensions
1166 
1167  if (zeroCopyPack) {
1168  void *my_face_d;
1169  cudaHostGetDevicePointer(&my_face_d, my_face[bufferIndex], 0); // set the matching device pointer
1170  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[stream_idx], my_face_d, a, b);
1171  } else {
1172  packGhost(nFace, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[stream_idx], 0, a, b);
1173  }
1174  }
1175 
1176  void cudaColorSpinorField::packExtended(const int nFace, const int R[], const int parity,
1177  const int dagger, const int dim,
1178  cudaStream_t *stream_p, const bool zeroCopyPack){
1179 
1180  allocateGhostBuffer(nFace); // allocate the ghost buffer if not yet allocated
1181  createComms(nFace); // must call this first
1182 
1183  stream = stream_p;
1184 
1185  void *my_face_d = NULL;
1186  if(zeroCopyPack){
1187  cudaHostGetDevicePointer(&my_face_d, my_face[bufferIndex], 0);
1188  packGhostExtended(nFace, R, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[0], my_face_d);
1189  }else{
1190  packGhostExtended(nFace, R, (QudaParity)parity, dim, QUDA_BOTH_DIRS, dagger, &stream[Nstream-1], my_face_d);
1191  }
1192  }
1193 
1194 
1195 
1196  void cudaColorSpinorField::gather(int nFace, int dagger, int dir, cudaStream_t* stream_p)
1197  {
1198  int dim = dir/2;
1199 
1200  // If stream_p != 0, use pack_stream, else use the stream array
1201  cudaStream_t *pack_stream = (stream_p) ? stream_p : stream+dir;
1202 
1203  if(dir%2 == 0){
1204  // backwards copy to host
1205  sendGhost(my_back_face[bufferIndex][dim], nFace, dim, QUDA_BACKWARDS, dagger, pack_stream);
1206  } else {
1207  // forwards copy to host
1208  sendGhost(my_fwd_face[bufferIndex][dim], nFace, dim, QUDA_FORWARDS, dagger, pack_stream);
1209  }
1210  }
1211 
1212 
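 // Typical halo-exchange sequence driven by the dslash code (an assumption based on the
 // methods below): pack() packs faces on the device, gather() copies them to the host,
 // recvStart()/sendStart() (or commsStart()) post the transfers, commsQuery() polls for
 // completion, and scatter() copies the received faces back into the ghost zone.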
1213  void cudaColorSpinorField::recvStart(int nFace, int dir, int dagger) {
1214  int dim = dir/2;
1215  if(!commDimPartitioned(dim)) return;
1216 
1217  if (dir%2 == 0) { // sending backwards
1218  // Prepost receive
1219  comm_start(mh_recv_fwd[bufferIndex][nFace-1][dim]);
1220  } else { //sending forwards
1221  // Prepost receive
1222  comm_start(mh_recv_back[bufferIndex][nFace-1][dim]);
1223  }
1224 #ifdef GPU_COMMS
1225  if(precision != QUDA_HALF_PRECISION) return;
1226 
1227  if (dir%2 == 0) { // sending backwards
1228  // Prepost receive
1229  comm_start(mh_recv_norm_fwd[bufferIndex][nFace-1][dim]);
1230  } else { //sending forwards
1231  // Prepost receive
1232  comm_start(mh_recv_norm_back[bufferIndex][nFace-1][dim]);
1233  }
1234 #endif
1235  }
1236 
1237  void cudaColorSpinorField::sendStart(int nFace, int dir, int dagger) {
1238  int dim = dir / 2;
1239  if(!commDimPartitioned(dim)) return;
1240 
1241  if (dir%2 == 0) { // sending backwards
1242  comm_start(mh_send_back[bufferIndex][nFace-1][2*dim+dagger]);
1243  } else { //sending forwards
1244  comm_start(mh_send_fwd[bufferIndex][nFace-1][2*dim+dagger]);
1245  }
1246 #ifdef GPU_COMMS
1247  if(precision != QUDA_HALF_PRECISION) return;
1248  if (dir%2 == 0) { // sending backwards
1249  comm_start(mh_send_norm_back[bufferIndex][nFace-1][2*dim+dagger]);
1250  } else { //sending forwards
1251  comm_start(mh_send_norm_fwd[bufferIndex][nFace-1][2*dim+dagger]);
1252  }
1253 #endif
1254  }
1255 
1256 
1257 
1258 
1259  void cudaColorSpinorField::commsStart(int nFace, int dir, int dagger) {
1260  int dim = dir / 2;
1261  if(!commDimPartitioned(dim)) return;
1262 
1263  if (dir%2 == 0) { // sending backwards
1264  // Prepost receive
1265  comm_start(mh_recv_fwd[bufferIndex][nFace-1][dim]);
1266  comm_start(mh_send_back[bufferIndex][nFace-1][2*dim+dagger]);
1267  } else { //sending forwards
1268  // Prepost receive
1269  comm_start(mh_recv_back[bufferIndex][nFace-1][dim]);
1270  // Begin forward send
1271  comm_start(mh_send_fwd[bufferIndex][nFace-1][2*dim+dagger]);
1272  }
1273 #ifdef GPU_COMMS
1274 
1275  if(precision != QUDA_HALF_PRECISION) return;
1276 
1277  if (dir%2 == 0) { // sending backwards
1278  // Prepost receive
1279  comm_start(mh_recv_norm_fwd[bufferIndex][nFace-1][dim]);
1280 
1281  comm_start(mh_send_norm_back[bufferIndex][nFace-1][2*dim+dagger]);
1282  } else { //sending forwards
1283  // Prepost receive
1284  comm_start(mh_recv_norm_back[bufferIndex][nFace-1][dim]);
1285  // Begin forward send
1286  comm_start(mh_send_norm_fwd[bufferIndex][nFace-1][2*dim+dagger]);
1287  }
1288 #endif
1289  }
1290 
1291  int cudaColorSpinorField::commsQuery(int nFace, int dir, int dagger) {
1292  int dim = dir / 2;
1293  if(!commDimPartitioned(dim)) return 0;
1294 
1295 #ifdef GPU_COMMS
1296  if(precision != QUDA_HALF_PRECISION){
1297 #endif
1298  if(dir%2==0) {
1299  if (comm_query(mh_recv_fwd[bufferIndex][nFace-1][dim]) &&
1300  comm_query(mh_send_back[bufferIndex][nFace-1][2*dim+dagger])) return 1;
1301  } else {
1302  if (comm_query(mh_recv_back[bufferIndex][nFace-1][dim]) &&
1303  comm_query(mh_send_fwd[bufferIndex][nFace-1][2*dim+dagger])) return 1;
1304  }
1305 #ifdef GPU_COMMS
1306  }else{ // half precision
1307  if(dir%2==0) {
1308  if (comm_query(mh_recv_fwd[bufferIndex][nFace-1][dim]) &&
1309  comm_query(mh_send_back[bufferIndex][nFace-1][2*dim+dagger]) &&
1310  comm_query(mh_recv_norm_fwd[bufferIndex][nFace-1][dim]) &&
1311  comm_query(mh_send_norm_back[bufferIndex][nFace-1][2*dim+dagger])) return 1;
1312  } else {
1313  if (comm_query(mh_recv_back[bufferIndex][nFace-1][dim]) &&
1314  comm_query(mh_send_fwd[bufferIndex][nFace-1][2*dim+dagger]) &&
1315  comm_query(mh_recv_norm_back[bufferIndex][nFace-1][dim]) &&
1316  comm_query(mh_send_norm_fwd[bufferIndex][nFace-1][2*dim+dagger])) return 1;
1317  }
1318  } // half precision
1319 #endif
1320  return 0;
1321  }
1322 
1323 
1324  void cudaColorSpinorField::scatter(int nFace, int dagger, int dir, cudaStream_t* stream_p)
1325  {
1326  int dim = dir/2;
1327  if(!commDimPartitioned(dim)) return;
1328 
1329  // both scattering occurrences now go through the same stream
1330  if (dir%2==0) {// receive from forwards
1331  unpackGhost(from_fwd_face[bufferIndex][dim], nFace, dim, QUDA_FORWARDS, dagger, stream_p);
1332  } else { // receive from backwards
1333  unpackGhost(from_back_face[bufferIndex][dim], nFace, dim, QUDA_BACKWARDS, dagger, stream_p);
1334  }
1335  }
1336 
1337 
1338 
1339  void cudaColorSpinorField::scatter(int nFace, int dagger, int dir)
1340  {
1341  int dim = dir/2;
1342  if(!commDimPartitioned(dim)) return;
1343 
1344  // both scattering occurrences now go through the same stream
1345  if (dir%2==0) {// receive from forwards
1346  unpackGhost(from_fwd_face[bufferIndex][dim], nFace, dim, QUDA_FORWARDS, dagger, &stream[2*dim/*+0*/]);
1347  } else { // receive from backwards
1348  unpackGhost(from_back_face[bufferIndex][dim], nFace, dim, QUDA_BACKWARDS, dagger, &stream[2*dim/*+1*/]);
1349  }
1350  }
1351 
1352 
1353  void cudaColorSpinorField::scatterExtended(int nFace, int parity, int dagger, int dir)
1354  {
1355  int dim = dir/2;
1356  if(!commDimPartitioned(dim)) return;
1357  if (dir%2==0) {// receive from forwards
1358  unpackGhostExtended(from_fwd_face[bufferIndex][dim], nFace, static_cast<QudaParity>(parity), dim, QUDA_FORWARDS, dagger, &stream[2*dim/*+0*/]);
1359  } else { // receive from backwards
1360  unpackGhostExtended(from_back_face[bufferIndex][dim], nFace, static_cast<QudaParity>(parity), dim, QUDA_BACKWARDS, dagger, &stream[2*dim/*+1*/]);
1361  }
1362  }
1363 
1364 
1365  // Return the location of the field
1366  QudaFieldLocation cudaColorSpinorField::Location() const { return QUDA_CUDA_FIELD_LOCATION; }
1367 
1368  std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
1369  out << (const ColorSpinorField&)a;
1370  out << "v = " << a.v << std::endl;
1371  out << "norm = " << a.norm << std::endl;
1372  out << "alloc = " << a.alloc << std::endl;
1373  out << "init = " << a.init << std::endl;
1374  return out;
1375  }
1376 
1378  cudaColorSpinorField& cudaColorSpinorField::Eigenvec(const int idx) const {
1379 
1380  if (siteSubset == QUDA_PARITY_SITE_SUBSET && this->EigvId() == -1) {
1381  if (idx < this->EigvDim()) {//setup eigenvector from the set
1382  return *(dynamic_cast<cudaColorSpinorField*>(eigenvectors[idx]));
1383  }
1384  else{
1385  errorQuda("Incorrect eigenvector index...");
1386  }
1387  }
1388  errorQuda("Eigenvector must be a parity spinor");
1389  exit(-1);
1390  }
1391 
1392 //copyCuda currently cannot work with a set of spinor fields..
1393  void cudaColorSpinorField::CopyEigenvecSubset(cudaColorSpinorField &dst, const int range, const int first_element) const{
1394 #if 0
1395  if(first_element < 0) errorQuda("\nError: trying to set negative first element.\n");
1396  if (siteSubset == QUDA_PARITY_SITE_SUBSET && this->EigvId() == -1) {
1397  if (first_element == 0 && range == this->EigvDim())
1398  {
1399  if(range != dst.EigvDim()) errorQuda("\nError: eigenvector range too big.\n");
1400  checkField(dst, *this);
1401  copyCuda(dst, *this);
1402  }
1403  else if ((first_element+range) < this->EigvDim())
1404  {//setup eigenvector subset
1405 
1406  cudaColorSpinorField *eigv_subset;
1407 
1409 
1410  param.nColor = nColor;
1411  param.nSpin = nSpin;
1412  param.twistFlavor = twistFlavor;
1413  param.precision = precision;
1414  param.nDim = nDim;
1415  param.pad = pad;
1416  param.siteSubset = siteSubset;
1417  param.siteOrder = siteOrder;
1418  param.fieldOrder = fieldOrder;
1419  param.gammaBasis = gammaBasis;
1420  memcpy(param.x, x, nDim*sizeof(int));
1422 
1423  param.eigv_dim = range;
1424  param.eigv_id = -1;
1425  param.v = (void*)((char*)v + first_element*eigv_bytes);
1426  param.norm = (void*)((char*)norm + first_element*eigv_norm_bytes);
1427 
1428  eigv_subset = new cudaColorSpinorField(param);
1429 
1430  //Not really needed:
1431  eigv_subset->eigenvectors.reserve(param.eigv_dim);
1432  for(int id = first_element; id < (first_element+range); id++)
1433  {
1434  param.eigv_id = id;
1435  eigv_subset->eigenvectors.push_back(new cudaColorSpinorField(*this, param));
1436  }
1437  checkField(dst, *eigv_subset);
1438  copyCuda(dst, *eigv_subset);
1439 
1440  delete eigv_subset;
1441  }
1442  else{
1443  errorQuda("Incorrect eigenvector dimension...");
1444  }
1445  }
1446  else{
1447  errorQuda("Eigenvector must be a parity spinor");
1448  exit(-1);
1449  }
1450 #endif
1451  }
1452 
1453  void cudaColorSpinorField::getTexObjectInfo() const
1454  {
1455 #ifdef USE_TEXTURE_OBJECTS
1456  printfQuda("\nPrint texture info for the field:\n");
1457  std::cout << *this;
1458  cudaResourceDesc resDesc;
1459  //memset(&resDesc, 0, sizeof(resDesc));
1460  cudaGetTextureObjectResourceDesc(&resDesc, this->Tex());
1461  printfQuda("\nDevice pointer: %p\n", resDesc.res.linear.devPtr);
1462  printfQuda("\nVolume (in bytes): %lu\n", (unsigned long)resDesc.res.linear.sizeInBytes);
1463  if (resDesc.resType == cudaResourceTypeLinear) printfQuda("\nResource type: linear \n");
1464  checkCudaError();
1465 #endif
1466  }
1467 
1468 } // namespace quda