QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
cuda_color_spinor_field.cu
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <typeinfo>
4 
5 #include <color_spinor_field.h>
6 #include <blas_quda.h>
7 
8 #include <string.h>
9 #include <iostream>
10 #include <misc_helpers.h>
11 #include <face_quda.h>
12 #include <dslash_quda.h>
13 
14 #ifdef DEVICE_PACK
15 #define REORDER_LOCATION QUDA_CUDA_FIELD_LOCATION
16 #else
17 #define REORDER_LOCATION QUDA_CPU_FIELD_LOCATION
18 #endif
19 
20 
21 namespace quda {
22 
23  void* cudaColorSpinorField::buffer_h = 0;
24  void* cudaColorSpinorField::buffer_d = 0;
25  bool cudaColorSpinorField::bufferInit = false;
26  size_t cudaColorSpinorField::bufferBytes = 0;
27 
28  int cudaColorSpinorField::initGhostFaceBuffer = 0;
29  void* cudaColorSpinorField::ghostFaceBuffer; //gpu memory
30  void* cudaColorSpinorField::fwdGhostFaceBuffer[QUDA_MAX_DIM]; //pointers to ghostFaceBuffer
31  void* cudaColorSpinorField::backGhostFaceBuffer[QUDA_MAX_DIM]; //pointers to ghostFaceBuffer
32  QudaPrecision cudaColorSpinorField::facePrecision;
33 
34  /*cudaColorSpinorField::cudaColorSpinorField() :
35  ColorSpinorField(), v(0), norm(0), alloc(false), init(false) {
36 
37  }*/
38 
40  ColorSpinorField(param), alloc(false), init(true), texInit(false) {
41 
42  // this must come before create
43  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
44  v = param.v;
45  norm = param.norm;
46  }
47 
48  create(param.create);
49 
50  if (param.create == QUDA_NULL_FIELD_CREATE) {
51  // do nothing
52  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
53  zero();
54  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
55  // dp nothing
56  } else if (param.create == QUDA_COPY_FIELD_CREATE){
57  errorQuda("not implemented");
58  }
60  }
61 
63  ColorSpinorField(src), alloc(false), init(true), texInit(false) {
64  create(QUDA_COPY_FIELD_CREATE);
65  if (isNative() && src.isNative()) copy(src);
66  else errorQuda("Cannot copy using non-native fields");
67  }
68 
69  // creates a copy of src, any differences defined in param
71  ColorSpinorField(src), alloc(false), init(true), texInit(false) {
72 
73  // can only overide if we are not using a reference or parity special case
74  if (param.create != QUDA_REFERENCE_FIELD_CREATE ||
78  typeid(src) == typeid(cudaColorSpinorField) ) ) {
79  reset(param);
80  } else {
81  errorQuda("Undefined behaviour"); // else silent bug possible?
82  }
83 
84  // This must be set before create is called
85  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
86  if (typeid(src) == typeid(cudaColorSpinorField)) {
87  v = (dynamic_cast<const cudaColorSpinorField&>(src)).v;
88  norm = (dynamic_cast<const cudaColorSpinorField&>(src)).norm;
89  } else {
90  errorQuda("Cannot reference a non-cuda field");
91  }
92  }
93 
94  create(param.create);
95 
96  if (param.create == QUDA_NULL_FIELD_CREATE) {
97  // do nothing
98  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
99  zero();
100  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
101  if (typeid(src) == typeid(cudaColorSpinorField) &&
102  isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative()) {
103  copy(dynamic_cast<const cudaColorSpinorField&>(src));
104  } else {
105  loadSpinorField(src);
106  }
107  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
108  // do nothing
109  } else {
110  errorQuda("CreateType %d not implemented", param.create);
111  }
112 
114  }
115 
117  : ColorSpinorField(src), alloc(false), init(true), texInit(false) {
118  create(QUDA_COPY_FIELD_CREATE);
119  if (typeid(src) == typeid(cudaColorSpinorField) &&
120  isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative()) {
121  copy(dynamic_cast<const cudaColorSpinorField&>(src));
122  } else if (typeid(src) == typeid(cpuColorSpinorField) || typeid(src) == typeid(cudaColorSpinorField)) {
123  loadSpinorField(src);
124  } else {
125  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
126  }
127 
129  }
130 
132  if (typeid(src) == typeid(cudaColorSpinorField)) {
133  *this = (dynamic_cast<const cudaColorSpinorField&>(src));
134  } else if (typeid(src) == typeid(cpuColorSpinorField)) {
135  *this = (dynamic_cast<const cpuColorSpinorField&>(src));
136  } else {
137  errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
138  }
139  return *this;
140  }
141 
143  if (&src != this) {
144  // keep current attributes unless unset
145  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
146  destroy();
148  create(QUDA_COPY_FIELD_CREATE);
149  }
150  if (isNative() && src.isNative()) copy(src);
151  else errorQuda("Cannot copy using non-native fields");
152  }
153  return *this;
154  }
155 
157  // keep current attributes unless unset
158  if (!ColorSpinorField::init) { // note this will turn a reference field into a regular field
159  destroy();
161  create(QUDA_COPY_FIELD_CREATE);
162  }
163  loadSpinorField(src);
164  return *this;
165  }
166 
168  destroy();
169  }
170 
171  // return true if the field is a native field (ordering, precision, spin)
172  bool cudaColorSpinorField::isNative() const {
173 
175  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
176  } else if (precision == QUDA_SINGLE_PRECISION) {
177  if (nSpin == 4) {
178  if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) return true;
179  } else if (nSpin == 1) {
180  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
181  }
182  } else if (precision == QUDA_HALF_PRECISION) {
183  if (nSpin == 4) {
184  if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) return true;
185  } else if (nSpin == 1) {
186  if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) return true;
187  }
188  }
189 
190  return false;
191  }
192 
193  void cudaColorSpinorField::create(const QudaFieldCreate create) {
194 
196  errorQuda("Subset not implemented");
197  }
198 
199  //FIXME: This addition is temporary to ensure we have the correct
200  //field order for a given precision
201  //if (precision == QUDA_DOUBLE_PRECISION) fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
202  //else fieldOrder = (nSpin == 4) ? QUDA_FLOAT4_FIELD_ORDER : QUDA_FLOAT2_FIELD_ORDER;
203 
204  if (create != QUDA_REFERENCE_FIELD_CREATE) {
205  v = device_malloc(bytes);
208  }
209  alloc = true;
210  }
211 
212  // Check if buffer isn't big enough
213  if ((bytes > bufferBytes) && bufferInit) {
214  host_free(buffer_h);
215  buffer_h = NULL;
217  device_free(buffer_d);
218  buffer_d = NULL;
219  }
220  bufferInit = false;
221  }
222 
223  if (!bufferInit) {
224  bufferBytes = bytes;
225  buffer_h = pinned_malloc(bufferBytes);
227  buffer_d = device_malloc(bufferBytes);
228  }
229  bufferInit = true;
230  }
231 
233  // create the associated even and odd subsets
235  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
236  param.nDim = nDim;
237  memcpy(param.x, x, nDim*sizeof(int));
238  param.x[0] /= 2; // set single parity dimensions
239  param.create = QUDA_REFERENCE_FIELD_CREATE;
240  param.v = v;
241  param.norm = norm;
242  even = new cudaColorSpinorField(*this, param);
243  odd = new cudaColorSpinorField(*this, param);
244 
245  // need this hackery for the moment (need to locate the odd pointer half way into the full field)
246  (dynamic_cast<cudaColorSpinorField*>(odd))->v = (void*)((char*)v + bytes/2);
248  (dynamic_cast<cudaColorSpinorField*>(odd))->norm = (void*)((char*)norm + norm_bytes/2);
249 
250 #ifdef USE_TEXTURE_OBJECTS
251  dynamic_cast<cudaColorSpinorField*>(even)->destroyTexObject();
252  dynamic_cast<cudaColorSpinorField*>(even)->createTexObject();
253  dynamic_cast<cudaColorSpinorField*>(odd)->destroyTexObject();
254  dynamic_cast<cudaColorSpinorField*>(odd)->createTexObject();
255 #endif
256  }
257 
258  if (create != QUDA_REFERENCE_FIELD_CREATE) {
260  zeroPad();
261  } else {
262  (dynamic_cast<cudaColorSpinorField*>(even))->zeroPad();
263  (dynamic_cast<cudaColorSpinorField*>(odd))->zeroPad();
264  }
265  }
266 
267 #ifdef USE_TEXTURE_OBJECTS
268  createTexObject();
269 #endif
270 
271  checkCudaError();
272  }
273 
274 #ifdef USE_TEXTURE_OBJECTS
275  void cudaColorSpinorField::createTexObject() {
276 
277  if (texInit) errorQuda("Already bound textures");
278 
279  // create the texture for the field components
280 
281  cudaChannelFormatDesc desc;
282  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
283  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
284  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
285 
286  // staggered fields in half and single are always two component
288  desc.x = 8*precision;
289  desc.y = 8*precision;
290  desc.z = 0;
291  desc.w = 0;
292  } else { // all others are four component
293  desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
294  desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
295  desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
296  desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 32 : 8*precision;
297  }
298 
299  cudaResourceDesc resDesc;
300  memset(&resDesc, 0, sizeof(resDesc));
301  resDesc.resType = cudaResourceTypeLinear;
302  resDesc.res.linear.devPtr = v;
303  resDesc.res.linear.desc = desc;
304  resDesc.res.linear.sizeInBytes = bytes;
305 
306  cudaTextureDesc texDesc;
307  memset(&texDesc, 0, sizeof(texDesc));
308  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
309  else texDesc.readMode = cudaReadModeElementType;
310 
311  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
312  checkCudaError();
313 
314  // create the texture for the norm components
316  cudaChannelFormatDesc desc;
317  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
318  desc.f = cudaChannelFormatKindFloat;
319  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
320 
321  cudaResourceDesc resDesc;
322  memset(&resDesc, 0, sizeof(resDesc));
323  resDesc.resType = cudaResourceTypeLinear;
324  resDesc.res.linear.devPtr = norm;
325  resDesc.res.linear.desc = desc;
326  resDesc.res.linear.sizeInBytes = norm_bytes;
327 
328  cudaTextureDesc texDesc;
329  memset(&texDesc, 0, sizeof(texDesc));
330  texDesc.readMode = cudaReadModeElementType;
331 
332  cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
333  checkCudaError();
334  }
335 
336  texInit = true;
337  }
338 
339  void cudaColorSpinorField::destroyTexObject() {
340  if (texInit) {
341  cudaDestroyTextureObject(tex);
342  if (precision == QUDA_HALF_PRECISION) cudaDestroyTextureObject(texNorm);
343  texInit = false;
344  checkCudaError();
345  }
346  }
347 #endif
348 
350  if (bufferInit) {
351  host_free(buffer_h);
352  buffer_h = NULL;
354  device_free(buffer_d);
355  buffer_d = NULL;
356  }
357  bufferInit = false;
358  }
359  }
360 
361  void cudaColorSpinorField::destroy() {
362  if (alloc) {
363  device_free(v);
366  delete even;
367  delete odd;
368  }
369  alloc = false;
370  }
371 
372 #ifdef USE_TEXTURE_OBJECTS
373  destroyTexObject();
374 #endif
375 
376  }
377 
378 
381  return *(dynamic_cast<cudaColorSpinorField*>(even));
382  }
383 
384  errorQuda("Cannot return even subset of %d subset", siteSubset);
385  exit(-1);
386  }
387 
390  return *(dynamic_cast<cudaColorSpinorField*>(odd));
391  }
392 
393  errorQuda("Cannot return odd subset of %d subset", siteSubset);
394  exit(-1);
395  }
396 
397  // cuda's floating point format, IEEE-754, represents the floating point
398  // zero as 4 zero bytes
400  cudaMemset(v, 0, bytes);
401  if (precision == QUDA_HALF_PRECISION) cudaMemset(norm, 0, norm_bytes);
402  }
403 
404 
405  void cudaColorSpinorField::zeroPad() {
406  size_t pad_bytes = (stride - volume) * precision * fieldOrder;
407  int Npad = nColor * nSpin * 2 / fieldOrder;
408  for (int i=0; i<Npad; i++) {
409  if (pad_bytes) cudaMemset((char*)v + (volume + i*stride)*fieldOrder*precision, 0, pad_bytes);
410  }
411  }
412 
413  void cudaColorSpinorField::copy(const cudaColorSpinorField &src) {
414  checkField(*this, src);
415  copyCuda(*this, src);
416  }
417 
418 #include <pack_spinor.h>
419 
420 #define REORDER_SPINOR_FIELD(DST, SRC, dst, src, myNs, loc) \
421  if ((dst).Precision() == QUDA_DOUBLE_PRECISION) { \
422  if ((src).Precision() == QUDA_DOUBLE_PRECISION) { \
423  if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
424  packSpinor<3,myNs,1>((double*)DST, (double*)SRC, dst, src, loc); \
425  } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
426  packSpinor<3,myNs,2>((double*)DST, (double*)SRC, dst, src, loc); \
427  } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
428  packSpinor<3,myNs,4>((double*)DST, (double*)SRC, dst, src, loc); \
429  } \
430  } else { \
431  if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
432  packSpinor<3,myNs,1>((double*)DST, (float*)SRC, dst, src, loc); \
433  } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
434  packSpinor<3,myNs,2>((double*)DST, (float*)SRC, dst, src, loc); \
435  } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
436  packSpinor<3,myNs,4>((double*)DST, (float*)SRC, dst, src, loc); \
437  } \
438  } \
439  } else { \
440  if ((src).Precision() == QUDA_DOUBLE_PRECISION) { \
441  if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
442  packSpinor<3,myNs,1>((float*)DST, (double*)SRC, dst, src, loc); \
443  } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
444  packSpinor<3,myNs,2>((float*)DST, (double*)SRC, dst, src, loc); \
445  } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
446  packSpinor<3,myNs,4>((float*)DST, (double*)SRC, dst, src, loc); \
447  } \
448  } else { \
449  if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
450  packSpinor<3,myNs,1>((float*)DST, (float*)SRC, dst, src, loc); \
451  } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
452  packSpinor<3,myNs,2>((float*)DST, (float*)SRC, dst, src, loc); \
453  } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
454  packSpinor<3,myNs,4>((float*)DST, (float*)SRC, dst, src, loc); \
455  } \
456  } \
457  }
458 
459 
460  void cudaColorSpinorField::resizeBuffer(size_t bytes) const {
461  if (bytes > bufferBytes) {
462  device_free(buffer_d);
463  host_free(buffer_h);
464  buffer_d = device_malloc(bytes);
465  buffer_h = pinned_malloc(bytes);
466  }
467  }
468 
469  void cudaColorSpinorField::loadSpinorField(const ColorSpinorField &src) {
470 
471  if (nColor != 3) errorQuda("Nc != 3 not yet supported");
472 
474  ColorSpinorParam param(*this); // acquire all attributes of this
475  param.setPrecision(QUDA_SINGLE_PRECISION); // change precision
476  param.create = QUDA_COPY_FIELD_CREATE;
477  cudaColorSpinorField tmp(src, param);
478  copy(tmp);
479  return;
480  }
481 
482  // no native support for this yet - copy to a native supported order
483  if (src.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
484  ColorSpinorParam param(src); // acquire all attributes of this
485  param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
486  param.create = QUDA_NULL_FIELD_CREATE;
487  cpuColorSpinorField tmp(param);
488  tmp.copy(src);
489  loadSpinorField(tmp);
490  return;
491  }
492 
493  if (REORDER_LOCATION == QUDA_CPU_FIELD_LOCATION && typeid(src) == typeid(cpuColorSpinorField)) {
494  // (temporary?) bug fix for padding
495  memset(buffer_h, 0, bytes);
496 
497  switch(nSpin){
498  case 1:
499  REORDER_SPINOR_FIELD(buffer_h, dynamic_cast<const cpuColorSpinorField&>(src).V(),
500  *this, src, 1, QUDA_CPU_FIELD_LOCATION);
501  break;
502  case 4:
503  REORDER_SPINOR_FIELD(buffer_h, dynamic_cast<const cpuColorSpinorField&>(src).V(),
504  *this, src, 4, QUDA_CPU_FIELD_LOCATION);
505  break;
506  default:
507  errorQuda("invalid number of spinors");
508  }
509  cudaMemcpy(v, buffer_h, bytes, cudaMemcpyHostToDevice);
510 
511  } else {
512  // (temporary?) bug fix for padding
513  cudaMemset(v, 0, bytes);
514 
515  if (typeid(src) == typeid(cpuColorSpinorField)) {
516  resizeBuffer(src.Bytes());
517  cudaMemcpy(buffer_d, dynamic_cast<const cpuColorSpinorField&>(src).V(), src.Bytes(), cudaMemcpyHostToDevice);
518  }
519 
520  const void *source = typeid(src) == typeid(cudaColorSpinorField) ?
521  dynamic_cast<const cudaColorSpinorField&>(src).V() : buffer_d;
522 
523  switch(nSpin){
524  case 1:
525  REORDER_SPINOR_FIELD(v, source, *this, src, 1, QUDA_CUDA_FIELD_LOCATION);
526  break;
527  case 4:
528  REORDER_SPINOR_FIELD(v, source, *this, src, 4, QUDA_CUDA_FIELD_LOCATION);
529  break;
530  default:
531  errorQuda("invalid number of spinors");
532  }
533 
534  }
535 
536  checkCudaError();
537  return;
538  }
539 
540 
541  void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest) const {
542 
543  if (nColor != 3) errorQuda("Nc != 3 not yet supported");
544 
546  ColorSpinorParam param(*this); // acquire all attributes of this
547  param.precision = QUDA_SINGLE_PRECISION; // change precision
548  param.create = QUDA_COPY_FIELD_CREATE;
549  cudaColorSpinorField tmp(*this, param);
550  tmp.saveSpinorField(dest);
551  return;
552  }
553 
554  // no native support for this yet - copy to a native supported order
555  if (dest.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
556  if (typeid(dest) == typeid(cudaColorSpinorField)) errorQuda("Must use a cpuColorSpinorField here");
557  ColorSpinorParam param(dest); // acquire all attributes of this
558  param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
559  param.create = QUDA_NULL_FIELD_CREATE;
560  cpuColorSpinorField tmp(param);
561  saveSpinorField(tmp);
562  dynamic_cast<cpuColorSpinorField&>(dest).copy(tmp);
563  return;
564  }
565 
566  if (REORDER_LOCATION == QUDA_CPU_FIELD_LOCATION && typeid(dest) == typeid(cpuColorSpinorField)) {
567  cudaMemcpy(buffer_h, v, bytes, cudaMemcpyDeviceToHost);
568 
569  switch(nSpin){
570  case 1:
571  REORDER_SPINOR_FIELD(dynamic_cast<cpuColorSpinorField&>(dest).V(), buffer_h,
572  dest, *this, 1, QUDA_CPU_FIELD_LOCATION);
573  break;
574  case 4:
575  REORDER_SPINOR_FIELD(dynamic_cast<cpuColorSpinorField&>(dest).V(), buffer_h,
576  dest, *this, 4, QUDA_CPU_FIELD_LOCATION);
577  break;
578  default:
579  errorQuda("invalid number of spinors in function");
580  }
581  } else {
582 
583  if (typeid(dest)==typeid(cpuColorSpinorField)) resizeBuffer(dest.Bytes());
584 
585  void *dst = (typeid(dest)==typeid(cudaColorSpinorField)) ? dynamic_cast<cudaColorSpinorField&>(dest).V() : buffer_d;
586 
587  switch(nSpin){
588  case 1:
589  REORDER_SPINOR_FIELD(dst, v, dest, *this, 1, QUDA_CUDA_FIELD_LOCATION);
590  break;
591  case 4:
592  REORDER_SPINOR_FIELD(dst, v, dest, *this, 4, QUDA_CUDA_FIELD_LOCATION);
593  break;
594  default:
595  errorQuda("invalid number of spinors in function");
596  }
597 
598  if (typeid(dest) == typeid(cpuColorSpinorField)) {
599  cudaMemcpy(dynamic_cast<cpuColorSpinorField&>(dest).V(), buffer_d, dest.Bytes(), cudaMemcpyDeviceToHost);
600  }
601  }
602 
603  checkCudaError();
604  return;
605  }
606 
608  int nFace = (nSpin == 1) ? 3 : 1; //3 faces for asqtad
609  int Nint = nColor * nSpin * 2; // number of internal degrees of freedom
610  if (nSpin == 4) Nint /= 2; // spin projection for Wilson
611 
612  // only allocate if not already allocated or precision is greater then previously
613  if(initGhostFaceBuffer == 0 || precision > facePrecision){
614 
615  if (initGhostFaceBuffer) device_free(ghostFaceBuffer);
616 
617  // allocate a single contiguous buffer for the buffers
618  size_t faceBytes = 0;
619  for (int i=0; i<4; i++) {
620  if(!commDimPartitioned(i)) continue;
621  faceBytes += 2*nFace*ghostFace[i]*Nint*precision;
622  // add extra space for the norms for half precision
623  if (precision == QUDA_HALF_PRECISION) faceBytes += 2*nFace*ghostFace[i]*sizeof(float);
624  }
625 
626  if (faceBytes > 0) {
627  ghostFaceBuffer = device_malloc(faceBytes);
628  initGhostFaceBuffer = 1;
629  facePrecision = precision;
630  }
631 
632  }
633 
634  size_t offset = 0;
635  for (int i=0; i<4; i++) {
636  if(!commDimPartitioned(i)) continue;
637 
638  backGhostFaceBuffer[i] = (void*)(((char*)ghostFaceBuffer) + offset);
639  offset += nFace*ghostFace[i]*Nint*precision;
640  if (precision == QUDA_HALF_PRECISION) offset += nFace*ghostFace[i]*sizeof(float);
641 
642  fwdGhostFaceBuffer[i] = (void*)(((char*)ghostFaceBuffer) + offset);
643  offset += nFace*ghostFace[i]*Nint*precision;
644  if (precision == QUDA_HALF_PRECISION) offset += nFace*ghostFace[i]*sizeof(float);
645  }
646 
647  }
648 
649 
651  {
652  if (!initGhostFaceBuffer) return;
653 
654  device_free(ghostFaceBuffer);
655 
656  for(int i=0;i < 4; i++){
657  if(!commDimPartitioned(i)) continue;
658  backGhostFaceBuffer[i] = NULL;
659  fwdGhostFaceBuffer[i] = NULL;
660  }
661  initGhostFaceBuffer = 0;
662  }
663 
664  // pack the ghost zone into a contiguous buffer for communications
665  void cudaColorSpinorField::packGhost(const QudaParity parity, const int dagger, cudaStream_t *stream)
666  {
667 #ifdef MULTI_GPU
668  packFace(ghostFaceBuffer, *this, dagger, parity, *stream);
669 #else
670  errorQuda("packGhost not built on single-GPU build");
671 #endif
672 
673  }
674 
675  // send the ghost zone to the host
676  void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir,
677  const int dagger, cudaStream_t *stream) {
678 
679 #ifdef MULTI_GPU
680  int Nvec = (nSpin == 1 || precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
681  int nFace = (nSpin == 1) ? 3 : 1; //3 faces for asqtad
682  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
683 
684  if (dim !=3 || getKernelPackT()) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
685 
686  size_t bytes = nFace*Nint*ghostFace[dim]*precision;
687  if (precision == QUDA_HALF_PRECISION) bytes += nFace*ghostFace[dim]*sizeof(float);
688  void* gpu_buf =
689  (dir == QUDA_BACKWARDS) ? this->backGhostFaceBuffer[dim] : this->fwdGhostFaceBuffer[dim];
690 
691  cudaMemcpyAsync(ghost_spinor, gpu_buf, bytes, cudaMemcpyDeviceToHost, *stream);
692  } else { // do multiple cudaMemcpys
693 
694  int Npad = Nint / Nvec; // number Nvec buffers we have
695  int Nt_minus1_offset = (volume - nFace*ghostFace[3]); // N_t -1 = Vh-Vsh
696 
697  int offset = 0;
698  if (nSpin == 1) {
699  offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
700  } else if (nSpin == 4) {
701  // !dagger: send lower components backwards, send upper components forwards
702  // dagger: send upper components backwards, send lower components forwards
703  bool upper = dagger ? true : false; // Fwd is !Back
704  if (dir == QUDA_FORWARDS) upper = !upper;
705  int lower_spin_offset = Npad*stride;
706  if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
707  else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
708  }
709 
710  // QUDA Memcpy NPad's worth.
711  // -- Dest will point to the right beginning PAD.
712  // -- Each Pad has size Nvec*Vsh Floats.
713  // -- There is Nvec*Stride Floats from the start of one PAD to the start of the next
714 
715  void *dst = (char*)ghost_spinor;
716  void *src = (char*)v + offset*Nvec*precision;
717  size_t len = nFace*ghostFace[3]*Nvec*precision;
718  size_t spitch = stride*Nvec*precision;
719  cudaMemcpy2DAsync(dst, len, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
720 
721  if (precision == QUDA_HALF_PRECISION) {
722  int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*sizeof(float);
723  void *dst = (char*)ghost_spinor + nFace*Nint*ghostFace[3]*precision;
724  void *src = (char*)norm + norm_offset;
725  cudaMemcpyAsync(dst, src, nFace*ghostFace[3]*sizeof(float), cudaMemcpyDeviceToHost, *stream);
726  }
727  }
728 #else
729  errorQuda("sendGhost not built on single-GPU build");
730 #endif
731 
732  }
733 
734  void cudaColorSpinorField::unpackGhost(void* ghost_spinor, const int dim,
735  const QudaDirection dir,
736  const int dagger, cudaStream_t* stream)
737  {
738  int nFace = (nSpin == 1) ? 3 : 1; //3 faces for asqtad
739  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin proj.) degrees of freedom
740 
741  int len = nFace*ghostFace[dim]*Nint;
742  int offset = length + ghostOffset[dim]*nColor*nSpin*2;
743  offset += (dir == QUDA_BACKWARDS) ? 0 : len;
744 
745  void *dst = (char*)v + precision*offset;
746  void *src = ghost_spinor;
747 
748  cudaMemcpyAsync(dst, src, len*precision, cudaMemcpyHostToDevice, *stream);
749 
750  if (precision == QUDA_HALF_PRECISION) {
751  int normlen = nFace*ghostFace[dim];
752  int norm_offset = stride + ghostNormOffset[dim];
753  norm_offset += (dir == QUDA_BACKWARDS) ? 0 : normlen;
754 
755  void *dst = (char*)norm + norm_offset*sizeof(float);
756  void *src = (char*)ghost_spinor+nFace*Nint*ghostFace[dim]*precision; // norm region of host ghost zone
757  cudaMemcpyAsync(dst, src, normlen*sizeof(float), cudaMemcpyHostToDevice, *stream);
758  }
759 
760  }
761 
762  // Return the location of the field
765  }
766 
767  std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
768  out << (const ColorSpinorField&)a;
769  out << "v = " << a.v << std::endl;
770  out << "norm = " << a.norm << std::endl;
771  out << "alloc = " << a.alloc << std::endl;
772  out << "init = " << a.init << std::endl;
773  return out;
774  }
775 
776 } // namespace quda