QUDA v0.7.0
A library for QCD on GPUs
cuda_gauge_field.cu
1 #include <string.h>
2 #include <gauge_field.h>
3 #include <face_quda.h>
4 #include <typeinfo>
5 #include <misc_helpers.h>
6 #include <blas_quda.h>
7 
8 namespace quda {
9 
10  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
11  GaugeField(param), gauge(0), even(0), odd(0), backed_up(false)
12  {
13  if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
14  create != QUDA_REFERENCE_FIELD_CREATE) {
15  errorQuda("QDP ordering only supported for reference fields");
16  }
17 
18  if (order == QUDA_MILC_GAUGE_ORDER ||
19  order == QUDA_CPS_WILSON_GAUGE_ORDER ||
20  order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER)
21  errorQuda("Field ordering %d presently disabled for this type", order);
22 
23 #ifdef MULTI_GPU
24  if (link_type != QUDA_ASQTAD_MOM_LINKS &&
25  ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
26  isNative()) {
27  bool pad_check = true;
28  for (int i=0; i<nDim; i++)
29  if (pad < nFace*surfaceCB[i]) pad_check = false;
30  if (!pad_check)
31  errorQuda("cudaGaugeField being constructed with insufficient padding\n");
32  }
33 #endif
34 
35  if (create != QUDA_NULL_FIELD_CREATE &&
36  create != QUDA_ZERO_FIELD_CREATE &&
37  create != QUDA_REFERENCE_FIELD_CREATE) {
38  errorQuda("ERROR: create type(%d) not supported yet\n", create);
39  }
40 
41  if (create != QUDA_REFERENCE_FIELD_CREATE) {
42  gauge = device_malloc(bytes);
43  if (create == QUDA_ZERO_FIELD_CREATE) cudaMemset(gauge, 0, bytes);
44  } else {
45  gauge = param.gauge;
46  }
47 
48  if ( !isNative() ) {
49  for (int i=0; i<nDim; i++) {
50  size_t nbytes = nFace * surface[i] * reconstruct * precision;
51  ghost[i] = nbytes ? device_malloc(nbytes) : NULL;
52  }
53  }
54 
55  if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
56  if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost();
57  }
58 
59  even = gauge;
60  odd = (char*)gauge + bytes/2;
61 
62 #ifdef USE_TEXTURE_OBJECTS
63  createTexObject(evenTex, even);
64  createTexObject(oddTex, odd);
65  if (reconstruct == QUDA_RECONSTRUCT_13 || reconstruct == QUDA_RECONSTRUCT_9)
66  { // Create texture objects for the phases
67  const int isPhase = 1;
68  createTexObject(evenPhaseTex, (char*)even + phase_offset, isPhase);
69  createTexObject(oddPhaseTex, (char*)odd + phase_offset, isPhase);
70  }
71 #endif
72 
73  }
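
  // Illustrative usage sketch: constructing a device field from an already
  // populated GaugeFieldParam. The helper name and the create-type choice are
  // assumptions made for this example, not part of the original file.
  static cudaGaugeField* exampleCreateDeviceGauge(GaugeFieldParam &gauge_param)
  {
    gauge_param.create = QUDA_NULL_FIELD_CREATE; // allocate device storage, leave it uninitialized
    return new cudaGaugeField(gauge_param);      // invokes the constructor defined above
  }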
74 
75 #ifdef USE_TEXTURE_OBJECTS
76  void cudaGaugeField::createTexObject(cudaTextureObject_t &tex, void *field, int isPhase) {
77 
78  if( isNative() ){
79  // create the texture for the field components
80  cudaChannelFormatDesc desc;
81  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
82  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
83  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
84 
85  if(isPhase){
86  if (precision == QUDA_DOUBLE_PRECISION) {
87  desc.x = 8*sizeof(int);
88  desc.y = 8*sizeof(int);
89  desc.z = 0;
90  desc.w = 0;
91  }else{
92  desc.x = 8*precision;
93  desc.y = desc.z = desc.w = 0;
94  }
95  }else{
96  // always four components regardless of precision
97  if (precision == QUDA_DOUBLE_PRECISION) {
98  desc.x = 8*sizeof(int);
99  desc.y = 8*sizeof(int);
100  desc.z = 8*sizeof(int);
101  desc.w = 8*sizeof(int);
102  } else {
103  desc.x = 8*precision;
104  desc.y = 8*precision;
105  desc.z = (reconstruct == 18) ? 0 : 8*precision; // float2 or short2 for 18 reconstruct
106  desc.w = (reconstruct == 18) ? 0 : 8*precision;
107  }
108  }
109 
110  cudaResourceDesc resDesc;
111  memset(&resDesc, 0, sizeof(resDesc));
112  resDesc.resType = cudaResourceTypeLinear;
113  resDesc.res.linear.devPtr = field;
114  resDesc.res.linear.desc = desc;
115  resDesc.res.linear.sizeInBytes = isPhase ? phase_bytes/2 : (bytes-phase_bytes)/2;
116 
117  cudaTextureDesc texDesc;
118  memset(&texDesc, 0, sizeof(texDesc));
119  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
120  else texDesc.readMode = cudaReadModeElementType;
121 
122  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
123  checkCudaError();
124  }
125  }
126 
127  void cudaGaugeField::destroyTexObject() {
128  if( isNative() ){
129  cudaDestroyTextureObject(evenTex);
130  cudaDestroyTextureObject(oddTex);
131  if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
132  cudaDestroyTextureObject(evenPhaseTex);
133  cudaDestroyTextureObject(oddPhaseTex);
134  }
135  checkCudaError();
136  }
137  }
138 #endif
139 
140  cudaGaugeField::~cudaGaugeField()
141  {
142 #ifdef USE_TEXTURE_OBJECTS
143  destroyTexObject();
144 #endif
145 
146  if (create != QUDA_REFERENCE_FIELD_CREATE) {
147  if (gauge) device_free(gauge);
148  }
149 
150  if ( !isNative() ) {
151  for (int i=0; i<nDim; i++) {
152  if (ghost[i]) device_free(ghost[i]);
153  }
154  }
155 
156  }
157 
158  // This does the exchange of the gauge field ghost zone and places it
159  // into the ghost array.
160  void cudaGaugeField::exchangeGhost() {
161  if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD)
162  errorQuda("Cannot call exchangeGhost with ghostExchange=%d",
163  ghostExchange);
164 
165  if (geometry != QUDA_VECTOR_GEOMETRY)
166  errorQuda("Cannot exchange for %d geometry gauge field", geometry);
167 
168  void *ghost_[QUDA_MAX_DIM];
169  void *send[QUDA_MAX_DIM];
170  for (int d=0; d<nDim; d++) {
171  ghost_[d] = isNative() ? device_malloc(nFace*surface[d]*reconstruct*precision) : ghost[d];
172  send[d] = device_malloc(nFace*surface[d]*reconstruct*precision);
173  }
174 
175  // get the links into contiguous buffers
176  extractGaugeGhost(*this, send);
177 
178  // communicate between nodes
179  FaceBuffer faceBuf(x, nDim, reconstruct, nFace, precision);
180  faceBuf.exchangeLink(ghost_, send, QUDA_CUDA_FIELD_LOCATION);
181 
182  for (int d=0; d<nDim; d++) device_free(send[d]);
183 
184  if (isNative()) {
185  // copy from ghost into the padded region in gauge
186  copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, ghost_, 1);
187  for (int d=0; d<nDim; d++) device_free(ghost_[d]);
188  }
189  }
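
  // Illustrative sketch: explicitly filling the ghost zone of a field whose
  // ghostExchange is QUDA_GHOST_EXCHANGE_PAD (the only case accepted above).
  // The wrapper name is an assumption made for this example.
  static void exampleFillGhost(cudaGaugeField &u)
  {
    u.exchangeGhost(); // pack, communicate and copy into the padded region
  }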
190 
191  void cudaGaugeField::exchangeExtendedGhost(const int *R, bool no_comms_fill) {
192 
193  void *send[QUDA_MAX_DIM];
194  void *recv[QUDA_MAX_DIM];
195  void *send_d[QUDA_MAX_DIM];
196  void *recv_d[QUDA_MAX_DIM];
197  size_t bytes[QUDA_MAX_DIM];
198 
199  for (int d=0; d<nDim; d++) {
200  if (!commDimPartitioned(d) && !no_comms_fill) continue;
201  // store both parities and directions in each
202  bytes[d] = surface[d] * R[d] * geometry * reconstruct * precision;
203  send_d[d] = device_malloc(2 * bytes[d]);
204  recv_d[d] = device_malloc(2 * bytes[d]);
205  }
206 
207 #ifndef GPU_COMMS
208  void *send_h[QUDA_MAX_DIM];
209  void *recv_h[QUDA_MAX_DIM];
210  size_t total_bytes = 0;
211  for (int d=0; d<nDim; d++) {
212  if (!commDimPartitioned(d)) continue;
213  total_bytes += 4*bytes[d];
214  }
215  resizeBufferPinned(total_bytes,0);
216 
217 
218  size_t offset = 0;
219  for (int d=0; d<nDim; d++) {
220  if (!commDimPartitioned(d)) continue;
221 
222  recv_h[d] = static_cast<char*>(bufferPinned[0]) + offset;
223  send_h[d] = static_cast<char*>(recv_h[d]) + 2*bytes[d];
224  offset += 4*bytes[d];
225  }
226 #endif
227 
228  // do the exchange
229  MsgHandle *mh_send_fwd[QUDA_MAX_DIM];
230  MsgHandle *mh_send_back[QUDA_MAX_DIM];
231  MsgHandle *mh_recv_fwd[QUDA_MAX_DIM];
232  MsgHandle *mh_recv_back[QUDA_MAX_DIM];
233 
234  for (int d=0; d<nDim; d++) {
235  if (!commDimPartitioned(d)) continue;
236 #ifdef GPU_COMMS
237  recv[d] = recv_d[d];
238  send[d] = send_d[d];
239 #else
240  recv[d] = recv_h[d];
241  send[d] = send_h[d];
242 #endif
243 
244  // look into storing these for later
245  mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
246  mh_recv_fwd[d] = comm_declare_receive_relative(static_cast<char*>(recv[d])+bytes[d],
247  d, +1, bytes[d]);
248  mh_send_back[d] = comm_declare_send_relative(send[d], d, -1, bytes[d]);
249  mh_send_fwd[d] = comm_declare_send_relative(static_cast<char*>(send[d])+bytes[d],
250  d, +1, bytes[d]);
251  }
252 
253  for (int d=0; d<nDim; d++) {
254  if (!commDimPartitioned(d) && !no_comms_fill) continue;
255 
256  // FIXME why does this break if the order is switched?
257  // prepost the receives
258  if (commDimPartitioned(d)) {
259  comm_start(mh_recv_fwd[d]);
260  comm_start(mh_recv_back[d]);
261  }
262 
263  //extract into a contiguous buffer
264  extractExtendedGaugeGhost(*this, d, R, send_d, true);
265 
266  if (commDimPartitioned(d)) {
267 
268  // pipeline the forwards and backwards sending
269 #ifndef GPU_COMMS
270  cudaMemcpyAsync(send_h[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, streams[0]);
271  cudaMemcpyAsync(static_cast<char*>(send_h[d])+bytes[d],
272  static_cast<char*>(send_d[d])+bytes[d], bytes[d], cudaMemcpyDeviceToHost, streams[1]);
273 #endif
274 
275 #ifndef GPU_COMMS
276  cudaStreamSynchronize(streams[0]);
277 #endif
278  comm_start(mh_send_back[d]);
279 
280 #ifndef GPU_COMMS
281  cudaStreamSynchronize(streams[1]);
282 #endif
283  comm_start(mh_send_fwd[d]);
284 
285  // forwards recv
286  comm_wait(mh_send_back[d]);
287  comm_wait(mh_recv_fwd[d]);
288 #ifndef GPU_COMMS
289  cudaMemcpyAsync(static_cast<char*>(recv_d[d])+bytes[d],
290  static_cast<char*>(recv_h[d])+bytes[d], bytes[d], cudaMemcpyHostToDevice, streams[0]);
291 #endif
292 
293  // backwards recv
294  comm_wait(mh_send_fwd[d]);
295  comm_wait(mh_recv_back[d]);
296 #ifndef GPU_COMMS
297  cudaMemcpyAsync(recv_d[d], recv_h[d], bytes[d], cudaMemcpyHostToDevice, streams[1]);
298 #endif
299  } else { // if just doing a local exchange to fill halo then need to swap faces
300  cudaMemcpy(static_cast<char*>(recv_d[d])+bytes[d], send_d[d], bytes[d], cudaMemcpyDeviceToDevice);
301  cudaMemcpy(recv_d[d], static_cast<char*>(send_d[d])+bytes[d], bytes[d], cudaMemcpyDeviceToDevice);
302  }
303 
304  // inject back into the gauge field
305  extractExtendedGaugeGhost(*this, d, R, recv_d, false);
306  }
307 
308  for (int d=0; d<nDim; d++) {
309  if (!commDimPartitioned(d) && !no_comms_fill) continue;
310 
311  if (commDimPartitioned(d)) {
312  comm_free(mh_send_fwd[d]);
313  comm_free(mh_send_back[d]);
314  comm_free(mh_recv_back[d]);
315  comm_free(mh_recv_fwd[d]);
316  }
317 
318  device_free(send_d[d]);
319  device_free(recv_d[d]);
320  }
321 
322  }
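
  // Illustrative sketch: halo exchange for an extended (border-padded) field.
  // The extension radius R and the no_comms_fill value are assumptions made
  // for this example.
  static void exampleExchangeExtended(cudaGaugeField &extended)
  {
    const int R[QUDA_MAX_DIM] = {2, 2, 2, 2}; // halo depth per dimension
    extended.exchangeExtendedGhost(R, true);  // also fill non-partitioned directions locally
  }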
323 
324  void cudaGaugeField::setGauge(void *gauge_)
325  {
326  if (create != QUDA_REFERENCE_FIELD_CREATE) {
327  errorQuda("Setting gauge pointer is only allowed when create="
328  "QUDA_REFERENCE_FIELD_CREATE type\n");
329  }
330  gauge = gauge_;
331  }
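
  // Illustrative sketch: setGauge() may only be used on a field created with
  // QUDA_REFERENCE_FIELD_CREATE, per the check above. The device pointer here
  // is an assumption made for this example.
  static void exampleRebindReference(cudaGaugeField &ref_field, void *device_gauge_ptr)
  {
    ref_field.setGauge(device_gauge_ptr); // point the reference field at existing device memory
  }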
332 
333  void cudaGaugeField::copy(const GaugeField &src) {
334  if (this == &src) return;
335 
336  checkField(src);
337 
338  if (link_type == QUDA_ASQTAD_FAT_LINKS) {
339  fat_link_max = src.LinkMax();
340  if (precision == QUDA_HALF_PRECISION && fat_link_max == 0.0)
341  errorQuda("fat_link_max has not been computed");
342  } else {
343  fat_link_max = 1.0;
344  }
345 
346  if (typeid(src) == typeid(cudaGaugeField)) {
347  // copy field and ghost zone into this field
348  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge,
349  static_cast<const cudaGaugeField&>(src).gauge);
350  } else if (typeid(src) == typeid(cpuGaugeField)) {
351  resizeBufferPinned(bytes, 0);
352 
353  // copy field and ghost zone into bufferPinned
354  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, bufferPinned[0],
355  static_cast<const cpuGaugeField&>(src).gauge);
356 
357  // this copies over both even and odd
358  cudaMemcpy(gauge, bufferPinned[0], bytes, cudaMemcpyHostToDevice);
359  } else {
360  errorQuda("Invalid gauge field type");
361  }
362 
363  // if we have copied from a source without a pad then we need to exchange
364  if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
365  src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) {
366  exchangeGhost();
367  }
368 
369  checkCudaError();
370  }
371 
372  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, const QudaFieldLocation &pack_location)
373  {
374  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
375  if (cpu.Order() == QUDA_MILC_GAUGE_ORDER ||
376  cpu.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {
377  resizeBufferPinned(cpu.Bytes(),0);
378  memcpy(bufferPinned[0], cpu.Gauge_p(), cpu.Bytes());
379 
380  // run kernel directly using host-mapped input data
381  void *bufferPinnedMapped;
382  cudaHostGetDevicePointer(&bufferPinnedMapped, bufferPinned[0], 0);
383  copyGenericGauge(*this, cpu, QUDA_CUDA_FIELD_LOCATION, gauge, bufferPinnedMapped);
384  } else {
385  errorQuda("Not implemented for order %d", cpu.Order());
386  }
387  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) {
388  copy(cpu);
389  } else {
390  errorQuda("Invalid pack location %d", pack_location);
391  }
392 
393  }
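
  // Illustrative sketch: uploading a host field. QUDA_CPU_FIELD_LOCATION packs on
  // the host via copy(), while QUDA_CUDA_FIELD_LOCATION reorders on the device and
  // is restricted to the host orders checked above. Field names are assumptions
  // made for this example.
  static void exampleUpload(cudaGaugeField &device_u, const cpuGaugeField &host_u)
  {
    device_u.loadCPUField(host_u, QUDA_CPU_FIELD_LOCATION);
  }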
394 
395  /*
396  Copies the device gauge field to the host.
397  - no reconstruction support
398  - device data is always Float2 ordered
399  - host data is a 1-dimensional array (MILC ordered)
400  - no support for half precision
401  - input and output precisions must match
402  */
403  template<typename FloatN, typename Float>
404  static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB,
405  int stride, QudaPrecision prec)
406  {
407  cudaStream_t streams[2];
408  for (int i=0; i<2; i++) cudaStreamCreate(&streams[i]);
409 
410  FloatN *even = gauge;
411  FloatN *odd = (FloatN*)((char*)gauge + bytes/2);
412 
413  size_t datalen = 4*2*volumeCB*gaugeSiteSize*sizeof(Float); // both parities
414  void *unpacked = device_malloc(datalen);
415  void *unpackedEven = unpacked;
416  void *unpackedOdd = (char*)unpacked + datalen/2;
417 
418  //unpack even data kernel
419  link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
420 #ifdef GPU_DIRECT
421  cudaMemcpyAsync(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost, streams[0]);
422 #else
423  cudaMemcpy(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost);
424 #endif
425 
426  //unpack odd data kernel
427  link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
428 #ifdef GPU_DIRECT
429  cudaMemcpyAsync(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost, streams[1]);
430  for(int i=0; i<2; i++) cudaStreamSynchronize(streams[i]);
431 #else
432  cudaMemcpy(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost);
433 #endif
434 
435  device_free(unpacked);
436  for(int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
437  }
438 
439  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, const QudaFieldLocation &pack_location) const
440  {
441  // FIXME use the generic copying for the below copying
442  // do device-side reordering then copy
443  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
444  // check parameters are suitable for device-side packing
445  if (precision != cpu.Precision())
446  errorQuda("cpu precision %d and cuda precision %d must be the same",
447  cpu.Precision(), precision);
448 
449  if (reconstruct != QUDA_RECONSTRUCT_NO) errorQuda("Only no reconstruction supported");
450  if (order != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Only QUDA_FLOAT2_GAUGE_ORDER supported");
451  if (cpu.Order() != QUDA_MILC_GAUGE_ORDER) errorQuda("Only QUDA_MILC_GAUGE_ORDER supported");
452 
453  if (precision == QUDA_DOUBLE_PRECISION) {
454  storeGaugeField((double*)cpu.gauge, (double2*)gauge, bytes, volumeCB, stride, precision);
455  } else if (precision == QUDA_SINGLE_PRECISION){
456  storeGaugeField((float*)cpu.gauge, (float2*)gauge, bytes, volumeCB, stride, precision);
457  } else {
458  errorQuda("Half precision not supported");
459  }
460 
461  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
462  resizeBufferPinned(bytes,0);
463 
464  // this copies over both even and odd
465  cudaMemcpy(bufferPinned[0], gauge, bytes, cudaMemcpyDeviceToHost);
466  checkCudaError();
467 
468  copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, bufferPinned[0]);
469  } else {
470  errorQuda("Invalid pack location %d", pack_location);
471  }
472 
473  }
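
  // Illustrative sketch: downloading the device field back to a host field.
  // Host-side packing is the general path; device-side packing requires the
  // restrictions checked above (matching precision, no reconstruction, FLOAT2
  // device order, MILC host order). Field names are assumptions made for this example.
  static void exampleDownload(const cudaGaugeField &device_u, cpuGaugeField &host_u)
  {
    device_u.saveCPUField(host_u, QUDA_CPU_FIELD_LOCATION);
  }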
474 
475  void cudaGaugeField::backup() const {
476  if (backed_up) errorQuda("Gauge field already backed up");
477  backup_h = new char[bytes];
478  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
479  checkCudaError();
480  backed_up = true;
481  }
482 
483  void cudaGaugeField::restore() {
484  if (!backed_up) errorQuda("Cannot restore since not backed up");
485  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
486  delete []backup_h;
487  checkCudaError();
488  backed_up = false;
489  }
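
  // Illustrative sketch: a backup()/restore() round trip around a trial update.
  // The update callback and acceptance flag are assumptions made for this example.
  static void exampleTrialUpdate(cudaGaugeField &u, void (*trial_update)(cudaGaugeField &), bool accept)
  {
    u.backup();               // keep a host-side copy of the current links
    trial_update(u);          // modify the device field in place
    if (!accept) u.restore(); // roll back to the host-side copy if rejected
  }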
490 
491  void setGhostSpinor(bool value);
492 
493  // Return the L2 norm squared of the gauge field
494  double norm2(const cudaGaugeField &a) {
495 
496  if (a.FieldOrder() == QUDA_QDP_GAUGE_ORDER ||
497  a.FieldOrder() == QUDA_QDPJIT_GAUGE_ORDER)
498  errorQuda("Not implemented");
499 
500  int spin = 0;
501  switch (a.Geometry()) {
502  case QUDA_SCALAR_GEOMETRY:
503  spin = 1;
504  break;
505  case QUDA_VECTOR_GEOMETRY:
506  spin = a.Ndim();
507  break;
508  case QUDA_TENSOR_GEOMETRY:
509  spin = a.Ndim() * (a.Ndim()-1) / 2;
510  break;
511  default:
512  errorQuda("Unsupported field geometry %d", a.Geometry());
513  }
514 
515  if (a.Precision() == QUDA_HALF_PRECISION)
516  errorQuda("Casting a cudaGaugeField into cudaColorSpinorField not possible in half precision");
517 
518  if (a.Reconstruct() == QUDA_RECONSTRUCT_13 || a.Reconstruct() == QUDA_RECONSTRUCT_9)
519  errorQuda("Unsupported field reconstruct %d", a.Reconstruct());
520 
521  ColorSpinorParam spinor_param;
522  spinor_param.nColor = a.Reconstruct()/2;
523  spinor_param.nSpin = a.Ndim();
524  spinor_param.nDim = spin;
525  for (int d=0; d<a.Ndim(); d++) spinor_param.x[d] = a.X()[d];
526  spinor_param.precision = a.Precision();
527  spinor_param.pad = a.Pad();
528  spinor_param.siteSubset = QUDA_FULL_SITE_SUBSET;
529  spinor_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
530  spinor_param.fieldOrder = (a.Precision() == QUDA_DOUBLE_PRECISION || spinor_param.nSpin == 1) ?
531  QUDA_FLOAT2_FIELD_ORDER : QUDA_FLOAT4_FIELD_ORDER;
532  spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
533  spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
534  spinor_param.v = (void*)a.Gauge_p();
535 
536  // quick hack to disable ghost zone creation which otherwise breaks this mapping on multi-gpu
537  setGhostSpinor(false);
538  cudaColorSpinorField b(spinor_param);
539  setGhostSpinor(true);
540 
541  return norm2(b);
542  }
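
  // Illustrative sketch: the norm above treats the gauge field as a color-spinor
  // field, which is why half precision and 9/13 reconstruction are rejected.
  // The wrapper name is an assumption made for this example.
  static double exampleGlobalNorm(const cudaGaugeField &u)
  {
    return norm2(u); // calls the overload defined above
  }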
543 
544 } // namespace quda