QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
cuda_gauge_field.cpp
Go to the documentation of this file.
1 #include <string.h>
2 #include <gauge_field.h>
3 #include <face_quda.h>
4 #include <typeinfo>
5 #include <misc_helpers.h>
6 #include <blas_quda.h>
7 
8 namespace quda {
9 
11  GaugeField(param), gauge(0), even(0), odd(0), backed_up(false)
12  {
14  errorQuda("ERROR: create type(%d) not supported yet\n", create);
15  }
16 
17  gauge = device_malloc(bytes);
18 
20  cudaMemset(gauge, 0, bytes);
21  }
22  even = gauge;
23  odd = (char*)gauge + bytes/2;
24 
25 #ifdef USE_TEXTURE_OBJECTS
26  createTexObject(evenTex, even);
27  createTexObject(oddTex, odd);
28 #endif
29 
30  }
31 
32 #ifdef USE_TEXTURE_OBJECTS
33  void cudaGaugeField::createTexObject(cudaTextureObject_t &tex, void *field) {
34 
35  // create the texture for the field components
36 
37  cudaChannelFormatDesc desc;
38  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
39  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
40  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
41 
42  // always four components regardless of precision
44  desc.x = 8*sizeof(int);
45  desc.y = 8*sizeof(int);
46  desc.z = 8*sizeof(int);
47  desc.w = 8*sizeof(int);
48  } else {
49  desc.x = 8*precision;
50  desc.y = 8*precision;
51  desc.z = (reconstruct == 18) ? 0 : 8*precision; // float2 or short2 for 18 reconstruct
52  desc.w = (reconstruct == 18) ? 0 : 8*precision;
53  }
54 
55  cudaResourceDesc resDesc;
56  memset(&resDesc, 0, sizeof(resDesc));
57  resDesc.resType = cudaResourceTypeLinear;
58  resDesc.res.linear.devPtr = field;
59  resDesc.res.linear.desc = desc;
60  resDesc.res.linear.sizeInBytes = bytes/2;
61 
62  cudaTextureDesc texDesc;
63  memset(&texDesc, 0, sizeof(texDesc));
64  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
65  else texDesc.readMode = cudaReadModeElementType;
66 
67  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
69  }
70 
71  void cudaGaugeField::destroyTexObject() {
72  cudaDestroyTextureObject(evenTex);
73  cudaDestroyTextureObject(oddTex);
75  }
76 #endif
77 
79  {
80 #ifdef USE_TEXTURE_OBJECTS
81  destroyTexObject();
82 #endif
83 
84  if (gauge) device_free(gauge);
85  }
86 
87  // FIXME temporary hack
88  static double anisotropy_;
89  static double fat_link_max_;
90  static const int *X_;
91  static QudaTboundary t_boundary_;
92 
93 #include <pack_gauge.h>
94 
95  template <typename Float, typename FloatN>
96  static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, Float **cpuGhost,
97  GaugeFieldOrder cpu_order, QudaReconstructType reconstruct,
98  size_t bytes, int volumeCB, int *surfaceCB, int pad, int nFace,
99  QudaLinkType type, double &fat_link_max, void *buffer)
100  {
101  // Use pinned memory
102  FloatN *packedEven = (FloatN*)buffer;
103  FloatN *packedOdd = (FloatN*)((char*)buffer+ bytes/2);
104 
105  if( ! packedEven ) errorQuda( "packedEven is borked\n");
106  if( ! packedOdd ) errorQuda( "packedOdd is borked\n");
107  if( ! even ) errorQuda( "even is borked\n");
108  if( ! odd ) errorQuda( "odd is borked\n");
109  if( ! cpuGauge ) errorQuda( "cpuGauge is borked\n");
110 
111 #ifdef MULTI_GPU
112  if (cpu_order != QUDA_QDP_GAUGE_ORDER && cpu_order != QUDA_BQCD_GAUGE_ORDER &&
113  cpu_order != QUDA_CPS_WILSON_GAUGE_ORDER && cpu_order != QUDA_MILC_GAUGE_ORDER)
114  errorQuda("Gauge order %d is not supported for multi-gpu\n", cpu_order);
115 #endif
116 
117  //for QUDA_ASQTAD_FAT_LINKS, need to find out the max value
118  //fat_link_max will be used in encoding half precision fat link
119  if(type == QUDA_ASQTAD_FAT_LINKS){
120  fat_link_max = 0.0;
121  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
122  for(int dir=0; dir < 4; dir++){
123  for(int i=0;i < 2*volumeCB*gaugeSiteSize; i++){
124  Float** tmp = (Float**)cpuGauge;
125  if( tmp[dir][i] > fat_link_max ){
126  fat_link_max = tmp[dir][i];
127  }
128  }
129  }
130  }else if(cpu_order == QUDA_MILC_GAUGE_ORDER || cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER){
131  for(int i=0; i<8*volumeCB*gaugeSiteSize; ++i){
132  if(cpuGauge[i] > fat_link_max){ fat_link_max = cpuGauge[i]; }
133  }
134  }else{
135  errorQuda("Gauge ordering scheme not supported\n");
136  }
137  }
138 
139  double fat_link_max_double = fat_link_max;
140 #ifdef MULTI_GPU
141  reduceMaxDouble(fat_link_max_double);
142 #endif
143 
144  fat_link_max = fat_link_max_double;
145 
146  int voxels[] = {volumeCB, volumeCB, volumeCB, volumeCB};
147 
148  // FIXME - hack for the moment
149  fat_link_max_ = fat_link_max;
150 
151  int nFaceLocal = 1;
152  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
153  packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, volumeCB,
154  voxels, pad, 0, nFaceLocal, type);
155  packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct, volumeCB,
156  voxels, pad, 0, nFaceLocal, type);
157  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
158  packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
159  packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
160  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
161  packMILCGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
162  packMILCGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
163  } else if (cpu_order == QUDA_BQCD_GAUGE_ORDER) {
164  packBQCDGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
165  packBQCDGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
166  } else {
167  errorQuda("Invalid gauge_order %d", cpu_order);
168  }
169 
170 #ifdef MULTI_GPU
171  if(type != QUDA_ASQTAD_MOM_LINKS){
172  // pack into the padded regions
173  packQDPGaugeField(packedEven, (Float**)cpuGhost, 0, reconstruct, volumeCB,
174  surfaceCB, pad, volumeCB, nFace, type);
175  packQDPGaugeField(packedOdd, (Float**)cpuGhost, 1, reconstruct, volumeCB,
176  surfaceCB, pad, volumeCB, nFace, type);
177  }
178 #endif
179 
180  // this copies over both even and odd
181  cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
182  checkCudaError();
183  }
184 
185 
186  template <typename Float, typename Float2>
187  void loadMomField(Float2 *even, Float2 *odd, Float *mom, int bytes, int Vh, int pad, void *buffer)
188  {
189  Float2 *packedEven = (Float2*)buffer;
190  Float2 *packedOdd = (Float2*)((char*)buffer + bytes/2);
191 
192  packMomField(packedEven, (Float*)mom, 0, Vh, pad);
193  packMomField(packedOdd, (Float*)mom, 1, Vh, pad);
194 
195  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
196  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
197  }
198 
199 
200  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, const QudaFieldLocation &pack_location)
201  {
202  if (geometry != QUDA_VECTOR_GEOMETRY) errorQuda("Only vector geometry is supported");
203 
204  checkField(cpu);
205 
206  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
207  errorQuda("Not implemented"); // awaiting Guochun's new gauge packing
208  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) {
209  // FIXME
210  anisotropy_ = anisotropy;
211  X_ = x;
212  t_boundary_ = t_boundary;
213 
214 #ifdef MULTI_GPU
215  //FIXME: if this is MOM field, we don't need exchange data
217  cpu.exchangeGhost();
218  }
219 #endif
220 
222 
223  if (reconstruct != QUDA_RECONSTRUCT_10) { // gauge field
225 
226  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
227  loadGaugeField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
229  fat_link_max, LatticeField::bufferPinned);
230  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
231  loadGaugeField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
232  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
233  fat_link_max, LatticeField::bufferPinned);
234  }
235 
236  } else if (precision == QUDA_SINGLE_PRECISION) {
237 
238  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
239  if (reconstruct == QUDA_RECONSTRUCT_NO) {
240  loadGaugeField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
241  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
242  fat_link_max, LatticeField::bufferPinned);
243  } else {
244  loadGaugeField((float4*)(even), (float4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
245  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
246  fat_link_max, LatticeField::bufferPinned);
247  }
248  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
249  if (reconstruct == QUDA_RECONSTRUCT_NO) {
250  loadGaugeField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
251  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
252  fat_link_max, LatticeField::bufferPinned);
253  } else {
254  loadGaugeField((float4*)(even), (float4*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
255  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
256  fat_link_max, LatticeField::bufferPinned);
257  }
258  }
259 
260  } else if (precision == QUDA_HALF_PRECISION) {
261 
262  if (cpu.Precision() == QUDA_DOUBLE_PRECISION){
263  if (reconstruct == QUDA_RECONSTRUCT_NO) {
264  loadGaugeField((short2*)(even), (short2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
265  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
266  fat_link_max, LatticeField::bufferPinned);
267  } else {
268  loadGaugeField((short4*)(even), (short4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
269  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
270  fat_link_max, LatticeField::bufferPinned);
271  }
272  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
273  if (reconstruct == QUDA_RECONSTRUCT_NO) {
274  loadGaugeField((short2*)(even), (short2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
275  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
276  fat_link_max, LatticeField::bufferPinned);
277  } else {
278  loadGaugeField((short4*)(even), (short4*)(odd), (float*)cpu.gauge, (float**)(cpu.ghost),
279  cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type,
280  fat_link_max, LatticeField::bufferPinned);
281  }
282  }
283  }
284  } else { // momentum field
286  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
287  loadMomField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, bytes,
288  volumeCB, pad, LatticeField::bufferPinned);
289  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
290  loadMomField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, bytes,
291  volumeCB, pad, LatticeField::bufferPinned);
292  }
293  } else {
294  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
295  loadMomField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, bytes,
296  volumeCB, pad, LatticeField::bufferPinned);
297  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
298  loadMomField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, bytes,
299  volumeCB, pad, LatticeField::bufferPinned);
300  }
301  }
302  } // gauge or momentum
303  } else {
304  errorQuda("Invalid pack location %d", pack_location);
305  }
306 
307  }
308 
309  /*
310  Generic gauge field retrieval. Copies the cuda gauge field into a
311  cpu gauge field. The reordering takes place on the host and
312  supports most gauge field options.
313  */
314  template <typename Float, typename FloatN>
315  static void storeGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
316  QudaReconstructType reconstruct, int bytes, int volumeCB, int pad, void *buffer)
317  {
318  // Use pinned memory
319  FloatN *packedEven = (FloatN*)buffer;
320  FloatN *packedOdd = (FloatN*)((char*)buffer + bytes/2);
321 
322  cudaMemcpy(buffer, gauge, bytes, cudaMemcpyDeviceToHost);
323 
324  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
325  unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
326  unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
327  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
328  unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
329  unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
330  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
331  unpackMILCGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
332  unpackMILCGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
333  } else if (cpu_order == QUDA_BQCD_GAUGE_ORDER) {
334  unpackBQCDGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
335  unpackBQCDGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
336  } else {
337  errorQuda("Invalid gauge_order");
338  }
339 
340  }
341 
342 
343 
344  /*
345  Copies the device gauge field to the host.
346  - no reconstruction support
347  - device data is always Float2 ordered
348  - host data is a 1-dimensional array (MILC ordered)
349  - no support for half precision
350  - input and output precisions must match
351  */
352  template<typename FloatN, typename Float>
353  static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB,
354  int stride, QudaPrecision prec)
355  {
356  cudaStream_t streams[2];
357  for (int i=0; i<2; i++) cudaStreamCreate(&streams[i]);
358 
359  FloatN *even = gauge;
360  FloatN *odd = (FloatN*)((char*)gauge + bytes/2);
361 
362  size_t datalen = 4*2*volumeCB*gaugeSiteSize*sizeof(Float); // both parities
363  void *unpacked = device_malloc(datalen);
364  void *unpackedEven = unpacked;
365  void *unpackedOdd = (char*)unpacked + datalen/2;
366 
367  //unpack even data kernel
368  link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
369 #ifdef GPU_DIRECT
370  cudaMemcpyAsync(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost, streams[0]);
371 #else
372  cudaMemcpy(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost);
373 #endif
374 
375  //unpack odd data kernel
376  link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
377 #ifdef GPU_DIRECT
378  cudaMemcpyAsync(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost, streams[1]);
379  for(int i=0; i<2; i++) cudaStreamSynchronize(streams[i]);
380 #else
381  cudaMemcpy(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost);
382 #endif
383 
384  device_free(unpacked);
385  for(int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
386  }
387 
388  template <typename Float, typename Float2>
389  void storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd, int bytes, int V, int pad, void *buffer)
390  {
391  Float2 *packedEven = (Float2*)buffer;
392  Float2 *packedOdd = (Float2*)((char*)buffer + bytes/2);
393 
394  cudaMemcpy(packedEven, even, bytes/2, cudaMemcpyDeviceToHost);
395  cudaMemcpy(packedOdd, odd, bytes/2, cudaMemcpyDeviceToHost);
396 
397  unpackMomField((Float*)mom, packedEven,0, V/2, pad);
398  unpackMomField((Float*)mom, packedOdd, 1, V/2, pad);
399  }
400 
401  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, const QudaFieldLocation &pack_location) const
402  {
403  if (geometry != QUDA_VECTOR_GEOMETRY) errorQuda("Only vector geometry is supported");
404 
405  // do device-side reordering then copy
406  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
407  // check parameters are suitable for device-side packing
408  if (precision != cpu.Precision())
409  errorQuda("cpu precision %d and cuda precision %d must be the same",
410  cpu.Precision(), precision );
411 
412  if (reconstruct != QUDA_RECONSTRUCT_NO)
413  errorQuda("Only no reconstruction supported");
414 
416  errorQuda("Only QUDA_FLOAT2_GAUGE_ORDER supported");
417 
418  if (cpu.Order() != QUDA_MILC_GAUGE_ORDER)
419  errorQuda("Only QUDA_MILC_GAUGE_ORDER supported");
420 
422  storeGaugeField((double*)cpu.gauge, (double2*)gauge, bytes, volumeCB, stride, precision);
423  } else if (precision == QUDA_SINGLE_PRECISION){
424  storeGaugeField((float*)cpu.gauge, (float2*)gauge, bytes, volumeCB, stride, precision);
425  } else {
426  errorQuda("Half precision not supported");
427  }
428 
429  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
430 
431  // FIXME - nasty globals
432  anisotropy_ = anisotropy;
433  fat_link_max_ = fat_link_max;
434  X_ = x;
435  t_boundary_ = t_boundary;
436 
437  resizeBuffer(bytes);
438 
439  if (reconstruct != QUDA_RECONSTRUCT_10) {
441 
442  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
443  storeGaugeField((double*)cpu.gauge, (double2*)(gauge),
444  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
445  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
446  storeGaugeField((float*)cpu.gauge, (double2*)(gauge),
447  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
448  }
449 
450  } else if (precision == QUDA_SINGLE_PRECISION) {
451 
452  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
453  if (reconstruct == QUDA_RECONSTRUCT_NO) {
454  storeGaugeField((double*)cpu.gauge, (float2*)(gauge),
455  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
456  } else {
457  storeGaugeField((double*)cpu.gauge, (float4*)(gauge),
458  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
459  }
460  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
461  if (reconstruct == QUDA_RECONSTRUCT_NO) {
462  storeGaugeField((float*)cpu.gauge, (float2*)(gauge),
463  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
464  } else {
465  storeGaugeField((float*)cpu.gauge, (float4*)(gauge),
466  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
467  }
468  }
469 
470  } else if (precision == QUDA_HALF_PRECISION) {
471 
472  if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
473  if (reconstruct == QUDA_RECONSTRUCT_NO) {
474  storeGaugeField((double*)cpu.gauge, (short2*)(gauge),
475  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
476  } else {
477  storeGaugeField((double*)cpu.gauge, (short4*)(gauge),
478  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
479  }
480  } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
481  if (reconstruct == QUDA_RECONSTRUCT_NO) {
482  storeGaugeField((float*)cpu.gauge, (short2*)(gauge),
483  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
484  } else {
485  storeGaugeField((float*)cpu.gauge, (short4*)(gauge),
486  cpu.order, reconstruct, bytes, volumeCB, pad, bufferPinned);
487  }
488  }
489  }
490  } else {
491 
492  if (cpu.Precision() != precision)
493  errorQuda("cpu and gpu precison has to be the same at this moment");
494 
496  errorQuda("half precision is not supported at this moment");
497 
498  if (cpu.order != QUDA_MILC_GAUGE_ORDER)
499  errorQuda("Only MILC gauge order supported in momentum unpack, not %d", cpu.order);
500 
502  storeMomToCPUArray( (double*)cpu.gauge, (double2*)even, (double2*)odd, bytes, volume, pad, bufferPinned);
503  }else { //SINGLE PRECISIONS
504  storeMomToCPUArray( (float*)cpu.gauge, (float2*)even, (float2*)odd, bytes, volume, pad, bufferPinned);
505  }
506  } // reconstruct 10
507  } else {
508  errorQuda("Invalid pack location %d", pack_location);
509  }
510 
511  }
512 
513  void cudaGaugeField::backup() const {
514  if (backed_up) errorQuda("Gauge field already backed up");
515  backup_h = new char[bytes];
516  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
517  checkCudaError();
518  backed_up = true;
519  }
520 
522  if (!backed_up) errorQuda("Cannot retore since not backed up");
523  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
524  delete []backup_h;
525  checkCudaError();
526  backed_up = false;
527  }
528 
529  // Return the L2 norm squared of the gauge field
530  double norm2(const cudaGaugeField &a) {
531 
532  int spin = 0;
533  switch (a.Geometry()) {
535  spin = 1;
536  break;
538  spin = a.Ndim();
539  break;
541  spin = a.Ndim() * (a.Ndim()-1);
542  break;
543  default:
544  errorQuda("Unsupported field geometry %d", a.Geometry());
545  }
546 
547  if (a.Precision() == QUDA_HALF_PRECISION)
548  errorQuda("Casting a cudaGaugeField into cudaColorSpinorField not possible in half precision");
549 
550  ColorSpinorParam spinor_param;
551  spinor_param.nColor = a.Reconstruct()/2;
552  spinor_param.nSpin = a.Ndim();
553  spinor_param.nDim = spin;
554  for (int d=0; d<a.Ndim(); d++) spinor_param.x[d] = a.X()[d];
555  spinor_param.precision = a.Precision();
556  spinor_param.pad = a.Pad();
557  spinor_param.siteSubset = QUDA_FULL_SITE_SUBSET;
558  spinor_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
559  spinor_param.fieldOrder = (QudaFieldOrder)a.FieldOrder();
560  spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
561  spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
562  spinor_param.v = (void*)a.Gauge_p();
563  cudaColorSpinorField b(spinor_param);
564  return norm2(b);
565  }
566 
567 } // namespace quda