copy_gauge_extended.cu
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>

namespace quda {

  using namespace gauge;

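  /**
     Kernel parameter struct for copying between a regular gauge field
     and an extended gauge field.  The copy direction is inferred in
     the constructor from the relative checkerboard volumes of the two
     fields.
  */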
  template <typename OutOrder, typename InOrder>
  struct CopyGaugeExArg {
    OutOrder out;
    const InOrder in;
    int Xin[QUDA_MAX_DIM];
    int Xout[QUDA_MAX_DIM];
    int volume;
    int volumeEx;
    int nDim;
    int geometry;
    int faceVolumeCB[QUDA_MAX_DIM];
    bool regularToextended; // set in the constructor from the relative field sizes

    CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *Xout, const int *Xin,
                   const int *faceVolumeCB, int nDim, int geometry)
      : out(out), in(in), nDim(nDim), geometry(geometry) {
      for (int d=0; d<nDim; d++) {
        this->Xout[d] = Xout[d];
        this->Xin[d] = Xin[d];
        this->faceVolumeCB[d] = faceVolumeCB[d];
      }

      if (out.volumeCB > in.volumeCB) {
        this->volume = 2*in.volumeCB;
        this->volumeEx = 2*out.volumeCB;
        this->regularToextended = true;
      } else {
        this->volume = 2*out.volumeCB;
        this->volumeEx = 2*in.volumeCB;
        this->regularToextended = false;
      }
    }

  };

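  /**
     Copy all links on a single site between the regular and extended
     fields.  X is the checkerboard site index into the regular field;
     the extended-field index is obtained by offsetting each coordinate
     by the halo radius R[d], computed from the difference of the two
     local lattice extents.
  */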
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
  __device__ __host__ void copyGaugeEx(CopyGaugeExArg<OutOrder,InOrder> &arg, int X, int parity) {
    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;
    constexpr int nColor = Ncolor(length);

    int x[4];
    int R[4];
    int xin, xout;
    if (regularToextended) {
      // regular to extended
      for (int d=0; d<4; d++) R[d] = (arg.Xout[d] - arg.Xin[d]) >> 1;
      int za = X/(arg.Xin[0]/2);
      int x0h = X - za*(arg.Xin[0]/2);
      int zb = za/arg.Xin[1];
      x[1] = za - zb*arg.Xin[1];
      x[3] = zb / arg.Xin[2];
      x[2] = zb - x[3]*arg.Xin[2];
      x[0] = 2*x0h + ((x[1] + x[2] + x[3] + parity) & 1);
      // xout is the checkerboard index into the extended gauge field
      xout = ((((x[3]+R[3])*arg.Xout[2] + (x[2]+R[2]))*arg.Xout[1] + (x[1]+R[1]))*arg.Xout[0]+(x[0]+R[0])) >> 1;
      xin = X;
    } else {
      // extended to regular
      for (int d=0; d<4; d++) R[d] = (arg.Xin[d] - arg.Xout[d]) >> 1;
      int za = X/(arg.Xout[0]/2);
      int x0h = X - za*(arg.Xout[0]/2);
      int zb = za/arg.Xout[1];
      x[1] = za - zb*arg.Xout[1];
      x[3] = zb / arg.Xout[2];
      x[2] = zb - x[3]*arg.Xout[2];
      x[0] = 2*x0h + ((x[1] + x[2] + x[3] + parity) & 1);
      // xin is the checkerboard index into the extended gauge field
      xin = ((((x[3]+R[3])*arg.Xin[2] + (x[2]+R[2]))*arg.Xin[1] + (x[1]+R[1]))*arg.Xin[0]+(x[0]+R[0])) >> 1;
      xout = X;
    }
    for (int d=0; d<arg.geometry; d++) {
      const Matrix<complex<RegTypeIn>,nColor> in = arg.in(d, xin, parity);
      Matrix<complex<RegTypeOut>,nColor> out = in;
      arg.out(d, xout, parity) = out;
    } // dir
  }

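  /**
     Host-side driver: loop over both parities and all sites of the
     regular field.
  */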
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
  void copyGaugeEx(CopyGaugeExArg<OutOrder,InOrder> arg) {
    for (int parity=0; parity<2; parity++) {
      for (int X=0; X<arg.volume/2; X++) {
        copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(arg, X, parity);
      }
    }
  }

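  /**
     Device kernel: one thread per site of the regular field, looping
     over both parities.
  */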
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
  __global__ void copyGaugeExKernel(CopyGaugeExArg<OutOrder,InOrder> arg) {
    for (int parity=0; parity<2; parity++) {
      int X = blockIdx.x * blockDim.x + threadIdx.x;
      if (X >= arg.volume/2) return;
      copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(arg, X, parity);
    }
  }

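  /**
     Tunable class that drives the extended-field copy, launching
     either the host loop or the device kernel according to the field
     location.
  */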
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
  class CopyGaugeEx : Tunable {
    CopyGaugeExArg<OutOrder,InOrder> arg;
    const GaugeField &meta; // use for metadata
    QudaFieldLocation location;

  private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.volume/2; }

  public:
    CopyGaugeEx(CopyGaugeExArg<OutOrder,InOrder> &arg, const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("out_stride=%d,in_stride=%d,geometry=%d",arg.out.stride,arg.in.stride,arg.geometry);
    }
    virtual ~CopyGaugeEx() { ; }

    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if (location == QUDA_CPU_FIELD_LOCATION) {
        if (arg.regularToextended) copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, true>(arg);
        else copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, false>(arg);
      } else if (location == QUDA_CUDA_FIELD_LOCATION) {
        if (arg.regularToextended) copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, true>
          <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        else copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, false>
          <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
      }
    }

    TuneKey tuneKey() const {
      return TuneKey(meta.VolString(), typeid(*this).name(), aux);
    }

    long long flops() const { return 0; }
    long long bytes() const {
      int sites = 4*arg.volume/2;
      return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
                           + arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
    }
  };

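  /**
     Construct the kernel argument struct and launch the copy for a
     concrete pair of field orders.
  */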
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
  void copyGaugeEx(OutOrder outOrder, const InOrder inOrder, const int *E,
                   const int *X, const int *faceVolumeCB, const GaugeField &meta, QudaFieldLocation location) {

    CopyGaugeExArg<OutOrder,InOrder>
      arg(outOrder, inOrder, E, X, faceVolumeCB, meta.Ndim(), meta.Geometry());
    CopyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder> copier(arg, meta, location);
    copier.apply(0);
    if (location == QUDA_CUDA_FIELD_LOCATION) checkCudaError();
  }

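  /**
     Dispatch on the output field order and reconstruction type,
     instantiating the matching accessor.
  */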
  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
  void copyGaugeEx(const InOrder &inOrder, const int *X, GaugeField &out,
                   QudaFieldLocation location, FloatOut *Out) {

    int faceVolumeCB[QUDA_MAX_DIM];
    for (int i=0; i<4; i++) faceVolumeCB[i] = out.SurfaceCB(i) * out.Nface();

    if (out.isNative()) {
      if (out.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_NO>::type G;
        copyGaugeEx<FloatOut, FloatIn, length>(G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_12) {
#if QUDA_RECONSTRUCT & 2
        typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_12>::type G;
        copyGaugeEx<FloatOut,FloatIn,length>
          (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
#endif
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_8) {
#if QUDA_RECONSTRUCT & 1
        typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_8>::type G;
        copyGaugeEx<FloatOut,FloatIn,length>
          (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
#endif
#ifdef GPU_STAGGERED_DIRAC
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
#if QUDA_RECONSTRUCT & 2
        typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_13>::type G;
        copyGaugeEx<FloatOut,FloatIn,length>
          (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT);
#endif
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_9) {
#if QUDA_RECONSTRUCT & 1
        typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_9>::type G;
        copyGaugeEx<FloatOut,FloatIn,length>
          (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
#endif
#endif // GPU_STAGGERED_DIRAC
      } else {
        errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
      }
    } else if (out.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (QDPOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("QDP interface has not been built\n");
#endif

    } else if (out.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (MILCOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("MILC interface has not been built\n");
#endif

    } else if (out.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (TIFROrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("TIFR interface has not been built\n");
#endif

    } else {
      errorQuda("Gauge field %d order not supported", out.Order());
    }

  }

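  /**
     Dispatch on the input field order and reconstruction type,
     instantiating the matching accessor.
  */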
  template <typename FloatOut, typename FloatIn, int length>
  void copyGaugeEx(GaugeField &out, const GaugeField &in, QudaFieldLocation location,
                   FloatOut *Out, FloatIn *In) {

    if (in.isNative()) {
      if (in.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_NO>::type G;
        copyGaugeEx<FloatOut, FloatIn, length>(G(in, In), in.X(), out, location, Out);
      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
#if QUDA_RECONSTRUCT & 2
        typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_12>::type G;
        copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
#endif
      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
#if QUDA_RECONSTRUCT & 1
        typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_8>::type G;
        copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
#endif
#ifdef GPU_STAGGERED_DIRAC
      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
#if QUDA_RECONSTRUCT & 2
        typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_13>::type G;
        copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT);
#endif
      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
#if QUDA_RECONSTRUCT & 1
        typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_9>::type G;
        copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
#else
        errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
#endif
#endif // GPU_STAGGERED_DIRAC
      } else {
        errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
      }
    } else if (in.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>(QDPOrder<FloatIn,length>(in, In),
                                           in.X(), out, location, Out);
#else
      errorQuda("QDP interface has not been built\n");
#endif

    } else if (in.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>(MILCOrder<FloatIn,length>(in, In),
                                           in.X(), out, location, Out);
#else
      errorQuda("MILC interface has not been built\n");
#endif

    } else if (in.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>(TIFROrder<FloatIn,length>(in, In),
                                           in.X(), out, location, Out);
#else
      errorQuda("TIFR interface has not been built\n");
#endif

    } else {
      errorQuda("Gauge field %d order not supported", in.Order());
    }

  }

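  /**
     Check field compatibility and instantiate on the link length
     (18 real numbers per SU(3) matrix).
  */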
  template <typename FloatOut, typename FloatIn>
  void copyGaugeEx(GaugeField &out, const GaugeField &in, QudaFieldLocation location,
                   FloatOut *Out, FloatIn *In) {

    if (in.Ncolor() != 3 || out.Ncolor() != 3) {
      errorQuda("Unsupported number of colors; out.Nc=%d, in.Nc=%d", out.Ncolor(), in.Ncolor());
    }

    if (out.Geometry() != in.Geometry()) {
      errorQuda("Field geometries %d %d do not match", out.Geometry(), in.Geometry());
    }

    if (in.LinkType() != QUDA_ASQTAD_MOM_LINKS && out.LinkType() != QUDA_ASQTAD_MOM_LINKS) {
      // we are doing gauge field packing
      copyGaugeEx<FloatOut,FloatIn,18>(out, in, location, Out, In);
    } else {
      errorQuda("Not supported");
    }
  }

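  /**
     Entry point for copying a regular gauge field into an extended
     gauge field (or back); dispatches on the two field precisions.
  */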
  void copyExtendedGauge(GaugeField &out, const GaugeField &in,
                         QudaFieldLocation location, void *Out, void *In) {

    for (int d=0; d<in.Ndim(); d++) {
      if ( (out.X()[d] - in.X()[d]) % 2 != 0)
        errorQuda("Cannot copy into an asymmetrically extended gauge field");
    }

    if (out.Precision() == QUDA_DOUBLE_PRECISION) {
      if (in.Precision() == QUDA_DOUBLE_PRECISION) {
        copyGaugeEx(out, in, location, (double*)Out, (double*)In);
      } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 4
        copyGaugeEx(out, in, location, (double*)Out, (float*)In);
#else
        errorQuda("QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
#endif
      }
    } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
      if (in.Precision() == QUDA_DOUBLE_PRECISION) {
        copyGaugeEx(out, in, location, (float*)Out, (double*)In);
      } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 4
        copyGaugeEx(out, in, location, (float*)Out, (float*)In);
#else
        errorQuda("QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
#endif
      }
    }

  }

} // namespace quda
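
Usage note: a minimal sketch of how this entry point is typically driven from calling code. The function name fillExtended, the field arguments, and the halo-radius array are assumptions for illustration only; copyExtendedGauge() is the one symbol defined in this file.

  // hypothetical caller: fill the interior of an extended device field
  // from a regular device field, then refresh its halo regions
  void fillExtended(quda::cudaGaugeField &gaugeEx, // extended field (assumed)
                    quda::cudaGaugeField &gauge,   // regular field (assumed)
                    const int R[4])                // halo radius per dimension (assumed)
  {
    // copy the regular field into the interior of the extended field;
    // the coordinate offset (gaugeEx.X()[d] - gauge.X()[d])/2 is
    // recomputed internally from the two field extents
    quda::copyExtendedGauge(gaugeEx, gauge, QUDA_CUDA_FIELD_LOCATION);

    // the halo sites are not written by the copy; they must be
    // populated separately, e.g. by exchanging extended ghost zones
    gaugeEx.exchangeExtendedGhost(R, true);
  }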