QUDA  0.9.0
copy_gauge_extended.cu
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 
3 namespace quda {
4 
5  using namespace gauge;
6 
10  template <typename OutOrder, typename InOrder>
11  struct CopyGaugeExArg {
12  OutOrder out;
13  const InOrder in;
14  int Xin[QUDA_MAX_DIM];
15  int Xout[QUDA_MAX_DIM];
16  int volume;
17  int volumeEx;
18  int nDim;
19  int geometry;
20  int faceVolumeCB[QUDA_MAX_DIM];
22  CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *Xout, const int *Xin,
23  const int *faceVolumeCB, int nDim, int geometry)
24  : out(out), in(in), nDim(nDim), geometry(geometry) {
25  for (int d=0; d<nDim; d++) {
26  this->Xout[d] = Xout[d];
27  this->Xin[d] = Xin[d];
28  this->faceVolumeCB[d] = faceVolumeCB[d];
29  }
30 
31  if (out.volumeCB > in.volumeCB) {
32  this->volume = 2*in.volumeCB;
33  this->volumeEx = 2*out.volumeCB;
34  this->regularToextended = true;
35  } else {
36  this->volume = 2*out.volumeCB;
37  this->volumeEx = 2*in.volumeCB;
38  this->regularToextended = false;
39  }
40  }
41 
42  };
43 
47  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
48  __device__ __host__ void copyGaugeEx(CopyGaugeExArg<OutOrder,InOrder> &arg, int X, int parity) {
49  typedef typename mapper<FloatIn>::type RegTypeIn;
50  typedef typename mapper<FloatOut>::type RegTypeOut;
51 
52  int x[4];
53  int R[4];
54  int xin, xout;
55  if(regularToextended){
56  //regular to extended
57  for (int d=0; d<4; d++) R[d] = (arg.Xout[d] - arg.Xin[d]) >> 1;
58  int za = X/(arg.Xin[0]/2);
59  int x0h = X - za*(arg.Xin[0]/2);
60  int zb = za/arg.Xin[1];
61  x[1] = za - zb*arg.Xin[1];
62  x[3] = zb / arg.Xin[2];
63  x[2] = zb - x[3]*arg.Xin[2];
64  x[0] = 2*x0h + ((x[1] + x[2] + x[3] + parity) & 1);
65  // Y is the cb spatial index into the extended gauge field
66  xout = ((((x[3]+R[3])*arg.Xout[2] + (x[2]+R[2]))*arg.Xout[1] + (x[1]+R[1]))*arg.Xout[0]+(x[0]+R[0])) >> 1;
67  xin = X;
68  } else{
69  //extended to regular gauge
70  for (int d=0; d<4; d++) R[d] = (arg.Xin[d] - arg.Xout[d]) >> 1;
71  int za = X/(arg.Xout[0]/2);
72  int x0h = X - za*(arg.Xout[0]/2);
73  int zb = za/arg.Xout[1];
74  x[1] = za - zb*arg.Xout[1];
75  x[3] = zb / arg.Xout[2];
76  x[2] = zb - x[3]*arg.Xout[2];
77  x[0] = 2*x0h + ((x[1] + x[2] + x[3] + parity) & 1);
78  // Y is the cb spatial index into the extended gauge field
79  xin = ((((x[3]+R[3])*arg.Xin[2] + (x[2]+R[2]))*arg.Xin[1] + (x[1]+R[1]))*arg.Xin[0]+(x[0]+R[0])) >> 1;
80  xout = X;
81  }
82  for (int d=0; d<arg.geometry; d++){
83  RegTypeIn in[length];
84  RegTypeOut out[length];
85  arg.in.load(in, xin, d, parity);
86  for (int i=0; i<length; i++) out[i] = in[i];
87  arg.out.save(out, xout, d, parity);
88  }//dir
89  }
90 
91  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
93  for (int parity=0; parity<2; parity++) {
94  for(int X=0; X<arg.volume/2; X++){
95  copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(arg, X, parity);
96  }
97  }
98  }
99 
100  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool regularToextended>
102  for (int parity=0; parity<2; parity++) {
103  int X = blockIdx.x * blockDim.x + threadIdx.x;
104  if (X >= arg.volume/2) return;
105  copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(arg, X, parity);
106  }
107  }
108 
109  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
112  const GaugeField &meta; // use for metadata
114 
115  private:
116  unsigned int sharedBytesPerThread() const { return 0; }
117  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
118 
119  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
120  unsigned int minThreads() const { return arg.volume/2; }
121 
122  public:
124  : arg(arg), meta(meta), location(location) {
125  writeAuxString("out_stride=%d,in_stride=%d,geometry=%d",arg.out.stride,arg.in.stride,arg.geometry);
126  }
127  virtual ~CopyGaugeEx() { ; }
128 
129  void apply(const cudaStream_t &stream) {
130  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
131 
132  if (location == QUDA_CPU_FIELD_LOCATION) {
133  if(arg.regularToextended) copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, true>(arg);
134  else copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, false>(arg);
135  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
136  if(arg.regularToextended) copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, true>
137  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
138  else copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, false>
139  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
140  }
141  }
142 
143  TuneKey tuneKey() const {
144  return TuneKey(meta.VolString(), typeid(*this).name(), aux);
145  }
146 
147  long long flops() const { return 0; }
148  long long bytes() const {
149  int sites = 4*arg.volume/2;
150  return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
151  + arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
152  }
153  };
154 
155 
156  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
157  void copyGaugeEx(OutOrder outOrder, const InOrder inOrder, const int *E,
158  const int *X, const int *faceVolumeCB, const GaugeField &meta, QudaFieldLocation location) {
159 
161  arg(outOrder, inOrder, E, X, faceVolumeCB, meta.Ndim(), meta.Geometry());
163  copier.apply(0);
164  if (location == QUDA_CUDA_FIELD_LOCATION) checkCudaError();
165  }
166 
167  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
168  void copyGaugeEx(const InOrder &inOrder, const int *X, GaugeField &out,
169  QudaFieldLocation location, FloatOut *Out) {
170 
171  int faceVolumeCB[QUDA_MAX_DIM];
172  for (int i=0; i<4; i++) faceVolumeCB[i] = out.SurfaceCB(i) * out.Nface();
173 
174  if (out.isNative()) {
175  if (out.Reconstruct() == QUDA_RECONSTRUCT_NO) {
176  if (typeid(FloatOut)==typeid(short) && out.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
177  copyGaugeEx<short,FloatIn,length>
178  (FloatNOrder<short,length,2,19>(out, (short*)Out), inOrder, out.X(), X, faceVolumeCB, out, location);
179  } else {
181  copyGaugeEx<FloatOut,FloatIn,length>
182  (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
183  }
184  } else if (out.Reconstruct() == QUDA_RECONSTRUCT_12) {
186  copyGaugeEx<FloatOut,FloatIn,length>
187  (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
188  } else if (out.Reconstruct() == QUDA_RECONSTRUCT_8) {
190  copyGaugeEx<FloatOut,FloatIn,length>
191  (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
192 #ifdef GPU_STAGGERED_DIRAC
193  } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
195  copyGaugeEx<FloatOut,FloatIn,length>
196  (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
197  } else if (out.Reconstruct() == QUDA_RECONSTRUCT_9) {
199  copyGaugeEx<FloatOut,FloatIn,length>
200  (G(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
201 #endif
202  } else {
203  errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
204  }
205  } else if (out.Order() == QUDA_QDP_GAUGE_ORDER) {
206 
207 #ifdef BUILD_QDP_INTERFACE
208  copyGaugeEx<FloatOut,FloatIn,length>
209  (QDPOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
210 #else
211  errorQuda("QDP interface has not been built\n");
212 #endif
213 
214  } else if (out.Order() == QUDA_MILC_GAUGE_ORDER) {
215 
216 #ifdef BUILD_MILC_INTERFACE
217  copyGaugeEx<FloatOut,FloatIn,length>
218  (MILCOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
219 #else
220  errorQuda("MILC interface has not been built\n");
221 #endif
222 
223  } else if (out.Order() == QUDA_TIFR_GAUGE_ORDER) {
224 
225 #ifdef BUILD_TIFR_INTERFACE
226  copyGaugeEx<FloatOut,FloatIn,length>
227  (TIFROrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
228 #else
229  errorQuda("TIFR interface has not been built\n");
230 #endif
231 
232  } else {
233  errorQuda("Gauge field %d order not supported", out.Order());
234  }
235 
236  }
237 
238  template <typename FloatOut, typename FloatIn, int length>
240  FloatOut *Out, FloatIn *In) {
241 
242  if (in.isNative()) {
243  if (in.Reconstruct() == QUDA_RECONSTRUCT_NO) {
244  if (typeid(FloatIn)==typeid(short) && in.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
245  copyGaugeEx<FloatOut,short,length> (FloatNOrder<short,length,2,19>(in, (short*)In),
246  in.X(), out, location, Out);
247  } else {
249  copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
250  }
251  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
253  copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
254  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
256  copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
257 #ifdef GPU_STAGGERED_DIRAC
258  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
260  copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
261  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
263  copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.X(), out, location, Out);
264 #endif
265  } else {
266  errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
267  }
268  } else if (in.Order() == QUDA_QDP_GAUGE_ORDER) {
269 
270 #ifdef BUILD_QDP_INTERFACE
271  copyGaugeEx<FloatOut,FloatIn,length>(QDPOrder<FloatIn,length>(in, In),
272  in.X(), out, location, Out);
273 #else
274  errorQuda("QDP interface has not been built\n");
275 #endif
276 
277  } else if (in.Order() == QUDA_MILC_GAUGE_ORDER) {
278 
279 #ifdef BUILD_MILC_INTERFACE
280  copyGaugeEx<FloatOut,FloatIn,length>(MILCOrder<FloatIn,length>(in, In),
281  in.X(), out, location, Out);
282 #else
283  errorQuda("MILC interface has not been built\n");
284 #endif
285 
286  } else if (in.Order() == QUDA_TIFR_GAUGE_ORDER) {
287 
288 #ifdef BUILD_TIFR_INTERFACE
289  copyGaugeEx<FloatOut,FloatIn,length>(TIFROrder<FloatIn,length>(in, In),
290  in.X(), out, location, Out);
291 #else
292  errorQuda("TIFR interface has not been built\n");
293 #endif
294 
295  } else {
296  errorQuda("Gauge field %d order not supported", in.Order());
297  }
298 
299  }
300 
301  template <typename FloatOut, typename FloatIn>
303  FloatOut *Out, FloatIn *In) {
304 
305  if (in.Ncolor() != 3 && out.Ncolor() != 3) {
306  errorQuda("Unsupported number of colors; out.Nc=%d, in.Nc=%d", out.Ncolor(), in.Ncolor());
307  }
308 
309  if (out.Geometry() != in.Geometry()) {
310  errorQuda("Field geometries %d %d do not match", out.Geometry(), in.Geometry());
311  }
312 
313  if (in.LinkType() != QUDA_ASQTAD_MOM_LINKS && out.LinkType() != QUDA_ASQTAD_MOM_LINKS) {
314  // we are doing gauge field packing
315  copyGaugeEx<FloatOut,FloatIn,18>(out, in, location, Out, In);
316  } else {
317  errorQuda("Not supported");
318  }
319  }
320 
322  QudaFieldLocation location, void *Out, void *In) {
323 
324  for (int d=0; d<in.Ndim(); d++) {
325  if ( (out.X()[d] - in.X()[d]) % 2 != 0)
326  errorQuda("Cannot copy into an asymmetrically extended gauge field");
327  }
328 
329  if (out.Precision() == QUDA_DOUBLE_PRECISION) {
330  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
331  copyGaugeEx(out, in, location, (double*)Out, (double*)In);
332  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
333  copyGaugeEx(out, in, location, (double*)Out, (float*)In);
334  }
335  } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
336  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
337  copyGaugeEx(out, in, location, (float*)Out, (double*)In);
338  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
339  copyGaugeEx(out, in, location, (float*)Out, (float*)In);
340  }
341  }
342 
343  }
344 
345 } // namespace quda
QudaFieldLocation location
dim3 dim3 blockDim
CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *Xout, const int *Xin, const int *faceVolumeCB, int nDim, int geometry)
__device__ __host__ void copyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, int X, int parity)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
void apply(const cudaStream_t &stream)
#define errorQuda(...)
Definition: util_quda.h:90
long long bytes() const
cudaStream_t * stream
const char * VolString() const
static int R[4]
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
int E[4]
Definition: test_util.cpp:36
CopyGaugeExArg< OutOrder, InOrder > arg
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
unsigned int sharedBytesPerThread() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
TuneKey tuneKey() const
const GaugeField & meta
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
unsigned int sharedBytesPerBlock(const TuneParam &param) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
Accessor routine for CloverFields in native field order.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
unsigned int minThreads() const
long long flops() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
__global__ void copyGaugeExKernel(CopyGaugeExArg< OutOrder, InOrder > arg)
CopyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, const GaugeField &meta, QudaFieldLocation location)
QudaParity parity
Definition: covdev_test.cpp:53
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)