//
// copy_gauge_extended.cu — QUDA v0.7.0
// Copies a gauge field into (or out of) an extended, halo-padded gauge field.
// NOTE(review): this file was recovered from a Doxygen HTML listing; lines that
// were hyperlinks in the HTML were dropped and have been reconstructed below.
//
1 #include <gauge_field_order.h>
2 
3 namespace quda {
4 
8  template <typename OutOrder, typename InOrder>
9  struct CopyGaugeExArg {
10  OutOrder out;
11  const InOrder in;
12  int E[QUDA_MAX_DIM]; // geometry of extended gauge field
13  int X[QUDA_MAX_DIM]; // geometry of the normal gauge field
14  int volume;
15  int volumeEx;
16  int nDim;
17  int geometry;
19  CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *E, const int *X,
20  const int *faceVolumeCB, int nDim, int geometry)
21  : out(out), in(in), volume(2*in.volumeCB), volumeEx(2*out.volumeCB),
22  nDim(nDim), geometry(geometry) {
23  for (int d=0; d<nDim; d++) {
24  this->E[d] = E[d];
25  this->X[d] = X[d];
26  this->faceVolumeCB[d] = faceVolumeCB[d];
27  }
28  }
29  };
30 
  /**
     Copy all links of a single site from the source field into the interior
     of the extended field.  Callable on host or device.

     @param arg    Copy arguments (accessors and geometries)
     @param X      Checkerboard site index into the SOURCE (non-extended) field
     @param parity Site parity (0 or 1)
   */
  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
  __device__ __host__ void copyGaugeEx(CopyGaugeExArg<OutOrder,InOrder> &arg, int X, int parity) {
    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;

    int x[4];
    int R[4];
    // border (halo) width per dimension: half the extension E[d]-X[d]
    for (int d=0; d<4; d++) R[d] = (arg.E[d] - arg.X[d]) >> 1;

    // decode the checkerboard index X into full lattice coordinates x[0..3];
    // x[0] is reconstructed from the half-index plus the parity of the site
    int za = X/(arg.X[0]/2);
    int x0h = X - za*(arg.X[0]/2);
    int zb = za/arg.X[1];
    x[1] = za - zb*arg.X[1];
    x[3] = zb / arg.X[2];
    x[2] = zb - x[3]*arg.X[2];
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + parity) & 1);

    // Y is the cb spatial index into the extended gauge field
    int Y = ((((x[3]+R[3])*arg.E[2] + (x[2]+R[2]))*arg.E[1] + (x[1]+R[1]))*arg.E[0]+(x[0]+R[0])) >> 1;

    // copy each link direction through registers, converting precision
    for(int d=0; d<arg.geometry; d++){
      RegTypeIn in[length];
      RegTypeOut out[length];
      arg.in.load(in, X, d, parity);
      for (int i=0; i<length; i++) out[i] = in[i];
      arg.out.save(out, Y, d, parity);
    }//dir
  }
62 
63  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
65  for (int parity=0; parity<2; parity++) {
66  for(int X=0; X<arg.volume/2; X++){
67  copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder>(arg, X, parity);
68  }
69  }
70  }
71 
72  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
74  for (int parity=0; parity<2; parity++) {
75  int X = blockIdx.x * blockDim.x + threadIdx.x;
76  if (X >= arg.volume/2) return;
77  copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder>(arg, X, parity);
78  }
79  }
80 
81  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
82  class CopyGaugeEx : Tunable {
84  const GaugeField &meta; // use for metadata
85  QudaFieldLocation location;
86 
87  private:
88  unsigned int sharedBytesPerThread() const { return 0; }
89  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
90 
91  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
92  unsigned int minThreads() const { return arg.volume/2; }
93 
94  public:
96  : arg(arg), meta(meta), location(location) {
97  writeAuxString("out_stride=%d,in_stride=%d,geometery=%d",arg.out.stride,arg.in.stride,arg.geometry);
98  }
99  virtual ~CopyGaugeEx() { ; }
100 
101  void apply(const cudaStream_t &stream) {
102  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
103 
104  if (location == QUDA_CPU_FIELD_LOCATION) {
105  copyGaugeEx<FloatOut, FloatIn, length>(arg);
106  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
107 #if (__COMPUTE_CAPABILITY__ >= 200)
108  copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
109  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
110 #else
111  errorQuda("Extended gauge copy not supported on pre-Fermi architecture");
112 #endif
113  }
114  }
115 
116  TuneKey tuneKey() const {
117  return TuneKey(meta.VolString(), typeid(*this).name(), aux);
118  }
119 
120  std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim.
121  std::stringstream ps;
122  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
123  ps << "shared=" << param.shared_bytes;
124  return ps.str();
125  }
126 
127  long long flops() const { return 0; }
128  long long bytes() const {
129  int sites = 4*arg.volume/2;
130 #if (__COMPUTE_CAPABILITY__ >= 200)
131  return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
132  + arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
133 #else
134  return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
135 #endif
136  }
137  };
138 
139 
140  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
141  void copyGaugeEx(OutOrder outOrder, const InOrder inOrder, const int *E,
142  const int *X, const int *faceVolumeCB, const GaugeField &meta, QudaFieldLocation location) {
143 
145  arg(outOrder, inOrder, E, X, faceVolumeCB, meta.Ndim(), meta.Geometry());
147  copier.apply(0);
148  if (location == QUDA_CUDA_FIELD_LOCATION) checkCudaError();
149  }
150 
  /**
     Resolve the OUTPUT field's data order and reconstruction type at run
     time and instantiate the matching accessor template, forwarding to the
     fully-typed copyGaugeEx above.

     @param inOrder      Already-resolved source accessor
     @param X            Dimensions of the source field
     @param out          Destination (extended) gauge field
     @param location     Where the copy runs (CPU or CUDA)
     @param Out          Optional raw destination pointer override
   */
  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
  void copyGaugeEx(const InOrder &inOrder, const int *X, GaugeField &out,
                   QudaFieldLocation location, FloatOut *Out) {
    int faceVolumeCB[QUDA_MAX_DIM];
    for (int i=0; i<4; i++) faceVolumeCB[i] = out.SurfaceCB(i) * out.Nface();

    if (out.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      if (out.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        // half-precision asqtad fat links carry an extra max element, hence length 19
        if (typeid(FloatOut)==typeid(short) && out.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
          copyGaugeEx<FloatOut,FloatIn,length>
            (FloatNOrder<FloatOut,length,2,19>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
        } else {
          copyGaugeEx<FloatOut,FloatIn,length>
            (FloatNOrder<FloatOut,length,2,18>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
        }
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_12) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,2,12>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_8) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,2,8>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#ifdef GPU_STAGGERED_DIRAC
      // reconstruct 13/9 (staggered phases) only built with the staggered Dirac code
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,2,13>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_9) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,2,9>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#endif
      } else {
        errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
      }
    } else if (out.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
      if (out.Reconstruct() == QUDA_RECONSTRUCT_12) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,4,12>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_8) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,4,8>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#ifdef GPU_STAGGERED_DIRAC
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,4,13>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
      } else if (out.Reconstruct() == QUDA_RECONSTRUCT_9) {
        copyGaugeEx<FloatOut,FloatIn,length>
          (FloatNOrder<FloatOut,length,4,9>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#endif
      } else {
        errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
      }

    } else if (out.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (QDPOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("QDP interface has not been built\n");
#endif

    } else if (out.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (MILCOrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("MILC interface has not been built\n");
#endif

    } else if (out.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
      copyGaugeEx<FloatOut,FloatIn,length>
        (TIFROrder<FloatOut,length>(out, Out), inOrder, out.X(), X, faceVolumeCB, out, location);
#else
      errorQuda("TIFR interface has not been built\n");
#endif

    } else {
      errorQuda("Gauge field %d order not supported", out.Order());
    }

  }
234 
235  template <typename FloatOut, typename FloatIn, int length>
237  FloatOut *Out, FloatIn *In) {
238 
239  if (in.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
240  if (in.Reconstruct() == QUDA_RECONSTRUCT_NO) {
241  if (typeid(FloatIn)==typeid(short) && in.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
242  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,19>(in, In),
243  in.X(), out, location, Out);
244  } else {
245  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,18>(in, In),
246  in.X(), out, location, Out);
247  }
248  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
249  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,12>(in, In),
250  in.X(), out, location, Out);
251  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
252  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,8>(in, In),
253  in.X(), out, location, Out);
254 #ifdef GPU_STAGGERED_DIRAC
255  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
256  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,13>(in, In),
257  in.X(), out, location, Out);
258  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
259  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,2,9>(in, In),
260  in.X(), out, location, Out);
261 #endif
262  } else {
263  errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
264  }
265  } else if (in.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
266  if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
267  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,4,12>(in, In),
268  in.X(), out, location, Out);
269  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
270  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,4,8>(in, In),
271  in.X(), out, location, Out);
272 #ifdef GPU_STAGGERED_DIRAC
273  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
274  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,4,13>(in, In),
275  in.X(), out, location, Out);
276  } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
277  copyGaugeEx<FloatOut,FloatIn,length> (FloatNOrder<FloatIn,length,4,9>(in, In),
278  in.X(), out, location, Out);
279 #endif
280  } else {
281  errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
282  }
283 
284  } else if (in.Order() == QUDA_QDP_GAUGE_ORDER) {
285 
286 #ifdef BUILD_QDP_INTERFACE
287  copyGaugeEx<FloatOut,FloatIn,length>(QDPOrder<FloatIn,length>(in, In),
288  in.X(), out, location, Out);
289 #else
290  errorQuda("QDP interface has not been built\n");
291 #endif
292 
293  } else if (in.Order() == QUDA_MILC_GAUGE_ORDER) {
294 
295 #ifdef BUILD_MILC_INTERFACE
296  copyGaugeEx<FloatOut,FloatIn,length>(MILCOrder<FloatIn,length>(in, In),
297  in.X(), out, location, Out);
298 #else
299  errorQuda("MILC interface has not been built\n");
300 #endif
301 
302  } else if (in.Order() == QUDA_TIFR_GAUGE_ORDER) {
303 
304 #ifdef BUILD_TIFR_INTERFACE
305  copyGaugeEx<FloatOut,FloatIn,length>(TIFROrder<FloatIn,length>(in, In),
306  in.X(), out, location, Out);
307 #else
308  errorQuda("TIFR interface has not been built\n");
309 #endif
310 
311  } else {
312  errorQuda("Gauge field %d order not supported", in.Order());
313  }
314 
315  }
316 
317  template <typename FloatOut, typename FloatIn>
319  FloatOut *Out, FloatIn *In) {
320 
321  if (in.Ncolor() != 3 && out.Ncolor() != 3) {
322  errorQuda("Unsupported number of colors; out.Nc=%d, in.Nc=%d", out.Ncolor(), in.Ncolor());
323  }
324 
325  if (out.Geometry() != in.Geometry()) {
326  errorQuda("Field geometries %d %d do not match", out.Geometry(), in.Geometry());
327  }
328 
330  // we are doing gauge field packing
331  copyGaugeEx<FloatOut,FloatIn,18>(out, in, location, Out, In);
332  } else {
333  errorQuda("Not supported");
334  }
335  }
336 
338  QudaFieldLocation location, void *Out, void *In) {
339 
340  for (int d=0; d<in.Ndim(); d++) {
341  if ( (out.X()[d] - in.X()[d]) % 2 != 0)
342  errorQuda("Cannot copy into an asymmetrically extended gauge field");
343  }
344 
345  if (out.Precision() == QUDA_DOUBLE_PRECISION) {
346  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
347  copyGaugeEx(out, in, location, (double*)Out, (double*)In);
348  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
349  copyGaugeEx(out, in, location, (double*)Out, (float*)In);
350  }
351  } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
352  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
353  copyGaugeEx(out, in, location, (float*)Out, (double*)In);
354  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
355  copyGaugeEx(out, in, location, (float*)Out, (float*)In);
356  }
357  }
358 
359  }
360 
361 } // namespace quda
/* NOTE(review): the text below is Doxygen cross-reference residue from the
   HTML listing this file was recovered from (member signatures and "Definition"
   locations).  It is not part of the source; it is preserved here, commented
   out, because it documents the signatures of lines the extraction dropped.

int faceVolumeCB[QUDA_MAX_DIM]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
void apply(const cudaStream_t &stream)
int Ncolor() const
Definition: gauge_field.h:167
#define errorQuda(...)
Definition: util_quda.h:73
const int * X() const
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
int Nface() const
Definition: gauge_field.h:193
int length[]
const int * SurfaceCB() const
QudaGaugeParam param
Definition: pack_test.cpp:17
int E[4]
QudaPrecision Precision() const
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138
const QudaFieldLocation location
Definition: pack_test.cpp:46
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
const char * VolString() const
TuneKey tuneKey() const
long long bytes() const
int x[4]
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
QudaLinkType LinkType() const
Definition: gauge_field.h:174
__device__ __host__ void copyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, int X, int parity)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
int Ndim() const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
#define checkCudaError()
Definition: util_quda.h:110
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:177
QudaTune getTuning()
Definition: util_quda.cpp:32
CopyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, const GaugeField &meta, QudaFieldLocation location)
long long flops() const
std::string paramString(const TuneParam &param) const
const QudaParity parity
Definition: dslash_test.cpp:29
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *E, const int *X, const int *faceVolumeCB, int nDim, int geometry)
__global__ void copyGaugeExKernel(CopyGaugeExArg< OutOrder, InOrder > arg)
*/