QUDA  0.9.0
copy_gauge_helper.cuh
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 
3 namespace quda {
4 
5  using namespace gauge;
6 
10  template <typename OutOrder, typename InOrder>
11  struct CopyGaugeArg {
12  OutOrder out;
13  const InOrder in;
14  int volume;
15  int faceVolumeCB[QUDA_MAX_DIM];
16  int nDim;
17  int geometry;
19  int in_offset;
20  CopyGaugeArg(const OutOrder &out, const InOrder &in, int volume,
21  const int *faceVolumeCB, int nDim, int geometry)
22  : out(out), in(in), volume(volume), nDim(nDim), geometry(geometry),
23  out_offset(0), in_offset(0) {
24  for (int d=0; d<nDim; d++) this->faceVolumeCB[d] = faceVolumeCB[d];
25  }
26  };
27 
31  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
33  typedef typename mapper<FloatIn>::type RegTypeIn;
34  typedef typename mapper<FloatOut>::type RegTypeOut;
35 
36  for (int parity=0; parity<2; parity++) {
37 
38  for (int d=0; d<arg.geometry; d++) {
39  for (int x=0; x<arg.volume/2; x++) {
40 #ifdef FINE_GRAINED_ACCESS
41  for (int i=0; i<Ncolor(length); i++)
42  for (int j=0; j<Ncolor(length); j++) {
43  arg.out(d, parity, x, i, j) = arg.in(d, parity, x, i, j);
44  }
45 #else
46  RegTypeIn in[length];
47  RegTypeOut out[length];
48  arg.in.load(in, x, d, parity);
49  for (int i=0; i<length; i++) out[i] = in[i];
50  arg.out.save(out, x, d, parity);
51 #endif
52  }
53  }
54 
55  }
56  }
57 
  /**
     Host-side sanity check: scans the body of the input gauge field
     and aborts via errorQuda on the first NaN encountered.  Checks
     both parities and all geometry dimensions.
   */
  template <typename Float, int length, typename Arg>
  void checkNan(Arg arg) {
    // RegType is only used by the coarse-grained (#else) branch below
    typedef typename mapper<Float>::type RegType;

    for (int parity=0; parity<2; parity++) {

      for (int d=0; d<arg.geometry; d++) {
        for (int x=0; x<arg.volume/2; x++) {
#ifdef FINE_GRAINED_ACCESS
          // fine-grained path: inspect each complex matrix element directly
          for (int i=0; i<Ncolor(length); i++)
            for (int j=0; j<Ncolor(length); j++) {
              complex<Float> u = arg.in(d, parity, x, i, j);
              // NOTE(review): the "i=%d" label actually reports the flattened
              // real/imag element index 2*(i*Ncolor+j)(+1), not the row index i
              if (isnan(u.real()))
                errorQuda("Nan detected at parity=%d, dir=%d, x=%d, i=%d", parity, d, x, 2*(i*Ncolor(length)+j));
              if (isnan(u.imag()))
                errorQuda("Nan detected at parity=%d, dir=%d, x=%d, i=%d", parity, d, x, 2*(i*Ncolor(length)+j+1));
            }
#else
          // coarse-grained path: load the link into registers and scan it
          RegType u[length];
          arg.in.load(u, x, d, parity);
          for (int i=0; i<length; i++)
            if (isnan(u[i]))
              errorQuda("Nan detected at parity=%d, dir=%d, x=%d, i=%d", parity, d, x, i);
#endif
        }
      }

    }
  }
90 
95  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
97  typedef typename mapper<FloatIn>::type RegTypeIn;
98  typedef typename mapper<FloatOut>::type RegTypeOut;
99 
100  for (int parity=0; parity<2; parity++) {
101  int x = blockIdx.x * blockDim.x + threadIdx.x;
102  int d = blockIdx.y * blockDim.y + threadIdx.y;
103  if (x >= arg.volume/2) return;
104  if (d >= arg.geometry) return;
105 
106 #ifdef FINE_GRAINED_ACCESS
107  for (int i=0; i<Ncolor(length); i++)
108  for (int j=0; j<Ncolor(length); j++)
109  arg.out(d, parity, x, i, j) = arg.in(d, parity, x, i, j);
110 #else
111  RegTypeIn in[length];
112  RegTypeOut out[length];
113  arg.in.load(in, x, d, parity);
114  for (int i=0; i<length; i++) out[i] = in[i];
115  arg.out.save(out, x, d, parity);
116 #endif
117  }
118  }
119 
123  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
125  typedef typename mapper<FloatIn>::type RegTypeIn;
126  typedef typename mapper<FloatOut>::type RegTypeOut;
127 
128  for (int parity=0; parity<2; parity++) {
129 
130  for (int d=0; d<arg.nDim; d++) {
131  for (int x=0; x<arg.faceVolumeCB[d]; x++) {
132 #ifdef FINE_GRAINED_ACCESS
133  for (int i=0; i<Ncolor(length); i++)
134  for (int j=0; j<Ncolor(length); j++)
135  arg.out.Ghost(d+arg.out_offset, parity, x, i, j) = arg.in.Ghost(d+arg.in_offset, parity, x, i, j);
136 #else
137  RegTypeIn in[length];
138  RegTypeOut out[length];
139  arg.in.loadGhost(in, x, d+arg.in_offset, parity); // assumes we are loading
140  for (int i=0; i<length; i++) out[i] = in[i];
141  arg.out.saveGhost(out, x, d+arg.out_offset, parity);
142 #endif
143  }
144  }
145 
146  }
147  }
148 
153  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
155  typedef typename mapper<FloatIn>::type RegTypeIn;
156  typedef typename mapper<FloatOut>::type RegTypeOut;
157 
158  int x = blockIdx.x * blockDim.x + threadIdx.x;
159 
160  for (int parity=0; parity<2; parity++) {
161  for (int d=0; d<arg.nDim; d++) {
162  if (x < arg.faceVolumeCB[d]) {
163 #ifdef FINE_GRAINED_ACCESS
164  for (int i=0; i<Ncolor(length); i++)
165  for (int j=0; j<Ncolor(length); j++)
166  arg.out.Ghost(d+arg.out_offset, parity, x, i, j) = arg.in.Ghost(d+arg.in_offset, parity, x, i, j);
167 #else
168  RegTypeIn in[length];
169  RegTypeOut out[length];
170  arg.in.loadGhost(in, x, d+arg.in_offset, parity); // assumes we are loading
171  for (int i=0; i<length; i++) out[i] = in[i];
172  arg.out.saveGhost(out, x, d+arg.out_offset, parity);
173 #endif
174  }
175  }
176 
177  }
178  }
179 
180  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder, bool isGhost>
183  int size;
184  const GaugeField &meta;
185 
186  private:
187  unsigned int sharedBytesPerThread() const { return 0; }
188  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
189 
190  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
191  unsigned int minThreads() const { return size; }
192 
193  public:
195  : TunableVectorY(arg.in.geometry), arg(arg), meta(out) {
196  int faceMax = 0;
197  for (int d=0; d<arg.nDim; d++) {
198  faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
199  }
200  size = isGhost ? faceMax : arg.volume/2;
201  if (size == 0 && isGhost) {
202  errorQuda("Cannot copy zero-sized ghost zone. Check nFace parameter is non-zero for both input and output gauge fields");
203  }
204 
205 #ifndef FINE_GRAINED_ACCESS
206  int n = writeAuxString("out_stride=%d,in_stride=%d,geometry=%d",arg.out.stride, arg.in.stride, arg.in.geometry);
207  if (out.Order() == QUDA_MILC_SITE_GAUGE_ORDER) {
208  n = snprintf(aux+n,TuneKey::aux_n,",in_siteoffset=%lu,out_sitesize=%lu",out.SiteOffset(),out.SiteSize());
209  if (n < 0 || n >=TuneKey::aux_n) errorQuda("Error writing auxiliary string");
210  }
211  if (in.Order() == QUDA_MILC_SITE_GAUGE_ORDER) {
212  n = snprintf(aux+n,TuneKey::aux_n,",in_siteoffset=%lu,in_sitesize=%lu",in.SiteOffset(),in.SiteSize());
213  if (n < 0 || n >=TuneKey::aux_n) errorQuda("Error writing auxiliary string");
214  }
215 #else
216  writeAuxString("fine-grained,geometry=%d", arg.in.geometry);
217 #endif
218  }
219 
220  virtual ~CopyGauge() { ; }
221 
222  void apply(const cudaStream_t &stream) {
223  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
224  if (!isGhost) {
225  copyGaugeKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
226  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
227  } else {
228  copyGhostKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
229  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
230  }
231  }
232 
233  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
234 
235  long long flops() const { return 0; }
236  long long bytes() const {
237  int sites = 4*arg.volume/2;
238  if (isGhost) {
239  sites = 0;
240  for (int d=0; d<4; d++) sites += arg.faceVolumeCB[d];
241  }
242 #ifndef FINE_GRAINED_ACCESS
243  return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
244  + arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
245 #else
246  return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
247 #endif
248  }
249  };
250 
251 
252  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
253  void copyGauge(OutOrder &&outOrder, const InOrder &inOrder, int volume, const int *faceVolumeCB,
254  int nDim, int geometry, const GaugeField &out, const GaugeField &in,
255  QudaFieldLocation location, int type) {
256 
257  CopyGaugeArg<OutOrder,InOrder> arg(outOrder, inOrder, volume, faceVolumeCB, nDim, geometry);
258 
259  if (location == QUDA_CPU_FIELD_LOCATION) {
260 #ifdef HOST_DEBUG
261  checkNan<FloatIn, length>(arg);
262 #endif
263 
264  if (type == 0 || type == 2) {
265  copyGauge<FloatOut, FloatIn, length>(arg);
266  }
267 #ifdef MULTI_GPU // only copy the ghost zone if doing multi-gpu
268  if (type == 0 || type == 1) {
269  if (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY) copyGhost<FloatOut, FloatIn, length>(arg);
270  //else warningQuda("Cannot copy for %d geometry gauge field", geometry);
271  }
272 
273  // special copy that only copies the second set of links in the ghost zone for bi-directional link fields
274  if (type == 3) {
275  if (geometry != QUDA_COARSE_GEOMETRY) errorQuda("Cannot request copy type %d on non-coarse link fields", geometry);
276  arg.out_offset = nDim;
277  copyGhost<FloatOut, FloatIn, length>(arg);
278  }
279 #endif
280  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
281  // first copy body
282  if (type == 0 || type == 2) {
284  gaugeCopier.apply(0);
285  }
286 #ifdef MULTI_GPU
287  if (type == 0 || type == 1) {
288  if (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY) {
289  // now copy ghost
291  ghostCopier.apply(0);
292  } else {
293  //warningQuda("Cannot copy for %d geometry gauge field", geometry);
294  }
295  }
296 
297  // special copy that only copies the second set of links in the ghost zone for bi-directional link fields
298  if (type == 3) {
299  if (geometry != QUDA_COARSE_GEOMETRY) errorQuda("Cannot request copy type %d on non-coarse link fields", geometry);
300  arg.out_offset = nDim;
302  ghostCopier.apply(0);
303  }
304 #endif
305  } else {
306  errorQuda("Undefined field location %d for copyGauge", location);
307  }
308 
309  }
310 
311 } // namespace quda
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
dim3 dim3 blockDim
const GaugeField & meta
void apply(const cudaStream_t &stream)
int snprintf(char *__str, size_t __size, const char *__format,...) __attribute__((__format__(__printf__
CopyGauge(CopyGaugeArg< OutOrder, InOrder > &arg, const GaugeField &out, const GaugeField &in)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
void copyGauge(CopyGaugeArg< OutOrder, InOrder > arg)
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
const char * VolString() const
long long flops() const
bool tuneGridDim() const
__global__ void copyGhostKernel(CopyGaugeArg< OutOrder, InOrder > arg)
QudaGaugeParam param
Definition: pack_test.cpp:17
unsigned int minThreads() const
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
unsigned int sharedBytesPerThread() const
Main header file for host and device accessors to GaugeFields.
__global__ void copyGaugeKernel(CopyGaugeArg< OutOrder, InOrder > arg)
long long bytes() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
CopyGaugeArg(const OutOrder &out, const InOrder &in, int volume, const int *faceVolumeCB, int nDim, int geometry)
static const int aux_n
Definition: tune_key.h:12
void checkNan(Arg arg)
unsigned int sharedBytesPerBlock(const TuneParam &param) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
void copyGhost(CopyGaugeArg< OutOrder, InOrder > arg)
CopyGaugeArg< OutOrder, InOrder > arg
TuneKey tuneKey() const