QUDA 1.0.0
gauge_fix_ovr.cu
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <unitarization_links.h>
8 #include <comm_quda.h>
9 #include <gauge_fix_ovr_extra.h>
11 #include <cub_helper.cuh>
12 #include <index_helper.cuh>
13 
14 namespace quda {
15 
16 #ifdef GPU_GAUGE_ALG
17 
18  static int numParams = 18;
19 
20 #define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \
21  if (tp.aux.x == 0) { \
22  switch (tp.block.x) { \
23  case 256: kernel<0, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
24  case 512: kernel<0, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
25  case 768: kernel<0, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
26  case 1024: kernel<0, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
27  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
28  } \
29  } else if (tp.aux.x == 1) { \
30  switch (tp.block.x) { \
31  case 256: kernel<1, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
32  case 512: kernel<1, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
33  case 768: kernel<1, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
34  case 1024: kernel<1, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
35  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
36  } \
37  } else if (tp.aux.x == 2) { \
38  switch (tp.block.x) { \
39  case 256: kernel<2, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
40  case 512: kernel<2, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
41  case 768: kernel<2, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
42  case 1024: kernel<2, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
43  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
44  } \
45  } else if (tp.aux.x == 3) { \
46  switch (tp.block.x) { \
47  case 128: kernel<3, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
48  case 256: kernel<3, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
49  case 384: kernel<3, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
50  case 512: kernel<3, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
51  case 640: kernel<3, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
52  case 768: kernel<3, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
53  case 896: kernel<3, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
54  case 1024: kernel<3, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
55  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
56  } \
57  } else if (tp.aux.x == 4) { \
58  switch (tp.block.x) { \
59  case 128: kernel<4, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
60  case 256: kernel<4, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
61  case 384: kernel<4, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
62  case 512: kernel<4, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
63  case 640: kernel<4, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
64  case 768: kernel<4, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
65  case 896: kernel<4, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
66  case 1024: kernel<4, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
67  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
68  } \
69  } else if (tp.aux.x == 5) { \
70  switch (tp.block.x) { \
71  case 128: kernel<5, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
72  case 256: kernel<5, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
73  case 384: kernel<5, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
74  case 512: kernel<5, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
75  case 640: kernel<5, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
76  case 768: kernel<5, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
77  case 896: kernel<5, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
78  case 1024: kernel<5, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
79  default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
80  } \
81  } else { \
82  errorQuda("Not implemented for %d", tp.aux.x); \
83  }
84 
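// Illustrative use of the dispatch macro above (a sketch that mirrors the apply()
// methods of the tuning classes later in this file): the autotuner stores the kernel
// variant in tp.aux.x (0-5, selecting 8 or 4 threads per lattice site and the
// atomicAdd / no-atomicAdd reductions) and the launch geometry in tp.block.x / tp.grid.x.
//
//   TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
//   LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, gauge_dir);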
88  template <typename Gauge>
89  struct GaugeFixQualityArg : public ReduceArg<double2> {
90  int threads; // number of active threads required
91  int X[4]; // grid dimensions
92 #ifdef MULTI_GPU
93  int border[4];
94 #endif
95  Gauge dataOr;
96  GaugeFixQualityArg(const Gauge &dataOr, const cudaGaugeField &data)
97  : ReduceArg<double2>(), dataOr(dataOr) {
98 
99  for ( int dir = 0; dir < 4; ++dir ) {
100  X[dir] = data.X()[dir] - data.R()[dir] * 2;
101  #ifdef MULTI_GPU
102  border[dir] = data.R()[dir];
103  #endif
104  }
105  threads = X[0]*X[1]*X[2]*X[3]/2;
106  }
107  double getAction(){ return result_h[0].x; }
108  double getTheta(){ return result_h[0].y; }
109  };
110 
111 
115  template<int blockSize, typename Float, typename Gauge, int gauge_dir>
116  __global__ void computeFix_quality(GaugeFixQualityArg<Gauge> argQ){
117  typedef complex<Float> Cmplx;
118 
119  int idx_cb = threadIdx.x + blockIdx.x * blockDim.x;
120  int parity = threadIdx.y;
121 
122  double2 data = make_double2(0.0,0.0);
123  while (idx_cb < argQ.threads) {
124  int X[4];
125  #pragma unroll
126  for ( int dr = 0; dr < 4; ++dr ) X[dr] = argQ.X[dr];
127 
128  int x[4];
129  getCoords(x, idx_cb, X, parity);
130 #ifdef MULTI_GPU
131  #pragma unroll
132  for ( int dr = 0; dr < 4; ++dr ) {
133  x[dr] += argQ.border[dr];
134  X[dr] += 2 * argQ.border[dr];
135  }
136 #endif
137  Matrix<Cmplx,3> delta;
138  setZero(&delta);
139  //load upward links
140  for ( int mu = 0; mu < gauge_dir; mu++ ) {
141  Matrix<Cmplx,3> U = argQ.dataOr(mu, linkIndex(x, X), parity);
142  delta -= U;
143  }
144  //18*gauge_dir
145  data.x += -delta(0, 0).x - delta(1, 1).x - delta(2, 2).x;
146  //2
147  //load downward links
148  for ( int mu = 0; mu < gauge_dir; mu++ ) {
149  Matrix<Cmplx,3> U = argQ.dataOr(mu, linkIndexM1(x,X,mu), 1 - parity);
150  delta += U;
151  }
152  //18*gauge_dir
153  delta -= conj(delta);
154  //18
155  SubTraceUnit(delta);
156  //12
157  data.y += getRealTraceUVdagger(delta, delta);
158  //35
159  //T=36*gauge_dir+65
160 
161  idx_cb += blockDim.x * gridDim.x;
162  }
163  reduce2d<blockSize,2>(argQ, data);
164  }
165 
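// For reference, a sketch of the quantities accumulated above (the normalizations by
// 3 * gauge_dir * volume and 3 * volume are applied in GaugeFixQuality::apply() below):
//   data.x ~ sum_{x, mu < gauge_dir} Re Tr U_mu(x)                (gauge-fixing action)
//   data.y ~ sum_x Tr[ Delta(x) Delta(x)^dagger ]                 (theta)
// where Delta(x) is the traceless part of
//   sum_{mu < gauge_dir} [ U_mu(x - mu) - U_mu(x) ] - h.c.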
166 
170  template<typename Float, typename Gauge, int gauge_dir>
171  class GaugeFixQuality : TunableLocalParity {
172  GaugeFixQualityArg<Gauge> argQ;
173  mutable char aux_string[128]; // used as a label in the autotuner
174 
175  private:
176  bool tuneGridDim() const { return true; }
177 
178  public:
179  GaugeFixQuality(GaugeFixQualityArg<Gauge> &argQ) : argQ(argQ) { }
180  ~GaugeFixQuality () { }
181 
182  void apply(const cudaStream_t &stream){
183  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
184  argQ.result_h[0] = make_double2(0.0,0.0);
185  LAUNCH_KERNEL_LOCAL_PARITY(computeFix_quality, tp, stream, argQ, Float, Gauge, gauge_dir);
187  if ( comm_size() != 1 ) comm_allreduce_array((double*)argQ.result_h, 2);
188  argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads * comm_size());
189  argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads * comm_size());
190  }
191 
192  TuneKey tuneKey() const {
193  std::stringstream vol;
194  vol << argQ.X[0] << "x";
195  vol << argQ.X[1] << "x";
196  vol << argQ.X[2] << "x";
197  vol << argQ.X[3];
198  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",argQ.threads, sizeof(Float),gauge_dir);
199  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
200 
201  }
202 
203  long long flops() const {
204  return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
205  } // Only correct if there is no link reconstruction; the cub reduction is not accounted for either.
206  //long long bytes() const { return (1)*2*gauge_dir*argQ.dataOr.Bytes(); } // not accounting for the reduction; argQ.dataOr.Bytes() returns 0
207  long long bytes() const {
208  return 2LL * gauge_dir * 2 * argQ.threads * numParams * sizeof(Float);
209  } // not accounting for the reduction
210 
211  };
212 
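// Host-side usage sketch (this mirrors how gaugefixingOVR() below drives the
// measurement; the variable names follow that function):
//   GaugeFixQualityArg<Gauge> argQ(dataOr, data);
//   GaugeFixQuality<Float, Gauge, gauge_dir> GaugeFixQuality(argQ);
//   GaugeFixQuality.apply(0);          // tuned reduction kernel plus global sum
//   double action = argQ.getAction();  // normalized Re Tr of the links
//   double theta  = argQ.getTheta();   // gauge-fixing violation used as stopping criterion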
213 
217  template <typename Float, typename Gauge>
218  struct GaugeFixArg {
219  int threads; // number of active threads required
220  int X[4]; // grid dimensions
221 #ifdef MULTI_GPU
222  int border[4];
223 #endif
224  Gauge dataOr;
225  cudaGaugeField &data;
226  const Float relax_boost;
227 
228  GaugeFixArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
229  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
230 
231  for ( int dir = 0; dir < 4; ++dir ) {
232  X[dir] = data.X()[dir] - data.R()[dir] * 2;
233  #ifdef MULTI_GPU
234  border[dir] = data.R()[dir];
235  #endif
236  }
237  threads = X[0] * X[1] * X[2] * X[3] >> 1;
238  }
239  };
240 
241 
242 
243 
247  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
248  __global__ void computeFix(GaugeFixArg<Float, Gauge> arg, int parity){
249  typedef complex<Float> Cmplx;
250 
251  int tid = (threadIdx.x + blockSize) % blockSize;
252  int idx = blockIdx.x * blockSize + tid;
253 
254  if ( idx >= arg.threads ) return;
255 
256  // 8 threads per lattice site
257  if ( ImplementationType < 3 ) {
258  int X[4];
259  #pragma unroll
260  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
261 
262  int x[4];
263  getCoords(x, idx, X, parity);
264  #ifdef MULTI_GPU
265  #pragma unroll
266  for ( int dr = 0; dr < 4; ++dr ) {
267  x[dr] += arg.border[dr];
268  X[dr] += 2 * arg.border[dr];
269  }
270  #endif
271  int mu = (threadIdx.x / blockSize);
272  int oddbit = parity;
273  if ( threadIdx.x >= blockSize * 4 ) {
274  mu -= 4;
275  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
276  oddbit = 1 - parity;
277  }
278  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
279  Matrix<Cmplx,3> link = arg.dataOr(mu, idx, oddbit);
280  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
281  // This implementation needs 8x more shared memory than the implementation using atomicAdd.
282  if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
283  // 8 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
284  if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
285  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
286  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
287  if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
288  arg.dataOr(mu, idx, oddbit) = link;
289  }
290  // 4 threads per lattice site
291  else{
292  int X[4];
293  #pragma unroll
294  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
295 
296  int x[4];
297  getCoords(x, idx, X, parity);
298  #ifdef MULTI_GPU
299  #pragma unroll
300  for ( int dr = 0; dr < 4; ++dr ) {
301  x[dr] += arg.border[dr];
302  X[dr] += 2 * arg.border[dr];
303  }
304  #endif
305  int mu = (threadIdx.x / blockSize);
306  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
307  //load upward link
308  Matrix<Cmplx,3> link = arg.dataOr(mu, idx, parity);
309 
310  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
311  int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
312  //load downward link
313  Matrix<Cmplx,3> link1 = arg.dataOr(mu, idx1, 1 - parity);
314 
315  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
316  // This implementation needs 4x more shared memory than the implementation using atomicAdd.
317  if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
318  // 4 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
319  if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
320  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
321  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
322  if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
323 
324  arg.dataOr(mu, idx, parity) = link;
325  arg.dataOr(mu, idx1, 1 - parity) = link1;
326 
327  }
328  }
329 
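// Summary of the ImplementationType values dispatched above (selected at tuning time
// through tp.aux.x in LAUNCH_KERNEL_GAUGEFIX):
//   0: 8 threads/site, shared-memory reduction without atomicAdd (8x the shared memory of 1)
//   1: 8 threads/site, shared-memory reduction with atomicAdd
//   2: 8 threads/site, without atomicAdd, same shared memory as 1, extra synchronization
//   3: 4 threads/site, shared-memory reduction without atomicAdd (4x the shared memory of 4)
//   4: 4 threads/site, shared-memory reduction with atomicAdd
//   5: 4 threads/site, without atomicAdd, same shared memory as 4, extra synchronization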
330 
334  template<typename Float, typename Gauge, int gauge_dir>
335  class GaugeFix : Tunable {
336  GaugeFixArg<Float, Gauge> arg;
337  int parity;
338  mutable char aux_string[128]; // used as a label in the autotuner
339 protected:
340  dim3 createGrid(const TuneParam &param) const
341  {
342  unsigned int blockx = param.block.x / 8;
343  if (param.aux.x > 2) blockx = param.block.x / 4;
344  unsigned int gx = (arg.threads + blockx - 1) / blockx;
345  return dim3(gx, 1, 1);
346  }
347 
348  bool advanceBlockDim (TuneParam &param) const {
349  // Use param.aux.x to tune and to save the state of the best kernel option:
350  // whether to use atomicAdd operations and whether to use 4 or 8 threads per lattice site.
351  const unsigned int min_threads0 = 32 * 8;
352  const unsigned int min_threads1 = 32 * 4;
353  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
354  const unsigned int atmadd = 0;
355  unsigned int min_threads = min_threads0;
356  param.aux.x += atmadd; // USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
357  if (param.aux.x > 2) min_threads = 32 * 4;
358  param.block.x += min_threads;
359  param.block.y = 1;
360  param.grid = createGrid(param);
361 
362  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
363  param.shared_bytes = sharedBytesPerBlock(param);
364  return true;
365  } else if (param.aux.x == 0) {
366  param.block.x = min_threads0;
367  param.block.y = 1;
368  param.aux.x = 1; // USE FOR ATOMIC ADD
369  param.grid = createGrid(param);
370  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
371  return true;
372  } else if (param.aux.x == 1) {
373  param.block.x = min_threads0;
374  param.block.y = 1;
375  param.aux.x = 2; // USE FOR NO ATOMIC ADD and LESS SHARED MEM
376  param.grid = createGrid(param);
377  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
378  return true;
379  } else if (param.aux.x == 2) {
380  param.block.x = min_threads1;
381  param.block.y = 1;
382  param.aux.x = 3; // USE FOR NO ATOMIC ADD
383  param.grid = createGrid(param);
384  param.shared_bytes = param.block.x * 4 * sizeof(Float);
385  return true;
386  } else if (param.aux.x == 3) {
387  param.block.x = min_threads1;
388  param.block.y = 1;
389  param.aux.x = 4;
390  param.grid = createGrid(param);
391  param.shared_bytes = param.block.x * sizeof(Float);
392  return true;
393  } else if (param.aux.x == 4) {
394  param.block.x = min_threads1;
395  param.block.y = 1;
396  param.aux.x = 5;
397  param.grid = createGrid(param);
398  param.shared_bytes = param.block.x * sizeof(Float);
399  return true;
400  } else {
401  return false;
402  }
403  }
404 
405 private:
406  unsigned int sharedBytesPerThread() const {
407  return 0;
408  }
409  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
410  switch (param.aux.x) {
411  case 0: return param.block.x * 4 * sizeof(Float);
412  case 1: return param.block.x * 4 * sizeof(Float) / 8;
413  case 2: return param.block.x * 4 * sizeof(Float) / 8;
414  case 3: return param.block.x * 4 * sizeof(Float);
415  default: return param.block.x * sizeof(Float);
416  }
417  }
418 
419  bool tuneSharedBytes() const {
420  return false;
421  } // Don't tune shared memory
422  bool tuneGridDim() const {
423  return false;
424  } // Don't tune the grid dimensions.
425  unsigned int minThreads() const {
426  return arg.threads;
427  }
428 
429 public:
430  GaugeFix(GaugeFixArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
431  ~GaugeFix () { }
432 
433  void setParity(const int par){
434  parity = par;
435  }
436 
437  void apply(const cudaStream_t &stream){
438  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
439  LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, gauge_dir);
440  }
441 
442  virtual void initTuneParam(TuneParam &param) const
443  {
444  param.block = dim3(256, 1, 1);
445  param.aux.x = 0;
446  param.grid = createGrid(param);
447  param.shared_bytes = sharedBytesPerBlock(param);
448  }
449 
450  virtual void defaultTuneParam(TuneParam &param) const {
451  initTuneParam(param);
452  }
453 
454  TuneKey tuneKey() const {
455  std::stringstream vol;
456  vol << arg.X[0] << "x";
457  vol << arg.X[1] << "x";
458  vol << arg.X[2] << "x";
459  vol << arg.X[3];
460  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
461  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
462  }
463 
464  std::string paramString(const TuneParam &param) const {
465  std::stringstream ps(Tunable::paramString(param));
466  ps << ", atomicadd=" << param.aux.x;
467  return ps.str();
468  }
469 
470  // needed by the autotuner: back up the gauge field before tuning and restore it afterwards
471  void preTune() {
472  arg.data.backup();
473  }
474  void postTune() {
475  arg.data.restore();
476  }
477  long long flops() const {
478  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
479  } // Only correct if there is no link reconstruction
480  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
481  long long bytes() const {
482  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
483  } // not accounting for the reduction
484  };
485 
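// Illustrative driver for a single relaxation sweep (a sketch of the single-GPU path
// in gaugefixingOVR() below):
//   GaugeFixArg<Float, Gauge> arg(dataOr, data, relax_boost);
//   GaugeFix<Float, Gauge, gauge_dir> gaugeFix(arg);
//   for (int p = 0; p < 2; p++) {  // relax the even sites, then the odd sites
//     gaugeFix.setParity(p);
//     gaugeFix.apply(0);
//   }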
486 
487 
488 
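// Multi-GPU strategy used below: each half-sweep first updates the border sites
// (GaugeFixBorderPoints), then packs the updated faces and ghost zones
// (Kernel_UnPackTop / Kernel_UnPackGhost) and exchanges them with the neighboring
// ranks while the interior sites (GaugeFixInteriorPoints) are updated on a separate
// stream, overlapping communication with computation.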
489 #ifdef MULTI_GPU
490  template <typename Float, typename Gauge>
491  struct GaugeFixInteriorPointsArg {
492  int threads; // number of active threads required
493  int X[4]; // grid dimensions
494 #ifdef MULTI_GPU
495  int border[4];
496 #endif
497  Gauge dataOr;
498  cudaGaugeField &data;
499  const Float relax_boost;
500  GaugeFixInteriorPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
501  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
502 
503 #ifdef MULTI_GPU
504  for ( int dir = 0; dir < 4; ++dir ) {
505  if ( comm_dim_partitioned(dir)) border[dir] = data.R()[dir] + 1; //skip BORDER_RADIUS + face border point
506  else border[dir] = 0;
507  }
508  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir] - border[dir] * 2;
509 #else
510  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
511 #endif
512  threads = X[0] * X[1] * X[2] * X[3] >> 1;
513  }
514  };
515 
516 
517 
518 
522  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
523  __global__ void computeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> arg, int parity){
524  int tid = (threadIdx.x + blockSize) % blockSize;
525  int idx = blockIdx.x * blockSize + tid;
526  if ( idx >= arg.threads ) return;
527  typedef complex<Float> Complex;
528  int X[4];
529 #pragma unroll
530  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
531  int x[4];
532 #ifdef MULTI_GPU
533  int za = (idx / (X[0] / 2));
534  int zb = (za / X[1]);
535  x[1] = za - zb * X[1];
536  x[3] = (zb / X[2]);
537  x[2] = zb - x[3] * X[2];
538  int p = 0; for ( int dr = 0; dr < 4; ++dr ) p += arg.border[dr];
539  p = p & 1;
540  int x1odd = (x[1] + x[2] + x[3] + parity + p) & 1;
541  //int x1odd = (x[1] + x[2] + x[3] + parity) & 1;
542  x[0] = (2 * idx + x1odd) - za * X[0];
543  for ( int dr = 0; dr < 4; ++dr ) {
544  x[dr] += arg.border[dr];
545  X[dr] += 2 * arg.border[dr];
546  }
547 #else
548  getCoords(x, idx, X, parity);
549 #endif
550  int mu = (threadIdx.x / blockSize);
551 
552  // 8 threads per lattice site
553  if ( ImplementationType < 3 ) {
554  if ( threadIdx.x >= blockSize * 4 ) {
555  mu -= 4;
556  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
557  parity = 1 - parity;
558  }
559  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
560  Matrix<Complex,3> link = arg.dataOr(mu, idx, parity);
561  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
562  // This implementation needs 8x more shared memory than the implementation using atomicAdd.
563  if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
564  // 8 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
565  if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
566  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
567  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
568  if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
569  arg.dataOr(mu, idx, parity) = link;
570  }
571  // 4 threads per lattice site
572  else{
573  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
574  Matrix<Complex,3> link = arg.dataOr(mu, idx, parity);
575 
576 
577  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
578  int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
579  Matrix<Complex,3> link1 = arg.dataOr(mu, idx1, 1 - parity);
580 
581  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
582  // This implementation needs 4x more shared memory than the implementation using atomicAdd.
583  if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
584  // 4 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
585  if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
586  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
587  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
588  if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
589 
590  arg.dataOr(mu, idx, parity) = link;
591  arg.dataOr(mu, idx1, 1 - parity) = link1;
592  }
593  }
594 
599  template<typename Float, typename Gauge, int gauge_dir>
600  class GaugeFixInteriorPoints : Tunable {
601  GaugeFixInteriorPointsArg<Float, Gauge> arg;
602  int parity;
603  mutable char aux_string[128]; // used as a label in the autotuner
604 protected:
605  dim3 createGrid(const TuneParam &param) const
606  {
607  unsigned int blockx = param.block.x / 8;
608  if (param.aux.x > 2) blockx = param.block.x / 4;
609  unsigned int gx = (arg.threads + blockx - 1) / blockx;
610  return dim3(gx, 1, 1);
611  }
612 
613  bool advanceBlockDim (TuneParam &param) const {
614  // Use param.aux.x to tune and to save the state of the best kernel option:
615  // whether to use atomicAdd operations and whether to use 4 or 8 threads per lattice site.
616  const unsigned int min_threads0 = 32 * 8;
617  const unsigned int min_threads1 = 32 * 4;
618  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
619  const unsigned int atmadd = 0;
620  unsigned int min_threads = min_threads0;
621  param.aux.x += atmadd; // USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
622  if (param.aux.x > 2) min_threads = 32 * 4;
623  param.block.x += min_threads;
624  param.block.y = 1;
625  param.grid = createGrid(param);
626 
627  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
628  param.shared_bytes = sharedBytesPerBlock(param);
629  return true;
630  } else if (param.aux.x == 0) {
631  param.block.x = min_threads0;
632  param.block.y = 1;
633  param.aux.x = 1; // USE FOR ATOMIC ADD
634  param.grid = createGrid(param);
635  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
636  return true;
637  } else if (param.aux.x == 1) {
638  param.block.x = min_threads0;
639  param.block.y = 1;
640  param.aux.x = 2; // USE FOR NO ATOMIC ADD and LESS SHARED MEM
641  param.grid = createGrid(param);
642  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
643  return true;
644  } else if (param.aux.x == 2) {
645  param.block.x = min_threads1;
646  param.block.y = 1;
647  param.aux.x = 3; // USE FOR NO ATOMIC ADD
648  param.grid = createGrid(param);
649  param.shared_bytes = param.block.x * 4 * sizeof(Float);
650  return true;
651  } else if (param.aux.x == 3) {
652  param.block.x = min_threads1;
653  param.block.y = 1;
654  param.aux.x = 4;
655  param.grid = createGrid(param);
656  param.shared_bytes = param.block.x * sizeof(Float);
657  return true;
658  } else if (param.aux.x == 4) {
659  param.block.x = min_threads1;
660  param.block.y = 1;
661  param.aux.x = 5;
662  param.grid = createGrid(param);
663  param.shared_bytes = param.block.x * sizeof(Float);
664  return true;
665  } else {
666  return false;
667  }
668  }
669 
670 private:
671  unsigned int sharedBytesPerThread() const {
672  return 0;
673  }
674  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
675  switch (param.aux.x) {
676  case 0: return param.block.x * 4 * sizeof(Float);
677  case 1: return param.block.x * 4 * sizeof(Float) / 8;
678  case 2: return param.block.x * 4 * sizeof(Float) / 8;
679  case 3: return param.block.x * 4 * sizeof(Float);
680  default: return param.block.x * sizeof(Float);
681  }
682  }
683 
684  bool tuneSharedBytes() const {
685  return false;
686  } // Don't tune shared memory
687  bool tuneGridDim() const {
688  return false;
689  } // Don't tune the grid dimensions.
690  unsigned int minThreads() const {
691  return arg.threads;
692  }
693 
694 public:
695  GaugeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) {}
696 
697  ~GaugeFixInteriorPoints () { }
698 
699  void setParity(const int par) { parity = par; }
700 
701  void apply(const cudaStream_t &stream)
702  {
703  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
704  LAUNCH_KERNEL_GAUGEFIX(computeFixInteriorPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
705  }
706 
707  virtual void initTuneParam(TuneParam &param) const
708  {
709  param.block = dim3(256, 1, 1);
710  param.aux.x = 0;
711  param.grid = createGrid(param);
712  param.shared_bytes = sharedBytesPerBlock(param);
713  }
714 
715  virtual void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }
716 
717  TuneKey tuneKey() const {
718  std::stringstream vol;
719  vol << arg.X[0] << "x";
720  vol << arg.X[1] << "x";
721  vol << arg.X[2] << "x";
722  vol << arg.X[3];
723  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
724  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
725  }
726 
727  std::string paramString(const TuneParam &param) const {
728  std::stringstream ps(Tunable::paramString(param));
729  ps << ", atomicadd=" << param.aux.x;
730  return ps.str();
731  }
732 
733  // needed by the autotuner: back up the gauge field before tuning and restore it afterwards
734  void preTune() {
735  arg.data.backup();
736  }
737  void postTune() {
738  arg.data.restore();
739  }
740  long long flops() const {
741  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
742  } // Only correct if there is no link reconstruction
743  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
744  long long bytes() const {
745  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
746  } // Only correct if there is no link reconstruction load+save
747  };
748 
749 
750  template <typename Float, typename Gauge>
751  struct GaugeFixBorderPointsArg {
752  int threads; // number of active threads required
753  int X[4]; // grid dimensions
754  int border[4];
755  int *borderpoints[2];
756  int *faceindicessize[2];
757  size_t faceVolume[4];
758  size_t faceVolumeCB[4];
759  Gauge dataOr;
760  cudaGaugeField &data;
761  const Float relax_boost;
762 
763  GaugeFixBorderPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost, size_t faceVolume_[4], size_t faceVolumeCB_[4])
764  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
765 
766 
767  for ( int dir = 0; dir < 4; ++dir ) {
768  X[dir] = data.X()[dir] - data.R()[dir] * 2;
769  border[dir] = data.R()[dir];
770  }
771 
772  /*for(int dir=0; dir<4; ++dir){
773  if(comm_dim_partitioned(dir)) border[dir] = BORDER_RADIUS;
774  else border[dir] = 0;
775  }
776  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir] - border[dir]*2;*/
777  for ( int dir = 0; dir < 4; ++dir ) {
778  faceVolume[dir] = faceVolume_[dir];
779  faceVolumeCB[dir] = faceVolumeCB_[dir];
780  }
781  if ( comm_partitioned() ) PreCalculateLatticeIndices(faceVolume, faceVolumeCB, X, border, threads, borderpoints);
782  }
783  };
784 
788  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
789  __global__ void computeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> arg, int parity){
790  typedef complex<Float> Cmplx;
791 
792  int tid = (threadIdx.x + blockSize) % blockSize;
793  int idx = blockIdx.x * blockSize + tid;
794  if ( idx >= arg.threads ) return;
795  int mu = (threadIdx.x / blockSize);
796  idx = arg.borderpoints[parity][idx];
797  int X[4], x[4];
798  x[3] = idx / (arg.X[0] * arg.X[1] * arg.X[2]);
799  x[2] = (idx / (arg.X[0] * arg.X[1])) % arg.X[2];
800  x[1] = (idx / arg.X[0]) % arg.X[1];
801  x[0] = idx % arg.X[0];
802  #pragma unroll
803  for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
804  #pragma unroll
805  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr] + 2 * arg.border[dr];
806 
807  // 8 threads per lattice site
808  if ( ImplementationType < 3 ) {
809  if ( threadIdx.x >= blockSize * 4 ) {
810  mu -= 4;
811  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
812  parity = 1 - parity;
813  }
814  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
815  Matrix<Cmplx,3> link = arg.dataOr(mu, idx, parity);
816  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
817  // This implementation needs 8x more shared memory than the implementation using atomicAdd.
818  if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
819  // 8 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
820  if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
821  // 8 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
822  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
823  if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
824  arg.dataOr(mu, idx, parity) = link;
825  }
826  // 4 threads per lattice site
827  else{
828  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
829  Matrix<Cmplx,3> link = arg.dataOr(mu, idx, parity);
830 
831 
832  x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
833  int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
834  Matrix<Cmplx,3> link1 = arg.dataOr(mu, idx1, 1 - parity);
835 
836  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
837  // This implementation needs 4x more shared memory than the implementation using atomicAdd.
838  if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
839  // 4 threads per lattice site; the reduction is performed in shared memory using atomicAdd.
840  if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
841  // 4 threads per lattice site; the reduction is performed in shared memory without using atomicAdd.
842  // Uses the same amount of shared memory as the atomicAdd implementation, but with more thread-block synchronization.
843  if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
844 
845  arg.dataOr(mu, idx, parity) = link;
846  arg.dataOr(mu, idx1, 1 - parity) = link1;
847  }
848  }
849 
850 
851 
852 
856  template<typename Float, typename Gauge, int gauge_dir>
857  class GaugeFixBorderPoints : Tunable {
858  GaugeFixBorderPointsArg<Float, Gauge> arg;
859  int parity;
860  mutable char aux_string[128]; // used as a label in the autotuner
861  protected:
862  dim3 createGrid(const TuneParam &param) const
863  {
864  unsigned int blockx = param.block.x / 8;
865  if (param.aux.x > 2) blockx = param.block.x / 4;
866  unsigned int gx = (arg.threads + blockx - 1) / blockx;
867  return dim3(gx, 1, 1);
868  }
869 
870  bool advanceBlockDim(TuneParam &param) const
871  {
872  // Use param.aux.x to tune and to save the state of the best kernel option:
873  // whether to use atomicAdd operations and whether to use 4 or 8 threads per lattice site.
874  const unsigned int min_threads0 = 32 * 8;
875  const unsigned int min_threads1 = 32 * 4;
876  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
877  const unsigned int atmadd = 0;
878  unsigned int min_threads = min_threads0;
879  param.aux.x += atmadd; // USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
880  if (param.aux.x > 2) min_threads = 32 * 4;
881  param.block.x += min_threads;
882  param.block.y = 1;
883  param.grid = createGrid(param);
884 
885  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
886  param.shared_bytes = sharedBytesPerBlock(param);
887  return true;
888  } else if (param.aux.x == 0) {
889  param.block.x = min_threads0;
890  param.block.y = 1;
891  param.aux.x = 1; // USE FOR ATOMIC ADD
892  param.grid = createGrid(param);
893  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
894  return true;
895  } else if (param.aux.x == 1) {
896  param.block.x = min_threads0;
897  param.block.y = 1;
898  param.aux.x = 2; // USE FOR NO ATOMIC ADD and LESS SHARED MEM
899  param.grid = createGrid(param);
900  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
901  return true;
902  } else if (param.aux.x == 2) {
903  param.block.x = min_threads1;
904  param.block.y = 1;
905  param.aux.x = 3; // USE FOR NO ATOMIC ADD
906  param.grid = createGrid(param);
907  param.shared_bytes = param.block.x * 4 * sizeof(Float);
908  return true;
909  } else if (param.aux.x == 3) {
910  param.block.x = min_threads1;
911  param.block.y = 1;
912  param.aux.x = 4;
913  param.grid = createGrid(param);
914  param.shared_bytes = param.block.x * sizeof(Float);
915  return true;
916  } else if (param.aux.x == 4) {
917  param.block.x = min_threads1;
918  param.block.y = 1;
919  param.aux.x = 5;
920  param.grid = createGrid(param);
921  param.shared_bytes = param.block.x * sizeof(Float);
922  return true;
923  } else {
924  return false;
925  }
926  }
927 
928  private:
929  unsigned int sharedBytesPerThread() const {
930  return 0;
931  }
932  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
933  switch (param.aux.x) {
934  case 0: return param.block.x * 4 * sizeof(Float);
935  case 1: return param.block.x * 4 * sizeof(Float) / 8;
936  case 2: return param.block.x * 4 * sizeof(Float) / 8;
937  case 3: return param.block.x * 4 * sizeof(Float);
938  default: return param.block.x * sizeof(Float);
939  }
940  }
941 
942  bool tuneSharedBytes() const {
943  return false;
944  } // Don't tune shared memory
945  bool tuneGridDim() const {
946  return false;
947  } // Don't tune the grid dimensions.
948  unsigned int minThreads() const {
949  return arg.threads;
950  }
951 
952 public:
953  GaugeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
954  ~GaugeFixBorderPoints () {
955  if ( comm_partitioned() ) for ( int i = 0; i < 2; i++ ) pool_device_free(arg.borderpoints[i]);
956  }
957  void setParity(const int par){
958  parity = par;
959  }
960 
961  void apply(const cudaStream_t &stream){
962  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
963  LAUNCH_KERNEL_GAUGEFIX(computeFixBorderPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
964  }
965 
966  virtual void initTuneParam(TuneParam &param) const
967  {
968  param.block = dim3(256, 1, 1);
969  param.aux.x = 0;
970  param.grid = createGrid(param);
971  param.shared_bytes = sharedBytesPerBlock(param);
972  }
973 
974  virtual void defaultTuneParam(TuneParam &param) const {
975  initTuneParam(param);
976  }
977 
978  TuneKey tuneKey() const {
979  std::stringstream vol;
980  vol << arg.X[0] << "x";
981  vol << arg.X[1] << "x";
982  vol << arg.X[2] << "x";
983  vol << arg.X[3];
984  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
985  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
986  }
987 
988  std::string paramString(const TuneParam &param) const {
989  std::stringstream ps(Tunable::paramString(param));
990  ps << ", atomicadd=" << param.aux.x;
991  return ps.str();
992  }
993 
994  // needed by the autotuner: back up the gauge field before tuning and restore it afterwards
995  void preTune() {
996  arg.data.backup();
997  }
998  void postTune() {
999  arg.data.restore();
1000  }
1001  long long flops() const {
1002  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
1003  } // Only correct if there is no link reconstruction
1004  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
1005  long long bytes() const {
1006  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
1007  } // Only correct if there is no link reconstruction load+save
1008 
1009  };
1010 
1011 
1012 
1013 
1014 
1015 
1016 
1017 
1018 
1019 
1020 
1021 
1022 
1023 
1024  template <typename Gauge>
1025  struct GaugeFixUnPackArg {
1026  int X[4]; // grid dimensions
1027 #ifdef MULTI_GPU
1028  int border[4];
1029 #endif
1030  Gauge dataOr;
1031  GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
1032  : dataOr(dataOr) {
1033  for ( int dir = 0; dir < 4; ++dir ) {
1034  X[dir] = data.X()[dir] - data.R()[dir] * 2;
1035  #ifdef MULTI_GPU
1036  border[dir] = data.R()[dir];
1037  #endif
1038  }
1039  }
1040  };
1041 
1042 
1043  template<int NElems, typename Float, typename Gauge, bool pack>
1044  __global__ void Kernel_UnPackGhost(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
1045  int idx = blockIdx.x * blockDim.x + threadIdx.x;
1046  if ( idx >= size ) return;
1047  int X[4];
1048  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
1049  int x[4];
1050  int za, xodd;
1051  int borderid = 0;
1052  parity = 1 - parity;
1053  switch ( face ) {
1054  case 0: //X FACE
1055  za = idx / ( X[1] / 2);
1056  x[3] = za / X[2];
1057  x[2] = za - x[3] * X[2];
1058  x[0] = borderid;
1059  xodd = (borderid + x[2] + x[3] + parity) & 1;
1060  x[1] = (2 * idx + xodd) - za * X[1];
1061  break;
1062  case 1: //Y FACE
1063  za = idx / ( X[0] / 2);
1064  x[3] = za / X[2];
1065  x[2] = za - x[3] * X[2];
1066  x[1] = borderid;
1067  xodd = (borderid + x[2] + x[3] + parity) & 1;
1068  x[0] = (2 * idx + xodd) - za * X[0];
1069  break;
1070  case 2: //Z FACE
1071  za = idx / ( X[0] / 2);
1072  x[3] = za / X[1];
1073  x[1] = za - x[3] * X[1];
1074  x[2] = borderid;
1075  xodd = (borderid + x[1] + x[3] + parity) & 1;
1076  x[0] = (2 * idx + xodd) - za * X[0];
1077  break;
1078  case 3: //T FACE
1079  za = idx / ( X[0] / 2);
1080  x[2] = za / X[1];
1081  x[1] = za - x[2] * X[1];
1082  x[3] = borderid;
1083  xodd = (borderid + x[1] + x[2] + parity) & 1;
1084  x[0] = (2 * idx + xodd) - za * X[0];
1085  break;
1086  }
1087  for ( int dr = 0; dr < 4; ++dr ) {
1088  x[dr] += arg.border[dr];
1089  X[dr] += 2 * arg.border[dr];
1090  }
1091  x[face] -= 1;
1092  parity = 1 - parity;
1093  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
1094  typedef complex<Float> Cmplx;
1095  typedef typename mapper<Float>::type RegType;
1096  RegType tmp[NElems];
1097  Cmplx data[9];
1098  if ( pack ) {
1099  arg.dataOr.load(data, id, dir, parity);
1100  arg.dataOr.reconstruct.Pack(tmp, data, id);
1101  for ( int i = 0; i < NElems / 2; ++i ) {
1102  array[idx + size * i] = Cmplx(tmp[2*i+0], tmp[2*i+1]);
1103  }
1104  } else {
1105  for ( int i = 0; i < NElems / 2; ++i ) {
1106  tmp[2*i+0] = array[idx + size * i].real();
1107  tmp[2*i+1] = array[idx + size * i].imag();
1108  }
1109  arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
1110  arg.dataOr.save(data, id, dir, parity);
1111  }
1112  }
1113 
1114 
1115  template<int NElems, typename Float, typename Gauge, bool pack>
1116  __global__ void Kernel_UnPackTop(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
1117  int idx = blockIdx.x * blockDim.x + threadIdx.x;
1118  if ( idx >= size ) return;
1119  int X[4];
1120  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
1121  int x[4];
1122  int za, xodd;
1123  int borderid = arg.X[face] - 1;
1124  switch ( face ) {
1125  case 0: //X FACE
1126  za = idx / ( X[1] / 2);
1127  x[3] = za / X[2];
1128  x[2] = za - x[3] * X[2];
1129  x[0] = borderid;
1130  xodd = (borderid + x[2] + x[3] + parity) & 1;
1131  x[1] = (2 * idx + xodd) - za * X[1];
1132  break;
1133  case 1: //Y FACE
1134  za = idx / ( X[0] / 2);
1135  x[3] = za / X[2];
1136  x[2] = za - x[3] * X[2];
1137  x[1] = borderid;
1138  xodd = (borderid + x[2] + x[3] + parity) & 1;
1139  x[0] = (2 * idx + xodd) - za * X[0];
1140  break;
1141  case 2: //Z FACE
1142  za = idx / ( X[0] / 2);
1143  x[3] = za / X[1];
1144  x[1] = za - x[3] * X[1];
1145  x[2] = borderid;
1146  xodd = (borderid + x[1] + x[3] + parity) & 1;
1147  x[0] = (2 * idx + xodd) - za * X[0];
1148  break;
1149  case 3: //T FACE
1150  za = idx / ( X[0] / 2);
1151  x[2] = za / X[1];
1152  x[1] = za - x[2] * X[1];
1153  x[3] = borderid;
1154  xodd = (borderid + x[1] + x[2] + parity) & 1;
1155  x[0] = (2 * idx + xodd) - za * X[0];
1156  break;
1157  }
1158  for ( int dr = 0; dr < 4; ++dr ) {
1159  x[dr] += arg.border[dr];
1160  X[dr] += 2 * arg.border[dr];
1161  }
1162  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
1163  typedef complex<Float> Cmplx;
1164  typedef typename mapper<Float>::type RegType;
1165  RegType tmp[NElems];
1166  Cmplx data[9];
1167  if ( pack ) {
1168  arg.dataOr.load(data, id, dir, parity);
1169  arg.dataOr.reconstruct.Pack(tmp, data, id);
1170  for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = Cmplx(tmp[2*i+0], tmp[2*i+1]);
1171  }
1172  else{
1173  for ( int i = 0; i < NElems / 2; ++i ) {
1174  tmp[2*i+0] = array[idx + size * i].real();
1175  tmp[2*i+1] = array[idx + size * i].imag();
1176  }
1177  arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
1178  arg.dataOr.save(data, id, dir, parity);
1179  }
1180  }
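// Illustrative pack/unpack launches (a sketch of what gaugefixingOVR() below does for
// each partitioned dimension d; grid, block, GFStream, faceVolumeCB, send_d and recv_d
// are set up there):
//   // pack the updated top face into the send buffer
//   Kernel_UnPackTop<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[d] >>>
//     (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), p, d, d);
//   // after the exchange, inject the received face into the ghost region
//   Kernel_UnPackGhost<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[d] >>>
//     (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), p, d, d);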
1181 #endif
1182 
1183 
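// Outline of the over-relaxation driver below:
//   1. measure the initial action/theta and reunitarize the links;
//   2. for each step, relax the even and then the odd sites (with multiple GPUs, the
//      border sites are updated first and the halo exchange is overlapped with the
//      interior update);
//   3. every reunit_interval steps, reproject the links to SU(3);
//   4. remeasure action/theta each step and stop when theta < tolerance (if stopWtheta)
//      or when |action - action_prev| < tolerance.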
1184  template<typename Float, typename Gauge, int NElems, int gauge_dir>
1185  void gaugefixingOVR( Gauge dataOr, cudaGaugeField& data,
1186  const int Nsteps, const int verbose_interval,
1187  const Float relax_boost, const double tolerance,
1188  const int reunit_interval, const int stopWtheta) {
1189 
1190 
1191  TimeProfile profileInternalGaugeFixOVR("InternalGaugeFixQudaOVR", false);
1192 
1193  profileInternalGaugeFixOVR.TPSTART(QUDA_PROFILE_COMPUTE);
1194  double flop = 0;
1195  double byte = 0;
1196 
1197  printfQuda("\tOverrelaxation boost parameter: %lf\n", (double)relax_boost);
1198  printfQuda("\tStop criterion: %lf\n", tolerance);
1199  if ( stopWtheta ) printfQuda("\tStop criterion method: theta\n");
1200  else printfQuda("\tStop criterion method: Delta\n");
1201  printfQuda("\tMaximum number of iterations: %d\n", Nsteps);
1202  printfQuda("\tReunitarize every %d steps\n", reunit_interval);
1203  printfQuda("\tPrint convergence results every %d steps\n", verbose_interval);
1204 
1205 
1206  const double unitarize_eps = 1e-14;
1207  const double max_error = 1e-10;
1208  const int reunit_allow_svd = 1;
1209  const int reunit_svd_only = 0;
1210  const double svd_rel_error = 1e-6;
1211  const double svd_abs_error = 1e-6;
1212  setUnitarizeLinksConstants(unitarize_eps, max_error,
1213  reunit_allow_svd, reunit_svd_only,
1214  svd_rel_error, svd_abs_error);
1215  int num_failures = 0;
1216  int* num_failures_dev = static_cast<int*>(pool_device_malloc(sizeof(int)));
1217  cudaMemset(num_failures_dev, 0, sizeof(int));
1218 
1219  GaugeFixQualityArg<Gauge> argQ(dataOr, data);
1220  GaugeFixQuality<Float,Gauge, gauge_dir> GaugeFixQuality(argQ);
1221 
1222  GaugeFixArg<Float, Gauge> arg(dataOr, data, relax_boost);
1223  GaugeFix<Float,Gauge, gauge_dir> gaugeFix(arg);
1224 
1225 #ifdef MULTI_GPU
1226  void *send[4];
1227  void *recv[4];
1228  void *sendg[4];
1229  void *recvg[4];
1230  void *send_d[4];
1231  void *recv_d[4];
1232  void *sendg_d[4];
1233  void *recvg_d[4];
1234  void *hostbuffer_h[4];
1235  cudaStream_t GFStream[9];
1236  size_t offset[4];
1237  size_t bytes[4];
1238  size_t faceVolume[4];
1239  size_t faceVolumeCB[4];
1240  // do the exchange
1241  MsgHandle *mh_recv_back[4];
1242  MsgHandle *mh_recv_fwd[4];
1243  MsgHandle *mh_send_fwd[4];
1244  MsgHandle *mh_send_back[4];
1245  int X[4];
1246  dim3 block[4];
1247  dim3 grid[4];
1248 
1249  if ( comm_partitioned() ) {
1250 
1251  for ( int dir = 0; dir < 4; ++dir ) {
1252  X[dir] = data.X()[dir] - data.R()[dir] * 2;
1253  if ( !commDimPartitioned(dir) && data.R()[dir] != 0 ) errorQuda("Not supported!\n");
1254  }
1255  for ( int i = 0; i < 4; i++ ) {
1256  faceVolume[i] = 1;
1257  for ( int j = 0; j < 4; j++ ) {
1258  if ( i == j ) continue;
1259  faceVolume[i] *= X[j];
1260  }
1261  faceVolumeCB[i] = faceVolume[i] / 2;
1262  }
1263 
1264  for ( int d = 0; d < 4; d++ ) {
1265  if ( !commDimPartitioned(d)) continue;
1266  offset[d] = faceVolumeCB[d] * NElems;
1267  bytes[d] = sizeof(Float) * offset[d];
1268  send_d[d] = device_malloc(bytes[d]);
1269  recv_d[d] = device_malloc(bytes[d]);
1270  sendg_d[d] = device_malloc(bytes[d]);
1271  recvg_d[d] = device_malloc(bytes[d]);
1272  cudaStreamCreate(&GFStream[d]);
1273  cudaStreamCreate(&GFStream[4 + d]);
1274  #ifndef GPU_COMMS
1275  hostbuffer_h[d] = (void*)pinned_malloc(4 * bytes[d]);
1276  #endif
1277  block[d] = make_uint3(128, 1, 1);
1278  grid[d] = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
1279  }
1280  cudaStreamCreate(&GFStream[8]);
1281  for ( int d = 0; d < 4; d++ ) {
1282  if ( !commDimPartitioned(d)) continue;
1283  #ifdef GPU_COMMS
1284  recv[d] = recv_d[d];
1285  send[d] = send_d[d];
1286  recvg[d] = recvg_d[d];
1287  sendg[d] = sendg_d[d];
1288  #else
1289  recv[d] = hostbuffer_h[d];
1290  send[d] = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
1291  recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
1292  sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
1293  #endif
1294  mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
1295  mh_recv_fwd[d] = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
1296  mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
1297  mh_send_fwd[d] = comm_declare_send_relative(send[d], d, +1, bytes[d]);
1298  }
1299  }
1300  GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
1301  GaugeFixBorderPointsArg<Float, Gauge> argBorder(dataOr, data, relax_boost, faceVolume, faceVolumeCB);
1302  GaugeFixBorderPoints<Float,Gauge, gauge_dir> gfixBorderPoints(argBorder);
1303  GaugeFixInteriorPointsArg<Float, Gauge> argInt(dataOr, data, relax_boost);
1304  GaugeFixInteriorPoints<Float,Gauge, gauge_dir> gfixIntPoints(argInt);
1305  #endif
1306 
1307  GaugeFixQuality.apply(0);
1308  flop += (double)GaugeFixQuality.flops();
1309  byte += (double)GaugeFixQuality.bytes();
1310  double action0 = argQ.getAction();
1311  printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
1312 
1313 
1314  unitarizeLinks(data, data, num_failures_dev);
1315  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
1316  if ( num_failures > 0 ) {
1317  pool_device_free(num_failures_dev);
1318  errorQuda("Error in the unitarization\n");
1319  exit(1);
1320  }
1321  cudaMemset(num_failures_dev, 0, sizeof(int));
1322 
1323  int iter = 0;
1324  for ( iter = 0; iter < Nsteps; iter++ ) {
1325  for ( int p = 0; p < 2; p++ ) {
1326  #ifndef MULTI_GPU
1327  gaugeFix.setParity(p);
1328  gaugeFix.apply(0);
1329  flop += (double)gaugeFix.flops();
1330  byte += (double)gaugeFix.bytes();
1331  #else
1332  if ( !comm_partitioned() ) {
1333  gaugeFix.setParity(p);
1334  gaugeFix.apply(0);
1335  flop += (double)gaugeFix.flops();
1336  byte += (double)gaugeFix.bytes();
1337  }
1338  else{
1339  gfixIntPoints.setParity(p);
1340  gfixBorderPoints.setParity(p); //compute border points
1341  gfixBorderPoints.apply(0);
1342  flop += (double)gfixBorderPoints.flops();
1343  byte += (double)gfixBorderPoints.bytes();
1344  flop += (double)gfixIntPoints.flops();
1345  byte += (double)gfixIntPoints.bytes();
1346  for ( int d = 0; d < 4; d++ ) {
1347  if ( !commDimPartitioned(d)) continue;
1348  comm_start(mh_recv_back[d]);
1349  comm_start(mh_recv_fwd[d]);
1350  }
1351  // wait for the update to the halo points to complete before starting to pack...
1353  for ( int d = 0; d < 4; d++ ) {
1354  if ( !commDimPartitioned(d)) continue;
1355  //extract top face
1356  Kernel_UnPackTop<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[d] >>> (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), p, d, d);
1357  //extract bottom ghost
1358  Kernel_UnPackGhost<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[4 + d] >>> (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), 1 - p, d, d);
1359  }
1360  #ifdef GPU_COMMS
1361  for ( int d = 0; d < 4; d++ ) {
1362  if ( !commDimPartitioned(d)) continue;
1363  qudaStreamSynchronize(GFStream[d]);
1364  comm_start(mh_send_fwd[d]);
1365  qudaStreamSynchronize(GFStream[4 + d]);
1366  comm_start(mh_send_back[d]);
1367  }
1368  #else
1369  for ( int d = 0; d < 4; d++ ) {
1370  if ( !commDimPartitioned(d)) continue;
1371  cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[d]);
1372  }
1373  for ( int d = 0; d < 4; d++ ) {
1374  if ( !commDimPartitioned(d)) continue;
1375  cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[4 + d]);
1376  }
1377  #endif
1378  //compute interior points
1379  gfixIntPoints.apply(GFStream[8]);
1380 
1381  #ifndef GPU_COMMS
1382  for ( int d = 0; d < 4; d++ ) {
1383  if ( !commDimPartitioned(d)) continue;
1384  qudaStreamSynchronize(GFStream[d]);
1385  comm_start(mh_send_fwd[d]);
1386  qudaStreamSynchronize(GFStream[4 + d]);
1387  comm_start(mh_send_back[d]);
1388  }
1389  for ( int d = 0; d < 4; d++ ) {
1390  if ( !commDimPartitioned(d)) continue;
1391  comm_wait(mh_recv_back[d]);
1392  cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[d]);
1393  }
1394  for ( int d = 0; d < 4; d++ ) {
1395  if ( !commDimPartitioned(d)) continue;
1396  comm_wait(mh_recv_fwd[d]);
1397  cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[4 + d]);
1398  }
1399  #endif
1400  for ( int d = 0; d < 4; d++ ) {
1401  if ( !commDimPartitioned(d)) continue;
1402  #ifdef GPU_COMMS
1403  comm_wait(mh_recv_back[d]);
1404  #endif
1405  Kernel_UnPackGhost<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[d] >>> (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), p, d, d);
1406  }
1407  for ( int d = 0; d < 4; d++ ) {
1408  if ( !commDimPartitioned(d)) continue;
1409  #ifdef GPU_COMMS
1410  comm_wait(mh_recv_fwd[d]);
1411  #endif
1412  Kernel_UnPackTop<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[4 + d] >>> (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), 1 - p, d, d);
1413  }
1414  for ( int d = 0; d < 4; d++ ) {
1415  if ( !commDimPartitioned(d)) continue;
1416  comm_wait(mh_send_back[d]);
1417  comm_wait(mh_send_fwd[d]);
1418  qudaStreamSynchronize(GFStream[d]);
1419  qudaStreamSynchronize(GFStream[4 + d]);
1420  }
1421  qudaStreamSynchronize(GFStream[8]);
1422  }
1423  #endif
1424  /*gaugeFix.setParity(p);
1425  gaugeFix.apply(0);
1426  flop += (double)gaugeFix.flops();
1427  byte += (double)gaugeFix.bytes();
1428  #ifdef MULTI_GPU
1429  if(comm_partitioned()){//exchange updated top face links in current parity
1430  for (int d=0; d<4; d++) {
1431  if (!commDimPartitioned(d)) continue;
1432  comm_start(mh_recv_back[d]);
1433  //extract top face
1434  Kernel_UnPackTop<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(send_d[d]), p, d, d, true);
1435  #ifndef GPU_COMMS
1436  cudaMemcpy(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost);
1437  #else
1438  qudaDeviceSynchronize();
1439  #endif
1440  comm_start(mh_send_fwd[d]);
1441  comm_wait(mh_recv_back[d]);
1442  comm_wait(mh_send_fwd[d]);
1443  #ifndef GPU_COMMS
1444  cudaMemcpy(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice);
1445  #endif
1446  //inject top face in ghost
1447  Kernel_UnPackGhost<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(recv_d[d]), p, d, d, false);
1448  }
1449  //exchange updated ghost links in opposite parity
1450  for (int d=0; d<4; d++) {
1451  if (!commDimPartitioned(d)) continue;
1452  comm_start(mh_recv_fwd[d]);
1453  Kernel_UnPackGhost<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(sendg_d[d]), 1-p, d, d, true);
1454  #ifndef GPU_COMMS
1455  cudaMemcpy(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost);
1456  #else
1457  qudaDeviceSynchronize();
1458  #endif
1459  comm_start(mh_send_back[d]);
1460  comm_wait(mh_recv_fwd[d]);
1461  comm_wait(mh_send_back[d]);
1462  #ifndef GPU_COMMS
1463  cudaMemcpy(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice);
1464  #endif
1465  Kernel_UnPackTop<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(recvg_d[d]), 1-p, d, d, false);
1466  }
1467  }
1468  #endif*/
1469  }
1470  if ((iter % reunit_interval) == (reunit_interval - 1)) {
1471  unitarizeLinks(data, data, num_failures_dev);
1472  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
1473  if ( num_failures > 0 ) errorQuda("Error in the unitarization\n");
1474  cudaMemset(num_failures_dev, 0, sizeof(int));
1475  flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
1476  byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
1477  }
1478  GaugeFixQuality.apply(0);
1479  flop += (double)GaugeFixQuality.flops();
1480  byte += (double)GaugeFixQuality.bytes();
1481  double action = argQ.getAction();
1482  double diff = abs(action0 - action);
1483  if ((iter % verbose_interval) == (verbose_interval - 1))
1484  printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1485  if ( stopWtheta ) {
1486  if ( argQ.getTheta() < tolerance ) break;
1487  }
1488  else {
1489  if ( diff < tolerance ) break;
1490  }
1491  action0 = action;
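  // Convergence check: argQ.getAction() is the value of the gauge-fixing
  // functional and argQ.getTheta() the quality measure theta. With stopWtheta
  // set, the iteration stops once theta < tolerance; otherwise it stops once
  // the change of the functional between consecutive sweeps drops below
  // tolerance.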
1492  }
1493  if ((iter % reunit_interval) != 0 ) {
1494  unitarizeLinks(data, data, num_failures_dev);
1495  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
1496  if ( num_failures > 0 ) errorQuda("Error in the unitarization");
1497  cudaMemset(num_failures_dev, 0, sizeof(int));
1498  flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
1499  byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
1500  }
1501  if ((iter % verbose_interval) != 0 ) {
1502  GaugeFixQuality.apply(0);
1503  flop += (double)GaugeFixQuality.flops();
1504  byte += (double)GaugeFixQuality.bytes();
1505  double action = argQ.getAction();
1506  double diff = abs(action0 - action);
1507  printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1508  }
1509  pool_device_free(num_failures_dev);
1510  #ifdef MULTI_GPU
1511  if ( comm_partitioned() ) {
1512  data.exchangeExtendedGhost(data.R(),false);
1513  for ( int d = 0; d < 4; d++ ) {
1514  if ( commDimPartitioned(d)) {
1515  comm_free(mh_send_fwd[d]);
1516  comm_free(mh_send_back[d]);
1517  comm_free(mh_recv_back[d]);
1518  comm_free(mh_recv_fwd[d]);
1519  device_free(send_d[d]);
1520  device_free(recv_d[d]);
1521  device_free(sendg_d[d]);
1522  device_free(recvg_d[d]);
1523  cudaStreamDestroy(GFStream[d]);
1524  cudaStreamDestroy(GFStream[4 + d]);
1525  #ifndef GPU_COMMS
1526  host_free(hostbuffer_h[d]);
1527  #endif
1528  }
1529  }
1530  cudaStreamDestroy(GFStream[8]);
1531  }
1532  #endif
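  // Multi-GPU cleanup: the extended ghost zone is refreshed one final time and,
  // for every partitioned dimension, the message handles, device halo buffers
  // and per-direction streams are released (the host staging buffers are only
  // allocated when GPU_COMMS is disabled).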
1533  checkCudaError();
1535  profileInternalGaugeFixOVR.TPSTOP(QUDA_PROFILE_COMPUTE);
1536  if (getVerbosity() > QUDA_SUMMARIZE) {
1537  double secs = profileInternalGaugeFixOVR.Last(QUDA_PROFILE_COMPUTE);
1538  double gflops = (flop * 1e-9) / (secs);
1539  double gbytes = byte / (secs * 1e9);
1540  #ifdef MULTI_GPU
1541  printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops * comm_size(), gbytes * comm_size());
1542  #else
1543  printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1544  #endif
1545  }
1546  }
1547 
1548  template<typename Float, int NElems, typename Gauge>
1549  void gaugefixingOVR( Gauge dataOr, cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval,
1550  const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
1551  if ( gauge_dir != 3 ) {
1552  printfQuda("Starting Landau gauge fixing...\n");
1553  gaugefixingOVR<Float, Gauge, NElems, 4>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1554  }
1555  else {
1556  printfQuda("Starting Coulomb gauge fixing...\n");
1557  gaugefixingOVR<Float, Gauge, NElems, 3>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1558  }
1559  }
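  // gauge_dir selects the gauge condition: 3 fixes only the spatial links
  // (Coulomb gauge), while any other value fixes all four directions (Landau
  // gauge); the choice is forwarded as the compile-time gauge_dir template
  // parameter of the worker routine above.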
1560 
1561 
1562 
1563  template<typename Float>
1564  void gaugefixingOVR( cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval,
1565  const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
1566 
1567  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
1568  if ( data.isNative() ) {
1569  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
1570  //printfQuda("QUDA_RECONSTRUCT_NO\n");
1571  numParams = 18;
1572  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
1573  gaugefixingOVR<Float, 18>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1574  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
1575  //printfQuda("QUDA_RECONSTRUCT_12\n");
1576  numParams = 12;
1577  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
1578  gaugefixingOVR<Float, 12>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1579  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
1580  //printfQuda("QUDA_RECONSTRUCT_8\n");
1581  numParams = 8;
1582  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
1583  gaugefixingOVR<Float, 8>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1584  } else {
1585  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
1586  }
1587  } else {
1588  errorQuda("Invalid Gauge Order\n");
1589  }
1590  }
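  // The reconstruction type is mapped to the number of real parameters stored
  // per link (18, 12 or 8) and the matching gauge_mapper accessor is
  // instantiated; non-native field orders are rejected.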
1591 
1592 #endif // GPU_GAUGE_ALG
1593 
1594 
1606  void gaugefixingOVR( cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost,
1607  const double tolerance, const int reunit_interval, const int stopWtheta) {
1608 #ifdef GPU_GAUGE_ALG
1609  if ( data.Precision() == QUDA_HALF_PRECISION ) {
1610  errorQuda("Half precision not supported\n");
1611  }
1612  if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
1613  gaugefixingOVR<float> (data, gauge_dir, Nsteps, verbose_interval, (float)relax_boost, tolerance, reunit_interval, stopWtheta);
1614  } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
1615  gaugefixingOVR<double>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1616  } else {
1617  errorQuda("Precision %d not supported", data.Precision());
1618  }
1619 #else
1620  errorQuda("Gauge fixing has not been built");
1621 #endif // GPU_GAUGE_ALG
1622  }
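  // Illustrative usage (a sketch only, not part of this file): assuming a
  // cudaGaugeField `gauge` in a native order and single or double precision,
  // Coulomb gauge fixing with overrelaxation might be driven as follows; the
  // parameter values are hypothetical and chosen purely for illustration.
  //
  //   int    gauge_dir        = 3;      // 3: Coulomb gauge, 4: Landau gauge
  //   int    Nsteps           = 10000;  // maximum number of relaxation sweeps
  //   int    verbose_interval = 100;    // report quality every 100 sweeps
  //   double relax_boost      = 1.5;    // overrelaxation parameter omega
  //   double tolerance        = 1e-15;  // stopping tolerance
  //   int    reunit_interval  = 10;     // reunitarize every 10 sweeps
  //   int    stopWtheta       = 0;      // 0: stop on |Delta action|, 1: stop on theta
  //
  //   quda::gaugefixingOVR(gauge, gauge_dir, Nsteps, verbose_interval,
  //                        relax_boost, tolerance, reunit_interval, stopWtheta);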
1623 
1624 
1625 } //namespace quda