// QUDA v0.9.0 — gauge_fix_ovr.cu: gauge fixing with overrelaxation on the GPU.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <unitarization_links.h>
8 #include <comm_quda.h>
9 #include <gauge_fix_ovr_extra.h>
11 #include <cub_helper.cuh>
12 #include <index_helper.cuh>
13 
14 namespace quda {
15 
16 #ifdef GPU_GAUGE_ALG
17 
  // Number of real numbers per SU(3) link matrix (3x3 complex = 18 floats);
  // used below in the bytes() traffic estimates of the tunable classes.
  static int numParams = 18;
19 
/**
 * Launch helper for the overrelaxation gauge-fixing kernels.
 * tp.block.z (0-5) selects the kernel's ImplementationType template argument
 * (which shared-memory reduction strategy to use).  Implementations 0-2 use
 * eight threads per lattice site, so the per-site blockSize template argument
 * is tp.block.x/8 (valid block.x: 256,512,768,1024); implementations 3-5 use
 * four threads per site and pass tp.block.x/4 (valid block.x: multiples of
 * 128 up to 1024).  Unsupported configurations abort via errorQuda.
 */
#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \
  if ( tp.block.z == 0 ) { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<0, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<0, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<0, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<0, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else if ( tp.block.z == 1 ) { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<1, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<1, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<1, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<1, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else if ( tp.block.z == 2 ) { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<2, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<2, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<2, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<2, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else if ( tp.block.z == 3 ) { \
    switch ( tp.block.x ) { \
    case 128: \
      kernel<3, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 256: \
      kernel<3, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 384: \
      kernel<3, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<3, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 640: \
      kernel<3, 160,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<3, 192,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 896: \
      kernel<3, 224,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<3, 256,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else if ( tp.block.z == 4 ) { \
    switch ( tp.block.x ) { \
    case 128: \
      kernel<4, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 256: \
      kernel<4, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 384: \
      kernel<4, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<4, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 640: \
      kernel<4, 160,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<4, 192,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 896: \
      kernel<4, 224,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<4, 256,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else if ( tp.block.z == 5 ) { \
    switch ( tp.block.x ) { \
    case 128: \
      kernel<5, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 256: \
      kernel<5, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 384: \
      kernel<5, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<5, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 640: \
      kernel<5, 160,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<5, 192,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 896: \
      kernel<5, 224,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<5, 256,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else{ \
    errorQuda("Not implemented for %d", tp.block.z); \
  }
204 
205 
209  template <typename Gauge>
210  struct GaugeFixQualityArg : public ReduceArg<double2> {
211  int threads; // number of active threads required
212  int X[4]; // grid dimensions
213 #ifdef MULTI_GPU
214  int border[4];
215 #endif
216  Gauge dataOr;
217  GaugeFixQualityArg(const Gauge &dataOr, const cudaGaugeField &data)
218  : ReduceArg<double2>(), dataOr(dataOr) {
219 
220  for ( int dir = 0; dir < 4; ++dir ) {
221  X[dir] = data.X()[dir] - data.R()[dir] * 2;
222  #ifdef MULTI_GPU
223  border[dir] = data.R()[dir];
224  #endif
225  }
226  threads = X[0]*X[1]*X[2]*X[3]/2;
227  }
228  double getAction(){ return result_h[0].x; }
229  double getTheta(){ return result_h[0].y; }
230  };
231 
232 
236  template<int blockSize, typename Float, typename Gauge, int gauge_dir>
237  __global__ void computeFix_quality(GaugeFixQualityArg<Gauge> argQ){
238  typedef complex<Float> Cmplx;
239 
240  int idx = threadIdx.x + blockIdx.x * blockDim.x;
241  int parity = threadIdx.y;
242 
243  double2 data = make_double2(0.0,0.0);
244  if ( idx < argQ.threads ) {
245  int X[4];
246  #pragma unroll
247  for ( int dr = 0; dr < 4; ++dr ) X[dr] = argQ.X[dr];
248 
249  int x[4];
250  getCoords(x, idx, X, parity);
251 #ifdef MULTI_GPU
252  #pragma unroll
253  for ( int dr = 0; dr < 4; ++dr ) {
254  x[dr] += argQ.border[dr];
255  X[dr] += 2 * argQ.border[dr];
256  }
257 #endif
259  setZero(&delta);
260  idx = linkIndex(x,X);
261  //load upward links
262  for ( int mu = 0; mu < gauge_dir; mu++ ) {
263  Matrix<Cmplx,3> U;
264  argQ.dataOr.load((Float*)(U.data),idx, mu, parity);
265  delta -= U;
266  }
267  //18*gauge_dir
268  data.x = -delta(0,0).x - delta(1,1).x - delta(2,2).x;
269  //2
270  //load downward links
271  for ( int mu = 0; mu < gauge_dir; mu++ ) {
272  Matrix<Cmplx,3> U;
273  argQ.dataOr.load((Float*)(U.data),linkIndexM1(x,X,mu), mu, 1 - parity);
274  delta += U;
275  }
276  //18*gauge_dir
277  delta -= conj(delta);
278  //18
280  //12
281  data.y = getRealTraceUVdagger(delta, delta);
282  //35
283  //T=36*gauge_dir+65
284  }
285  reduce2d<blockSize,2>(argQ, data);
286  }
287 
288 
292  template<typename Float, typename Gauge, int gauge_dir>
293  class GaugeFixQuality : TunableLocalParity {
294  GaugeFixQualityArg<Gauge> argQ;
295  mutable char aux_string[128]; // used as a label in the autotuner
296  private:
297 
298  unsigned int minThreads() const { return argQ.threads; }
299 
300  public:
301  GaugeFixQuality(GaugeFixQualityArg<Gauge> &argQ) : argQ(argQ) { }
302  ~GaugeFixQuality () { }
303 
304  void apply(const cudaStream_t &stream){
305  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
306  argQ.result_h[0] = make_double2(0.0,0.0);
307  LAUNCH_KERNEL_LOCAL_PARITY(computeFix_quality, tp, stream, argQ, Float, Gauge, gauge_dir);
309  if ( comm_size() != 1 ) comm_allreduce_array((double*)argQ.result_h, 2);
310  argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads * comm_size());
311  argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads * comm_size());
312  }
313 
314  TuneKey tuneKey() const {
315  std::stringstream vol;
316  vol << argQ.X[0] << "x";
317  vol << argQ.X[1] << "x";
318  vol << argQ.X[2] << "x";
319  vol << argQ.X[3];
320  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",argQ.threads, sizeof(Float),gauge_dir);
321  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
322 
323  }
324  std::string paramString(const TuneParam &param) const {
325  std::stringstream ps;
326  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << ")";
327  ps << "shared=" << param.shared_bytes;
328  return ps.str();
329  }
330  long long flops() const {
331  return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
332  } // Only correct if there is no link reconstruction, no cub reduction accounted also
333  //long long bytes() const { return (1)*2*gauge_dir*argQ.dataOr.Bytes(); }//no accounting the reduction!!!! argQ.dataOr.Bytes() return 0....
334  long long bytes() const {
335  return 2LL * gauge_dir * 2 * argQ.threads * numParams * sizeof(Float);
336  } //no accounting the reduction!!!!
337 
338  };
339 
340 
344  template <typename Float, typename Gauge>
345  struct GaugeFixArg {
346  int threads; // number of active threads required
347  int X[4]; // grid dimensions
348 #ifdef MULTI_GPU
349  int border[4];
350 #endif
351  Gauge dataOr;
352  cudaGaugeField &data;
353  const Float relax_boost;
354 
355  GaugeFixArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
356  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
357 
358  for ( int dir = 0; dir < 4; ++dir ) {
359  X[dir] = data.X()[dir] - data.R()[dir] * 2;
360  #ifdef MULTI_GPU
361  border[dir] = data.R()[dir];
362  #endif
363  }
364  threads = X[0] * X[1] * X[2] * X[3] >> 1;
365  }
366  };
367 
368 
369 
370 
  /**
   * Kernel that performs one overrelaxation hit on all links touching the
   * sites of the given parity.  ImplementationType (0-5) selects the
   * shared-memory reduction strategy inside the GaugeFixHit_* helpers:
   * types 0-2 use eight threads per lattice site (one per +/- direction),
   * types 3-5 use four threads per site, each handling an up/down link pair.
   * blockSize is the number of lattice sites per thread block
   * (blockDim.x/8 for types 0-2, blockDim.x/4 for types 3-5), as arranged
   * by the LAUNCH_KERNEL_GAUGEFIX macro.
   */
  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
  __global__ void computeFix(GaugeFixArg<Float, Gauge> arg, int parity){
    typedef complex<Float> Cmplx;

    // lane within the site group; equivalent to threadIdx.x % blockSize
    int tid = (threadIdx.x + blockSize) % blockSize;
    int idx = blockIdx.x * blockSize + tid;

    if ( idx >= arg.threads ) return;

    // 8 threads per lattice site
    if ( ImplementationType < 3 ) {
      int X[4];
      #pragma unroll
      for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];

      int x[4];
      getCoords(x, idx, X, parity);
      #ifdef MULTI_GPU
      #pragma unroll
      for ( int dr = 0; dr < 4; ++dr ) {
        x[dr] += arg.border[dr];     // shift into the extended (halo) lattice
        X[dr] += 2 * arg.border[dr];
      }
      #endif
      int mu = (threadIdx.x / blockSize); // direction handled by this thread (0..7)
      int oddbit = parity;
      if ( threadIdx.x >= blockSize * 4 ) {
        // upper half of the block handles the downward links:
        // step back one site in direction mu, which flips the parity
        mu -= 4;
        x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
        oddbit = 1 - parity;
      }
      // checkerboard index of the link's site in the extended lattice
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link;
      arg.dataOr.load((Float*)(link.data),idx, mu, oddbit);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 8x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      arg.dataOr.save((Float*)(link.data),idx, mu, oddbit);
    }
    // 4 threads per lattice site
    else{
      int X[4];
      #pragma unroll
      for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];

      int x[4];
      getCoords(x, idx, X, parity);
      #ifdef MULTI_GPU
      #pragma unroll
      for ( int dr = 0; dr < 4; ++dr ) {
        x[dr] += arg.border[dr];     // shift into the extended (halo) lattice
        X[dr] += 2 * arg.border[dr];
      }
      #endif
      int mu = (threadIdx.x / blockSize); // direction handled by this thread (0..3)
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link;
      //load upward link
      arg.dataOr.load((Float*)(link.data),idx, mu, parity);

      // neighboring site in the -mu direction for the downward link
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link1;
      //load downward link
      arg.dataOr.load((Float*)(link1.data),idx1, mu, 1 - parity);

      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 4x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);

      arg.dataOr.save((Float*)(link.data),idx, mu, parity);
      arg.dataOr.save((Float*)(link1.data),idx1, mu, 1 - parity);
    }
  }
460 
461 
462 
463 
464 
465 
469  template<typename Float, typename Gauge, int gauge_dir>
470  class GaugeFix : Tunable {
471  GaugeFixArg<Float, Gauge> arg;
472  int parity;
473  mutable char aux_string[128]; // used as a label in the autotuner
474  protected:
475 
476  dim3 createGrid (const dim3 &block) const {
477  unsigned int blockx = block.x / 8;
478  if ( block.z > 2 ) blockx = block.x / 4;
479  unsigned int gx = (arg.threads + blockx - 1) / blockx;
480  return dim3(gx, 1, 1);
481  }
482  bool advanceBlockDim (TuneParam &param) const {
483  //Use param.block.z to tune and save state for best kernel option
484  // to make use or not of atomicAdd operations and 4 or 8 threads per lattice site!!!
485  const unsigned int min_threads0 = 32 * 8;
486  const unsigned int min_threads1 = 32 * 4;
487  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
488  const unsigned int atmadd = 0;
489  unsigned int min_threads = min_threads0;
490  param.block.z += atmadd; //USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
491  if ( param.block.z > 2 ) min_threads = 32 * 4;
492  param.block.x += min_threads;
493  param.block.y = 1;
494  param.grid = createGrid(param.block);
495 
496 
497 
498  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
499  if ( param.block.z == 0 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
500  else if ( param.block.z == 1 || param.block.z == 2 ) param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
501  else if ( param.block.z == 3 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
502  else if ( param.block.z == 4 || param.block.z == 5 ) param.shared_bytes = param.block.x * sizeof(Float);
503  return true;
504  }
505  else if ( param.block.z == 0 ) {
506  param.block.x = min_threads0;
507  param.block.y = 1;
508  param.block.z = 1; //USE FOR ATOMIC ADD
509  param.grid = createGrid(param.block);
510  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
511  return true;
512  }
513  else if ( param.block.z == 1 ) {
514  param.block.x = min_threads0;
515  param.block.y = 1;
516  param.block.z = 2; //USE FOR NO ATOMIC ADD and LESS SHARED MEM
517  param.grid = createGrid(param.block);
518  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
519  return true;
520  }
521  else if ( param.block.z == 2 ) {
522  param.block.x = min_threads1;
523  param.block.y = 1;
524  param.block.z = 3; //USE FOR NO ATOMIC ADD
525  param.grid = createGrid(param.block);
526  param.shared_bytes = param.block.x * 4 * sizeof(Float);
527  return true;
528  }
529  else if ( param.block.z == 3 ) {
530  param.block.x = min_threads1;
531  param.block.y = 1;
532  param.block.z = 4;
533  param.grid = createGrid(param.block);
534  param.shared_bytes = param.block.x * sizeof(Float);
535  return true;
536  }
537  else if ( param.block.z == 4 ) {
538  param.block.x = min_threads1;
539  param.block.y = 1;
540  param.block.z = 5;
541  param.grid = createGrid(param.block);
542  param.shared_bytes = param.block.x * sizeof(Float);
543  return true;
544  }
545  else
546  return false;
547  }
548  private:
549  unsigned int sharedBytesPerThread() const {
550  return 0;
551  }
552  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
553  if ( param.block.z == 0 ) return param.block.x * 4 * sizeof(Float);
554  else if ( param.block.z == 1 || param.block.z == 2 ) return param.block.x * 4 * sizeof(Float) / 8;
555  else if ( param.block.z == 3 ) return param.block.x * 4 * sizeof(Float);
556  else return param.block.x * sizeof(Float);
557  }
558 
559  bool tuneSharedBytes() const {
560  return false;
561  } // Don't tune shared memory
562  bool tuneGridDim() const {
563  return false;
564  } // Don't tune the grid dimensions.
565  unsigned int minThreads() const {
566  return arg.threads;
567  }
568 
569  public:
570  virtual void initTuneParam(TuneParam &param) const {
571  param.block = dim3(256, 1, 0);
572  param.grid = createGrid(param.block);
573  param.shared_bytes = param.block.x * 4 * sizeof(Float);
574  }
575 
576  GaugeFix(GaugeFixArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
577  ~GaugeFix () { }
578 
579  void setParity(const int par){
580  parity = par;
581  }
582 
583  void apply(const cudaStream_t &stream){
584  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
585  LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, gauge_dir);
586  }
587 
589  virtual void defaultTuneParam(TuneParam &param) const {
590  initTuneParam(param);
591  }
592 
593  TuneKey tuneKey() const {
594  std::stringstream vol;
595  vol << arg.X[0] << "x";
596  vol << arg.X[1] << "x";
597  vol << arg.X[2] << "x";
598  vol << arg.X[3];
599  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
600  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
601  }
602 
603  std::string paramString(const TuneParam &param) const {
604  std::stringstream ps(Tunable::paramString(param));
605  ps << ", atomicadd=" << param.block.z;
606  return ps.str();
607  }
608 
609  //need this
610  void preTune() {
611  arg.data.backup();
612  }
613  void postTune() {
614  arg.data.restore();
615  }
616  long long flops() const {
617  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
618  } // Only correct if there is no link reconstruction
619  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
620  long long bytes() const {
621  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
622  } //no accounting the reduction!!!!
623  };
624 
625 
626 
627 
628 #ifdef MULTI_GPU
629  template <typename Float, typename Gauge>
630  struct GaugeFixInteriorPointsArg {
631  int threads; // number of active threads required
632  int X[4]; // grid dimensions
633 #ifdef MULTI_GPU
634  int border[4];
635 #endif
636  Gauge dataOr;
637  cudaGaugeField &data;
638  const Float relax_boost;
639  GaugeFixInteriorPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
640  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
641 
642 #ifdef MULTI_GPU
643  for ( int dir = 0; dir < 4; ++dir ) {
644  if ( comm_dim_partitioned(dir)) border[dir] = data.R()[dir] + 1; //skip BORDER_RADIUS + face border point
645  else border[dir] = 0;
646  }
647  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir] - border[dir] * 2;
648 #else
649  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
650 #endif
651  threads = X[0] * X[1] * X[2] * X[3] >> 1;
652  }
653  };
654 
655 
656 
657 
  /**
   * Kernel that applies one overrelaxation hit to the interior lattice
   * points only (multi-GPU path), so the border update can proceed
   * separately.  Thread layout and ImplementationType semantics match
   * computeFix: types 0-2 use eight threads per site, types 3-5 four.
   */
  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
  __global__ void computeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> arg, int parity){
    // lane within the site group; equivalent to threadIdx.x % blockSize
    int tid = (threadIdx.x + blockSize) % blockSize;
    int idx = blockIdx.x * blockSize + tid;
    if ( idx >= arg.threads ) return;
    typedef complex<Float> Complex;
    int X[4];
#pragma unroll
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
#ifdef MULTI_GPU
    // decode the checkerboard index into coordinates by hand: the parity of
    // the interior sub-lattice differs from the full-lattice parity when the
    // sum of the border offsets is odd, corrected by p below
    int za = (idx / (X[0] / 2));
    int zb = (za / X[1]);
    x[1] = za - zb * X[1];
    x[3] = (zb / X[2]);
    x[2] = zb - x[3] * X[2];
    int p = 0; for ( int dr = 0; dr < 4; ++dr ) p += arg.border[dr];
    p = p & 1;
    int x1odd = (x[1] + x[2] + x[3] + parity + p) & 1;
    //int x1odd = (x[1] + x[2] + x[3] + parity) & 1;
    x[0] = (2 * idx + x1odd) - za * X[0];
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];     // shift into the extended lattice
      X[dr] += 2 * arg.border[dr];
    }
#else
    getCoords(x, idx, X, parity);
#endif
    int mu = (threadIdx.x / blockSize); // direction handled by this thread

    // 8 threads per lattice site
    if ( ImplementationType < 3 ) {
      if ( threadIdx.x >= blockSize * 4 ) {
        // upper half of the block handles the downward links:
        // step back one site in direction mu, which flips the parity
        mu -= 4;
        x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
        parity = 1 - parity;
      }
      // checkerboard index of the link's site in the extended lattice
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Complex,3> link;
      arg.dataOr.load((Float*)(link.data),idx, mu, parity);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 8x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      arg.dataOr.save((Float*)(link.data),idx, mu, parity);
    }
    // 4 threads per lattice site
    else{
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Complex,3> link;
      arg.dataOr.load((Float*)(link.data),idx, mu, parity);

      // neighboring site in the -mu direction for the downward link
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Complex,3> link1;
      arg.dataOr.load((Float*)(link1.data),idx1, mu, 1 - parity);

      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 4x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);

      arg.dataOr.save((Float*)(link.data),idx, mu, parity);
      arg.dataOr.save((Float*)(link1.data),idx1, mu, 1 - parity);
    }
  }
737 
738 
739 
740 
741 
742 
743 
744 
748  template<typename Float, typename Gauge, int gauge_dir>
749  class GaugeFixInteriorPoints : Tunable {
750  GaugeFixInteriorPointsArg<Float, Gauge> arg;
751  int parity;
752  mutable char aux_string[128]; // used as a label in the autotuner
753  protected:
754 
755  dim3 createGrid (const dim3 &block) const {
756  unsigned int blockx = block.x / 8;
757  if ( block.z > 2 ) blockx = block.x / 4;
758  unsigned int gx = (arg.threads + blockx - 1) / blockx;
759  return dim3(gx, 1, 1);
760  }
761  bool advanceBlockDim (TuneParam &param) const {
762  //Use param.block.z to tune and save state for best kernel option
763  // to make use or not of atomicAdd operations and 4 or 8 threads per lattice site!!!
764  const unsigned int min_threads0 = 32 * 8;
765  const unsigned int min_threads1 = 32 * 4;
766  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
767  const unsigned int atmadd = 0;
768  unsigned int min_threads = min_threads0;
769  param.block.z += atmadd; //USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
770  if ( param.block.z > 2 ) min_threads = 32 * 4;
771  param.block.x += min_threads;
772  param.block.y = 1;
773  param.grid = createGrid(param.block);
774 
775 
776 
777  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
778  if ( param.block.z == 0 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
779  else if ( param.block.z == 1 || param.block.z == 2 ) param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
780  else if ( param.block.z == 3 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
781  else if ( param.block.z == 4 || param.block.z == 5 ) param.shared_bytes = param.block.x * sizeof(Float);
782  return true;
783  }
784  else if ( param.block.z == 0 ) {
785  param.block.x = min_threads0;
786  param.block.y = 1;
787  param.block.z = 1; //USE FOR ATOMIC ADD
788  param.grid = createGrid(param.block);
789  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
790  return true;
791  }
792  else if ( param.block.z == 1 ) {
793  param.block.x = min_threads0;
794  param.block.y = 1;
795  param.block.z = 2; //USE FOR NO ATOMIC ADD and LESS SHARED MEM
796  param.grid = createGrid(param.block);
797  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
798  return true;
799  }
800  else if ( param.block.z == 2 ) {
801  param.block.x = min_threads1;
802  param.block.y = 1;
803  param.block.z = 3; //USE FOR NO ATOMIC ADD
804  param.grid = createGrid(param.block);
805  param.shared_bytes = param.block.x * 4 * sizeof(Float);
806  return true;
807  }
808  else if ( param.block.z == 3 ) {
809  param.block.x = min_threads1;
810  param.block.y = 1;
811  param.block.z = 4;
812  param.grid = createGrid(param.block);
813  param.shared_bytes = param.block.x * sizeof(Float);
814  return true;
815  }
816  else if ( param.block.z == 4 ) {
817  param.block.x = min_threads1;
818  param.block.y = 1;
819  param.block.z = 5;
820  param.grid = createGrid(param.block);
821  param.shared_bytes = param.block.x * sizeof(Float);
822  return true;
823  }
824  else
825  return false;
826  }
827  private:
828  unsigned int sharedBytesPerThread() const {
829  return 0;
830  }
831  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
832  if ( param.block.z == 0 ) return param.block.x * 4 * sizeof(Float);
833  else if ( param.block.z == 1 || param.block.z == 2 ) return param.block.x * 4 * sizeof(Float) / 8;
834  else if ( param.block.z == 3 ) return param.block.x * 4 * sizeof(Float);
835  else return param.block.x * sizeof(Float);
836  }
837 
838  bool tuneSharedBytes() const {
839  return false;
840  } // Don't tune shared memory
841  bool tuneGridDim() const {
842  return false;
843  } // Don't tune the grid dimensions.
844  unsigned int minThreads() const {
845  return arg.threads;
846  }
847 
848  public:
849  virtual void initTuneParam(TuneParam &param) const {
850  param.block = dim3(256, 1, 0);
851  param.grid = createGrid(param.block);
852  param.shared_bytes = param.block.x * 4 * sizeof(Float);
853  }
854  GaugeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
855  ~GaugeFixInteriorPoints () { }
856  void setParity(const int par){
857  parity = par;
858  }
859 
860  void apply(const cudaStream_t &stream){
861  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
862  LAUNCH_KERNEL_GAUGEFIX(computeFixInteriorPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
863  }
864 
865 
867  virtual void defaultTuneParam(TuneParam &param) const {
868  initTuneParam(param);
869  }
870 
871  TuneKey tuneKey() const {
872  std::stringstream vol;
873  vol << arg.X[0] << "x";
874  vol << arg.X[1] << "x";
875  vol << arg.X[2] << "x";
876  vol << arg.X[3];
877  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
878  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
879  }
880 
881  std::string paramString(const TuneParam &param) const {
882  std::stringstream ps(Tunable::paramString(param));
883  ps << ", atomicadd=" << param.block.z;
884  return ps.str();
885  }
886 
887  //need this
888  void preTune() {
889  arg.data.backup();
890  }
891  void postTune() {
892  arg.data.restore();
893  }
894  long long flops() const {
895  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
896  } // Only correct if there is no link reconstruction
897  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
898  long long bytes() const {
899  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
900  } // Only correct if there is no link reconstruction load+save
901  };
902 
903 
904 
905 
906 
907 
908 
909 
910 
911 
912 
913 
914 
915 
916 
917 
918  template <typename Float, typename Gauge>
919  struct GaugeFixBorderPointsArg {
920  int threads; // number of active threads required
921  int X[4]; // grid dimensions
922  int border[4];
923  int *borderpoints[2];
924  int *faceindicessize[2];
925  size_t faceVolume[4];
926  size_t faceVolumeCB[4];
927  Gauge dataOr;
928  cudaGaugeField &data;
929  const Float relax_boost;
930 
931  GaugeFixBorderPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost, size_t faceVolume_[4], size_t faceVolumeCB_[4])
932  : dataOr(dataOr), data(data), relax_boost(relax_boost) {
933 
934 
935  for ( int dir = 0; dir < 4; ++dir ) {
936  X[dir] = data.X()[dir] - data.R()[dir] * 2;
937  border[dir] = data.R()[dir];
938  }
939 
940  /*for(int dir=0; dir<4; ++dir){
941  if(comm_dim_partitioned(dir)) border[dir] = BORDER_RADIUS;
942  else border[dir] = 0;
943  }
944  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir] - border[dir]*2;*/
945  for ( int dir = 0; dir < 4; ++dir ) {
946  faceVolume[dir] = faceVolume_[dir];
947  faceVolumeCB[dir] = faceVolumeCB_[dir];
948  }
949  if ( comm_partitioned() ) PreCalculateLatticeIndices(faceVolume, faceVolumeCB, X, border, threads, borderpoints);
950  }
951  };
952 
  /**
   * Kernel to relax the gauge transformation on inter-node border sites only.
   * ImplementationType 0-2 run 8 threads per lattice site (blockSize sites per
   * block, one thread per forward/backward link in each direction);
   * ImplementationType 3-5 run 4 threads per site, each thread loading both the
   * forward link and its backward neighbour. Site indices come from the
   * precomputed arg.borderpoints list.
   */
  template<int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
  __global__ void computeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> arg, int parity){
    typedef complex<Float> Cmplx;

    // tid is threadIdx.x reduced modulo blockSize: the lane within the
    // per-site thread group; mu below is which of the (4 or 8) link slots
    // this thread handles
    int tid = (threadIdx.x + blockSize) % blockSize;
    int idx = blockIdx.x * blockSize + tid;
    if ( idx >= arg.threads ) return;
    int mu = (threadIdx.x / blockSize);
    // translate the compressed border-site index into a full lattice index
    idx = arg.borderpoints[parity][idx];
    int X[4], x[4];
    x[3] = idx / (arg.X[0] * arg.X[1] * arg.X[2]);
    x[2] = (idx / (arg.X[0] * arg.X[1])) % arg.X[2];
    x[1] = (idx / arg.X[0]) % arg.X[1];
    x[0] = idx % arg.X[0];
    // shift into the extended (halo-padded) lattice coordinates
    #pragma unroll
    for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
    #pragma unroll
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr] + 2 * arg.border[dr];

    // 8 threads per lattice site
    if ( ImplementationType < 3 ) {
      // the upper 4 thread groups handle the backward links: step one site
      // back in direction mu and flip the parity
      if ( threadIdx.x >= blockSize * 4 ) {
        mu -= 4;
        x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
        parity = 1 - parity;
      }
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link;
      arg.dataOr.load((Float*)(link.data),idx, mu, parity);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 8x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
      arg.dataOr.save((Float*)(link.data),idx, mu, parity);
    }
    // 4 threads per lattice site
    else{
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link;
      arg.dataOr.load((Float*)(link.data),idx, mu, parity);

      // also load the backward link (one site back in mu, opposite parity)
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
      Matrix<Cmplx,3> link1;
      arg.dataOr.load((Float*)(link1.data),idx1, mu, 1 - parity);

      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // this implementation needs 4x more shared memory than the implementation using atomicadd
      if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory using atomicadd
      if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
      // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd.
      // uses the same amount of shared memory as the atomicadd implementation with more thread block synchronization
      if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);

      arg.dataOr.save((Float*)(link.data),idx, mu, parity);
      arg.dataOr.save((Float*)(link1.data),idx1, mu, 1 - parity);
    }
  }
1020 
1021 
1022 
1023 
1027  template<typename Float, typename Gauge, int gauge_dir>
1028  class GaugeFixBorderPoints : Tunable {
1029  GaugeFixBorderPointsArg<Float, Gauge> arg;
1030  int parity;
1031  mutable char aux_string[128]; // used as a label in the autotuner
1032  protected:
1033 
1034  dim3 createGrid (const dim3 &block) const {
1035  unsigned int blockx = block.x / 8;
1036  if ( block.z > 2 ) blockx = block.x / 4;
1037  unsigned int gx = (arg.threads + blockx - 1) / blockx;
1038  return dim3(gx, 1, 1);
1039  }
1040  bool advanceBlockDim (TuneParam &param) const {
1041  //Use param.block.z to tune and save state for best kernel option
1042  // to make use or not of atomicAdd operations and 4 or 8 threads per lattice site!!!
1043  const unsigned int min_threads0 = 32 * 8;
1044  const unsigned int min_threads1 = 32 * 4;
1045  const unsigned int max_threads = 1024; // FIXME: use deviceProp.maxThreadsDim[0];
1046  const unsigned int atmadd = 0;
1047  unsigned int min_threads = min_threads0;
1048  param.block.z += atmadd; //USE TO SELECT BEST KERNEL OPTION WITH/WITHOUT USING ATOMICADD
1049  if ( param.block.z > 2 ) min_threads = 32 * 4;
1050  param.block.x += min_threads;
1051  param.block.y = 1;
1052  param.grid = createGrid(param.block);
1053 
1054 
1055 
1056  if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
1057  if ( param.block.z == 0 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
1058  else if ( param.block.z == 1 || param.block.z == 2 ) param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
1059  else if ( param.block.z == 3 ) param.shared_bytes = param.block.x * 4 * sizeof(Float);
1060  else if ( param.block.z == 4 || param.block.z == 5 ) param.shared_bytes = param.block.x * sizeof(Float);
1061  return true;
1062  }
1063  else if ( param.block.z == 0 ) {
1064  param.block.x = min_threads0;
1065  param.block.y = 1;
1066  param.block.z = 1; //USE FOR ATOMIC ADD
1067  param.grid = createGrid(param.block);
1068  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
1069  return true;
1070  }
1071  else if ( param.block.z == 1 ) {
1072  param.block.x = min_threads0;
1073  param.block.y = 1;
1074  param.block.z = 2; //USE FOR NO ATOMIC ADD and LESS SHARED MEM
1075  param.grid = createGrid(param.block);
1076  param.shared_bytes = param.block.x * 4 * sizeof(Float) / 8;
1077  return true;
1078  }
1079  else if ( param.block.z == 2 ) {
1080  param.block.x = min_threads1;
1081  param.block.y = 1;
1082  param.block.z = 3; //USE FOR NO ATOMIC ADD
1083  param.grid = createGrid(param.block);
1084  param.shared_bytes = param.block.x * 4 * sizeof(Float);
1085  return true;
1086  }
1087  else if ( param.block.z == 3 ) {
1088  param.block.x = min_threads1;
1089  param.block.y = 1;
1090  param.block.z = 4;
1091  param.grid = createGrid(param.block);
1092  param.shared_bytes = param.block.x * sizeof(Float);
1093  return true;
1094  }
1095  else if ( param.block.z == 4 ) {
1096  param.block.x = min_threads1;
1097  param.block.y = 1;
1098  param.block.z = 5;
1099  param.grid = createGrid(param.block);
1100  param.shared_bytes = param.block.x * sizeof(Float);
1101  return true;
1102  }
1103  else
1104  return false;
1105  }
1106  private:
1107  unsigned int sharedBytesPerThread() const {
1108  return 0;
1109  }
1110  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
1111  if ( param.block.z == 0 ) return param.block.x * 4 * sizeof(Float);
1112  else if ( param.block.z == 1 || param.block.z == 2 ) return param.block.x * 4 * sizeof(Float) / 8;
1113  else if ( param.block.z == 3 ) return param.block.x * 4 * sizeof(Float);
1114  else return param.block.x * sizeof(Float);
1115  }
1116 
1117  bool tuneSharedBytes() const {
1118  return false;
1119  } // Don't tune shared memory
1120  bool tuneGridDim() const {
1121  return false;
1122  } // Don't tune the grid dimensions.
1123  unsigned int minThreads() const {
1124  return arg.threads;
1125  }
1126 
1127  public:
1128  virtual void initTuneParam(TuneParam &param) const {
1129  param.block = dim3(256, 1, 0);
1130  param.grid = createGrid(param.block);
1131  param.shared_bytes = param.block.x * 4 * sizeof(Float);
1132  }
1133  GaugeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
1134  ~GaugeFixBorderPoints () {
1135  if ( comm_partitioned() ) for ( int i = 0; i < 2; i++ ) pool_device_free(arg.borderpoints[i]);
1136  }
1137  void setParity(const int par){
1138  parity = par;
1139  }
1140 
1141  void apply(const cudaStream_t &stream){
1142  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1143  LAUNCH_KERNEL_GAUGEFIX(computeFixBorderPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
1144  }
1145 
1147  virtual void defaultTuneParam(TuneParam &param) const {
1148  initTuneParam(param);
1149  }
1150 
1151  TuneKey tuneKey() const {
1152  std::stringstream vol;
1153  vol << arg.X[0] << "x";
1154  vol << arg.X[1] << "x";
1155  vol << arg.X[2] << "x";
1156  vol << arg.X[3];
1157  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d",arg.threads,sizeof(Float),gauge_dir);
1158  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
1159  }
1160 
1161  std::string paramString(const TuneParam &param) const {
1162  std::stringstream ps(Tunable::paramString(param));
1163  ps << ", atomicadd=" << param.block.z;
1164  return ps.str();
1165  }
1166 
1167  //need this
1168  void preTune() {
1169  arg.data.backup();
1170  }
1171  void postTune() {
1172  arg.data.restore();
1173  }
1174  long long flops() const {
1175  return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads;
1176  } // Only correct if there is no link reconstruction
1177  //long long bytes() const { return (1)*8*2*arg.dataOr.Bytes(); } // Only correct if there is no link reconstruction load+save
1178  long long bytes() const {
1179  return 8LL * 2 * arg.threads * numParams * sizeof(Float);
1180  } // Only correct if there is no link reconstruction load+save
1181 
1182  };
1183 
1184 
1185 
1186 
1187 
1188 
1189 
1190 
1191 
1192 
1193 
1194 
1195 
1196 
1197  template <typename Gauge>
1198  struct GaugeFixUnPackArg {
1199  int X[4]; // grid dimensions
1200 #ifdef MULTI_GPU
1201  int border[4];
1202 #endif
1203  Gauge dataOr;
1204  GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
1205  : dataOr(dataOr) {
1206  for ( int dir = 0; dir < 4; ++dir ) {
1207  X[dir] = data.X()[dir] - data.R()[dir] * 2;
1208  #ifdef MULTI_GPU
1209  border[dir] = data.R()[dir];
1210  #endif
1211  }
1212  }
1213  };
1214 
1215 
  /**
   * Pack (pack==true) or unpack (pack==false) the ghost face of the extended
   * gauge field in direction `face` into/from the contiguous buffer `array`.
   * One thread per checkerboarded face site; `size` is the face CB volume.
   * NElems is the number of real numbers per (reconstructed) link.
   * NOTE(review): arg.border is only declared under MULTI_GPU; this kernel is
   * presumably only instantiated in partitioned builds - confirm.
   */
  template<int NElems, typename Float, typename Gauge, bool pack>
  __global__ void Kernel_UnPackGhost(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx >= size ) return;
    int X[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
    int za, xodd;
    int borderid = 0; // ghost face sits at coordinate 0 of the interior lattice
    // decode face-local CB index with the parity of the neighbouring slice
    parity = 1 - parity;
    switch ( face ) {
    case 0: //X FACE
      za = idx / ( X[1] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[0] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[1] = (2 * idx + xodd) - za * X[1];
      break;
    case 1: //Y FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[1] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 2: //Z FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[1];
      x[1] = za - x[3] * X[1];
      x[2] = borderid;
      xodd = (borderid + x[1] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 3: //T FACE
      za = idx / ( X[0] / 2);
      x[2] = za / X[1];
      x[1] = za - x[2] * X[1];
      x[3] = borderid;
      xodd = (borderid + x[1] + x[2] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    }
    // shift into the extended lattice and step one site back into the halo
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];
      X[dr] += 2 * arg.border[dr];
    }
    x[face] -= 1;
    parity = 1 - parity;
    int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    typedef complex<Float> Cmplx;
    typedef typename mapper<Float>::type RegType;
    RegType tmp[NElems];
    RegType data[18];
    if ( pack ) {
      // load the link, compress to NElems reals and scatter into the buffer
      // with stride `size` per complex component
      arg.dataOr.load(data, id, dir, parity);
      arg.dataOr.reconstruct.Pack(tmp, data, id);
      for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = ((Cmplx*)tmp)[i];
    }
    else{
      // gather from the buffer, reconstruct the full link and store it
      for ( int i = 0; i < NElems / 2; ++i ) ((Cmplx*)tmp)[i] = array[idx + size * i];
      arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
      arg.dataOr.save(data, id, dir, parity);
    }
  }
1282 
1283 
1284 
1285 
  /**
   * Pack (pack==true) or unpack (pack==false) the top face (last interior
   * slice, coordinate X[face]-1) of the gauge field in direction `face`
   * into/from the contiguous buffer `array`. One thread per checkerboarded
   * face site; `size` is the face CB volume; NElems is the number of real
   * numbers per (reconstructed) link.
   */
  template<int NElems, typename Float, typename Gauge, bool pack>
  __global__ void Kernel_UnPackTop(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx >= size ) return;
    int X[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
    int za, xodd;
    int borderid = arg.X[face] - 1; // top slice of the interior lattice
    switch ( face ) {
    case 0: //X FACE
      za = idx / ( X[1] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[0] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[1] = (2 * idx + xodd) - za * X[1];
      break;
    case 1: //Y FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[1] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 2: //Z FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[1];
      x[1] = za - x[3] * X[1];
      x[2] = borderid;
      xodd = (borderid + x[1] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 3: //T FACE
      za = idx / ( X[0] / 2);
      x[2] = za / X[1];
      x[1] = za - x[2] * X[1];
      x[3] = borderid;
      xodd = (borderid + x[1] + x[2] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    }
    // shift into the extended (halo-padded) lattice coordinates
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];
      X[dr] += 2 * arg.border[dr];
    }
    int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    typedef complex<Float> Cmplx;
    typedef typename mapper<Float>::type RegType;
    RegType tmp[NElems];
    RegType data[18];
    if ( pack ) {
      // load the link, compress to NElems reals and scatter into the buffer
      arg.dataOr.load(data, id, dir, parity);
      arg.dataOr.reconstruct.Pack(tmp, data, id);
      for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = ((Cmplx*)tmp)[i];
    }
    else{
      // gather from the buffer, reconstruct the full link and store it
      for ( int i = 0; i < NElems / 2; ++i ) ((Cmplx*)tmp)[i] = array[idx + size * i];
      arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
      arg.dataOr.save(data, id, dir, parity);
    }
  }
1349 #endif
1350 
1351 
1352 
1353 
1354 
1355 
1356 
1357 
1358 
1359 
1360 
1361 
1362 
1363 
1364 
1365 
1366 
1367 
  /**
   * Main overrelaxation gauge-fixing driver. Iterates up to Nsteps of
   * checkerboarded relaxation sweeps, reunitarizing every reunit_interval
   * steps and printing the action/theta every verbose_interval steps, until
   * either theta (stopWtheta!=0) or the action difference falls below
   * tolerance. In partitioned multi-GPU runs the border sites are relaxed
   * first while halo exchange of the affected faces is overlapped with the
   * interior-point relaxation on a separate stream.
   */
  template<typename Float, typename Gauge, int NElems, int gauge_dir>
  void gaugefixingOVR( Gauge dataOr, cudaGaugeField& data,
                       const int Nsteps, const int verbose_interval,
                       const Float relax_boost, const double tolerance,
                       const int reunit_interval, const int stopWtheta) {

    TimeProfile profileInternalGaugeFixOVR("InternalGaugeFixQudaOVR", false);

    profileInternalGaugeFixOVR.TPSTART(QUDA_PROFILE_COMPUTE);
    double flop = 0;
    double byte = 0;

    printfQuda("\tOverrelaxation boost parameter: %lf\n", (double)relax_boost);
    printfQuda("\tStop criterium: %lf\n", tolerance);
    if ( stopWtheta ) printfQuda("\tStop criterium method: theta\n");
    else printfQuda("\tStop criterium method: Delta\n");
    printfQuda("\tMaximum number of iterations: %d\n", Nsteps);
    printfQuda("\tReunitarize at every %d steps\n", reunit_interval);
    printfQuda("\tPrint convergence results at every %d steps\n", verbose_interval);

    // reunitarization tolerances (presumably consumed by
    // setUnitarizeLinksConstants - call not visible in this chunk, confirm)
    const double unitarize_eps = 1e-14;
    const double max_error = 1e-10;
    const int reunit_allow_svd = 1;
    const int reunit_svd_only = 0;
    const double svd_rel_error = 1e-6;
    const double svd_abs_error = 1e-6;
    // device counter of links that failed to reunitarize
    // NOTE(review): num_failures_dev appears never freed in the visible code -
    // presumably released near the end of this function; confirm.
    int num_failures = 0;
    int* num_failures_dev = static_cast<int*>(pool_device_malloc(sizeof(int)));
    cudaMemset(num_failures_dev, 0, sizeof(int));

    GaugeFixQualityArg<Gauge> argQ(dataOr, data);
    GaugeFixQuality<Float,Gauge, gauge_dir> GaugeFixQuality(argQ);

    GaugeFixArg<Float, Gauge> arg(dataOr, data, relax_boost);
    GaugeFix<Float,Gauge, gauge_dir> gaugeFix(arg);

#ifdef MULTI_GPU
    // host/device staging buffers and message handles for the halo exchange;
    // only the entries for partitioned dimensions are initialized
    void *send[4];
    void *recv[4];
    void *sendg[4];
    void *recvg[4];
    void *send_d[4];
    void *recv_d[4];
    void *sendg_d[4];
    void *recvg_d[4];
    void *hostbuffer_h[4];
    cudaStream_t GFStream[9]; // [d]/[4+d]: per-dim top/ghost streams, [8]: interior
    size_t offset[4];
    size_t bytes[4];
    size_t faceVolume[4];
    size_t faceVolumeCB[4];
    // do the exchange
    MsgHandle *mh_recv_back[4];
    MsgHandle *mh_recv_fwd[4];
    MsgHandle *mh_send_fwd[4];
    MsgHandle *mh_send_back[4];
    int X[4];
    dim3 block[4];
    dim3 grid[4];

    if ( comm_partitioned() ) {

      for ( int dir = 0; dir < 4; ++dir ) {
        X[dir] = data.X()[dir] - data.R()[dir] * 2;
        if ( !commDimPartitioned(dir) && data.R()[dir] != 0 ) errorQuda("Not supported!\n");
      }
      for ( int i = 0; i < 4; i++ ) {
        faceVolume[i] = 1;
        for ( int j = 0; j < 4; j++ ) {
          if ( i == j ) continue;
          faceVolume[i] *= X[j];
        }
        faceVolumeCB[i] = faceVolume[i] / 2;
      }

      for ( int d = 0; d < 4; d++ ) {
        if ( !commDimPartitioned(d)) continue;
        offset[d] = faceVolumeCB[d] * NElems;
        bytes[d] = sizeof(Float) * offset[d];
        send_d[d] = device_malloc(bytes[d]);
        recv_d[d] = device_malloc(bytes[d]);
        sendg_d[d] = device_malloc(bytes[d]);
        recvg_d[d] = device_malloc(bytes[d]);
        cudaStreamCreate(&GFStream[d]);
        cudaStreamCreate(&GFStream[4 + d]);
        #ifndef GPU_COMMS
        hostbuffer_h[d] = (void*)pinned_malloc(4 * bytes[d]);
        #endif
        block[d] = make_uint3(128, 1, 1);
        grid[d] = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
      }
      cudaStreamCreate(&GFStream[8]);
      for ( int d = 0; d < 4; d++ ) {
        if ( !commDimPartitioned(d)) continue;
        #ifdef GPU_COMMS
        // GPU-aware comms: communicate straight from device buffers
        recv[d] = recv_d[d];
        send[d] = send_d[d];
        recvg[d] = recvg_d[d];
        sendg[d] = sendg_d[d];
        #else
        // stage through one pinned host buffer partitioned in four sections
        recv[d] = hostbuffer_h[d];
        send[d] = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
        recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
        sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
        #endif
        mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
        mh_recv_fwd[d] = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
        mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
        mh_send_fwd[d] = comm_declare_send_relative(send[d], d, +1, bytes[d]);
      }
    }
    GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
    // NOTE(review): faceVolume/faceVolumeCB are read here even when
    // !comm_partitioned(), in which case they are uninitialized; argBorder is
    // then unused, but the read itself should be guarded - confirm.
    GaugeFixBorderPointsArg<Float, Gauge> argBorder(dataOr, data, relax_boost, faceVolume, faceVolumeCB);
    GaugeFixBorderPoints<Float,Gauge, gauge_dir> gfixBorderPoints(argBorder);
    GaugeFixInteriorPointsArg<Float, Gauge> argInt(dataOr, data, relax_boost);
    GaugeFixInteriorPoints<Float,Gauge, gauge_dir> gfixIntPoints(argInt);
  #endif

    // initial measurement of the gauge-fixing quality
    GaugeFixQuality.apply(0);
    flop += (double)GaugeFixQuality.flops();
    byte += (double)GaugeFixQuality.bytes();
    double action0 = argQ.getAction();
    printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());

    unitarizeLinks(data, data, num_failures_dev);
    qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
    if ( num_failures > 0 ) {
      // NOTE(review): errorQuda already aborts, so exit(1) is unreachable
      errorQuda("Error in the unitarization\n");
      exit(1);
    }
    cudaMemset(num_failures_dev, 0, sizeof(int));

    int iter = 0;
    for ( iter = 0; iter < Nsteps; iter++ ) {
      for ( int p = 0; p < 2; p++ ) { // sweep both checkerboard parities
      #ifndef MULTI_GPU
        gaugeFix.setParity(p);
        gaugeFix.apply(0);
        flop += (double)gaugeFix.flops();
        byte += (double)gaugeFix.bytes();
      #else
        if ( !comm_partitioned() ) {
          // single-process multi-GPU build: plain full-lattice sweep
          gaugeFix.setParity(p);
          gaugeFix.apply(0);
          flop += (double)gaugeFix.flops();
          byte += (double)gaugeFix.bytes();
        }
        else{
          // 1) relax border sites, 2) exchange the affected faces while the
          // interior sites are relaxed on stream GFStream[8]
          gfixIntPoints.setParity(p);
          gfixBorderPoints.setParity(p); //compute border points
          gfixBorderPoints.apply(0);
          flop += (double)gfixBorderPoints.flops();
          byte += (double)gfixBorderPoints.bytes();
          flop += (double)gfixIntPoints.flops();
          byte += (double)gfixIntPoints.bytes();
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            comm_start(mh_recv_back[d]);
            comm_start(mh_recv_fwd[d]);
          }
          //wait for the update to the halo points before start packing...
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            //extract top face
            Kernel_UnPackTop<NElems, Float, Gauge, true><< < grid[d], block[d], 0, GFStream[d] >> > (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), p, d, d);
            //extract bottom ghost
            Kernel_UnPackGhost<NElems, Float, Gauge, true><< < grid[d], block[d], 0, GFStream[4 + d] >> > (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), 1 - p, d, d);
          }
        #ifdef GPU_COMMS
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            qudaStreamSynchronize(GFStream[d]);
            comm_start(mh_send_fwd[d]);
            qudaStreamSynchronize(GFStream[4 + d]);
            comm_start(mh_send_back[d]);
          }
        #else
          // stage packed faces to the host asynchronously
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[d]);
          }
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[4 + d]);
          }
        #endif
          //compute interior points
          gfixIntPoints.apply(GFStream[8]);

        #ifndef GPU_COMMS
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            qudaStreamSynchronize(GFStream[d]);
            comm_start(mh_send_fwd[d]);
            qudaStreamSynchronize(GFStream[4 + d]);
            comm_start(mh_send_back[d]);
          }
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            comm_wait(mh_recv_back[d]);
            cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[d]);
          }
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            comm_wait(mh_recv_fwd[d]);
            cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[4 + d]);
          }
        #endif
          // inject the received faces back into the extended field
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
          #ifdef GPU_COMMS
            comm_wait(mh_recv_back[d]);
          #endif
            Kernel_UnPackGhost<NElems, Float, Gauge, false><< < grid[d], block[d], 0, GFStream[d] >> > (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), p, d, d);
          }
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
          #ifdef GPU_COMMS
            comm_wait(mh_recv_fwd[d]);
          #endif
            Kernel_UnPackTop<NElems, Float, Gauge, false><< < grid[d], block[d], 0, GFStream[4 + d] >> > (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), 1 - p, d, d);
          }
          for ( int d = 0; d < 4; d++ ) {
            if ( !commDimPartitioned(d)) continue;
            comm_wait(mh_send_back[d]);
            comm_wait(mh_send_fwd[d]);
            qudaStreamSynchronize(GFStream[d]);
            qudaStreamSynchronize(GFStream[4 + d]);
          }
          qudaStreamSynchronize(GFStream[8]);
        }
      #endif
        /*gaugeFix.setParity(p);
           gaugeFix.apply(0);
           flop += (double)gaugeFix.flops();
           byte += (double)gaugeFix.bytes();
         #ifdef MULTI_GPU
           if(comm_partitioned()){//exchange updated top face links in current parity
           for (int d=0; d<4; d++) {
            if (!commDimPartitioned(d)) continue;
            comm_start(mh_recv_back[d]);
            //extract top face
            Kernel_UnPackTop<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(send_d[d]), p, d, d, true);
           #ifndef GPU_COMMS
            cudaMemcpy(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost);
           #else
            qudaDeviceSynchronize();
           #endif
            comm_start(mh_send_fwd[d]);
            comm_wait(mh_recv_back[d]);
            comm_wait(mh_send_fwd[d]);
           #ifndef GPU_COMMS
            cudaMemcpy(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice);
           #endif
            //inject top face in ghost
            Kernel_UnPackGhost<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(recv_d[d]), p, d, d, false);
           }
           //exchange updated ghost links in opposite parity
           for (int d=0; d<4; d++) {
            if (!commDimPartitioned(d)) continue;
            comm_start(mh_recv_fwd[d]);
            Kernel_UnPackGhost<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(sendg_d[d]), 1-p, d, d, true);
           #ifndef GPU_COMMS
            cudaMemcpy(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost);
           #else
            qudaDeviceSynchronize();
           #endif
            comm_start(mh_send_back[d]);
            comm_wait(mh_recv_fwd[d]);
            comm_wait(mh_send_back[d]);
           #ifndef GPU_COMMS
            cudaMemcpy(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice);
           #endif
            Kernel_UnPackTop<NElems, Float, Gauge><<<grid[d], block[d]>>>(faceVolumeCB[d], dataexarg, reinterpret_cast<Float*>(recvg_d[d]), 1-p, d, d, false);
           }
           }
         #endif*/
      }
      // periodic reunitarization to control accumulated round-off
      if ((iter % reunit_interval) == (reunit_interval - 1)) {
        unitarizeLinks(data, data, num_failures_dev);
        qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
        if ( num_failures > 0 ) errorQuda("Error in the unitarization\n");
        cudaMemset(num_failures_dev, 0, sizeof(int));
        flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
        byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
      }
      // measure quality and test the convergence criterion
      GaugeFixQuality.apply(0);
      flop += (double)GaugeFixQuality.flops();
      byte += (double)GaugeFixQuality.bytes();
      double action = argQ.getAction();
      double diff = abs(action0 - action);
      if ((iter % verbose_interval) == (verbose_interval - 1))
        printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
      if ( stopWtheta ) {
        if ( argQ.getTheta() < tolerance ) break;
      }
      else{
        if ( diff < tolerance ) break;
      }
      action0 = action;
    }
    // final reunitarization/report if the loop exited between intervals
    if ((iter % reunit_interval) != 0 ) {
      unitarizeLinks(data, data, num_failures_dev);
      qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
      if ( num_failures > 0 ) errorQuda("Error in the unitarization\n");
      cudaMemset(num_failures_dev, 0, sizeof(int));
      flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
      byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
    }
    if ((iter % verbose_interval) != 0 ) {
      GaugeFixQuality.apply(0);
      flop += (double)GaugeFixQuality.flops();
      byte += (double)GaugeFixQuality.bytes();
      double action = argQ.getAction();
      double diff = abs(action0 - action);
      printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
    }
  #ifdef MULTI_GPU
    // refresh the extended ghost zones and release all comm resources
    if ( comm_partitioned() ) {
      data.exchangeExtendedGhost(data.R(),false);
      for ( int d = 0; d < 4; d++ ) {
        if ( commDimPartitioned(d)) {
          comm_free(mh_send_fwd[d]);
          comm_free(mh_send_back[d]);
          comm_free(mh_recv_back[d]);
          comm_free(mh_recv_fwd[d]);
          device_free(send_d[d]);
          device_free(recv_d[d]);
          device_free(sendg_d[d]);
          device_free(recvg_d[d]);
          cudaStreamDestroy(GFStream[d]);
          cudaStreamDestroy(GFStream[4 + d]);
          #ifndef GPU_COMMS
          host_free(hostbuffer_h[d]);
          #endif
        }
      }
      cudaStreamDestroy(GFStream[8]);
    }
  #endif
    checkCudaError();
    profileInternalGaugeFixOVR.TPSTOP(QUDA_PROFILE_COMPUTE);
    if (getVerbosity() > QUDA_SUMMARIZE){
      double secs = profileInternalGaugeFixOVR.Last(QUDA_PROFILE_COMPUTE);
      double gflops = (flop * 1e-9) / (secs);
      double gbytes = byte / (secs * 1e9);
    #ifdef MULTI_GPU
      printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops * comm_size(), gbytes * comm_size());
    #else
      printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
    #endif
    }
  }
1731 
1732  template<typename Float, int NElems, typename Gauge>
1733  void gaugefixingOVR( Gauge dataOr, cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval,
1734  const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
1735  if ( gauge_dir != 3 ) {
1736  printfQuda("Starting Landau gauge fixing...\n");
1737  gaugefixingOVR<Float, Gauge, NElems, 4>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1738  }
1739  else {
1740  printfQuda("Starting Coulomb gauge fixing...\n");
1741  gaugefixingOVR<Float, Gauge, NElems, 3>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1742  }
1743  }
1744 
1745 
1746 
1747  template<typename Float>
1748  void gaugefixingOVR( cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval,
1749  const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
1750 
1751  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
1752  if ( data.isNative() ) {
1753  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
1754  //printfQuda("QUDA_RECONSTRUCT_NO\n");
1755  numParams = 18;
1756  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
1757  gaugefixingOVR<Float, 18>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1758  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
1759  //printfQuda("QUDA_RECONSTRUCT_12\n");
1760  numParams = 12;
1761  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
1762  gaugefixingOVR<Float, 12>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1763  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
1764  //printfQuda("QUDA_RECONSTRUCT_8\n");
1765  numParams = 8;
1766  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
1767  gaugefixingOVR<Float, 8>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1768  } else {
1769  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
1770  }
1771  } else {
1772  errorQuda("Invalid Gauge Order\n");
1773  }
1774  }
1775 
1776 #endif // GPU_GAUGE_ALG
1777 
1778 
  /**
   * @brief Public entry point: gauge fixing with overrelaxation, dispatching
   * on the field precision. Half precision is rejected; single and double are
   * forwarded to the templated implementation. Fails at runtime if the
   * library was built without GPU_GAUGE_ALG.
   */
  void gaugefixingOVR( cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost,
                       const double tolerance, const int reunit_interval, const int stopWtheta) {
#ifdef GPU_GAUGE_ALG
    if ( data.Precision() == QUDA_HALF_PRECISION ) {
      errorQuda("Half precision not supported\n");
    }
    if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
      gaugefixingOVR<float> (data, gauge_dir, Nsteps, verbose_interval, (float)relax_boost, tolerance, reunit_interval, stopWtheta);
    } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
      gaugefixingOVR<double>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
    } else {
      errorQuda("Precision %d not supported", data.Precision());
    }
#else
    errorQuda("Gauge fixing has not been built");
#endif // GPU_GAUGE_ALG
  }
1807 
1808 
1809 } //namespace quda
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
dim3 dim3 blockDim
int commDimPartitioned(int dir)
double mu
Definition: test_util.cpp:1643
#define pinned_malloc(size)
Definition: malloc_quda.h:55
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:592
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
#define host_free(ptr)
Definition: malloc_quda.h:59
int * num_failures_dev
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:1021
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:208
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
int num_failures
void exit(int) __attribute__((noreturn))
size_t size_t offset
QudaGaugeParam param
Definition: pack_test.cpp:17
void comm_free(MsgHandle *mh)
Definition: comm_mpi.cpp:252
const int * R() const
int comm_size(void)
Definition: comm_mpi.cpp:126
static unsigned int delta
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:59
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:74
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
for(int s=0;s< param.dc.Ls;s++)
T data[N *N]
Definition: quda_matrix.h:74
static __inline__ size_t p
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:260
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:1015
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
const void int blockSize
if(err !=cudaSuccess)
#define printfQuda(...)
Definition: util_quda.h:84
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
#define device_malloc(size)
Definition: malloc_quda.h:52
int faceVolume[4]
Definition: test_util.cpp:32
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:110
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
#define checkCudaError()
Definition: util_quda.h:129
struct cudaExtent unsigned int cudaArray_t array
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:266
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
const int * X() const
#define device_free(ptr)
Definition: malloc_quda.h:57
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)