QUDA  1.0.0
gauge_fix_fft.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <unitarization_links.h>
8 #include <atomic.cuh>
9 #include <cub_helper.cuh>
10 #include <index_helper.cuh>
11 
12 #include <cufft.h>
13 
14 #ifdef GPU_GAUGE_ALG
15 #include <CUFFT_Plans.h>
16 #endif
17 
18 namespace quda {
19 
20 #ifdef GPU_GAUGE_ALG
21 
22 //UNCOMMENT THIS IF YOU WAN'T TO USE LESS MEMORY
23 #define GAUGEFIXING_DONT_USE_GX
24 //Without using the precalculation of g(x),
25 //we loose some performance, because Delta(x) is written in normal lattice coordinates need for the FFTs
26 //and the gauge array in even/odd format
27 
28 #ifdef HOST_DEBUG
29 #ifdef GAUGEFIXING_DONT_USE_GX
30 #warning Not using precalculated g(x)
31 #else
32 #warning Using precalculated g(x)
33 #endif
34 #endif
35 
36 
37 #ifndef FL_UNITARIZE_PI
38 #define FL_UNITARIZE_PI 3.14159265358979323846
39 #endif
40 
41  template <typename Float>
42  struct GaugeFixFFTRotateArg {
43  int threads; // number of active threads required
44  int X[4]; // grid dimensions
45  complex<Float> *tmp0;
46  complex<Float> *tmp1;
47  GaugeFixFFTRotateArg(const cudaGaugeField &data){
48  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
49  threads = X[0] * X[1] * X[2] * X[3];
50  tmp0 = 0;
51  tmp1 = 0;
52  }
53  };
54 
55  template <int direction, typename Float>
56  __global__ void fft_rotate_kernel_2D2D(GaugeFixFFTRotateArg<Float> arg){ //Cmplx *data_in, Cmplx *data_out){
57  int id = blockIdx.x * blockDim.x + threadIdx.x;
58  if ( id >= arg.threads ) return;
59  if ( direction == 0 ) {
60  int x3 = id / (arg.X[0] * arg.X[1] * arg.X[2]);
61  int x2 = (id / (arg.X[0] * arg.X[1])) % arg.X[2];
62  int x1 = (id / arg.X[0]) % arg.X[1];
63  int x0 = id % arg.X[0];
64 
65  int id = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
66  int id_out = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
67  arg.tmp1[id_out] = arg.tmp0[id];
68  //data_out[id_out] = data_in[id];
69  }
70  if ( direction == 1 ) {
71 
72  int x1 = id / (arg.X[2] * arg.X[3] * arg.X[0]);
73  int x0 = (id / (arg.X[2] * arg.X[3])) % arg.X[0];
74  int x3 = (id / arg.X[2]) % arg.X[3];
75  int x2 = id % arg.X[2];
76 
77  int id = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
78  int id_out = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
79  arg.tmp1[id_out] = arg.tmp0[id];
80  //data_out[id_out] = data_in[id];
81  }
82  }
83 
84 
85 
86 
87 
88 
89  template<typename Float>
90  class GaugeFixFFTRotate : Tunable {
91  GaugeFixFFTRotateArg<Float> arg;
92  int direction;
93  mutable char aux_string[128]; // used as a label in the autotuner
94  private:
95  unsigned int sharedBytesPerThread() const {
96  return 0;
97  }
98  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
99  return 0;
100  }
101  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
102  bool tuneGridDim() const {
103  return false;
104  } // Don't tune the grid dimensions.
105  unsigned int minThreads() const {
106  return arg.threads;
107  }
108 
109  public:
110  GaugeFixFFTRotate(GaugeFixFFTRotateArg<Float> &arg) : arg(arg) {
111  direction = 0;
112  }
113  ~GaugeFixFFTRotate () {
114  }
115  void setDirection(int dir, complex<Float> *data_in, complex<Float> *data_out){
116  direction = dir;
117  arg.tmp0 = data_in;
118  arg.tmp1 = data_out;
119  }
120 
121  void apply(const cudaStream_t &stream){
122  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
123  if ( direction == 0 )
124  fft_rotate_kernel_2D2D<0, Float > <<< tp.grid, tp.block, 0, stream >>> (arg);
125  else if ( direction == 1 )
126  fft_rotate_kernel_2D2D<1, Float > <<< tp.grid, tp.block, 0, stream >>> (arg);
127  else
128  errorQuda("Error in GaugeFixFFTRotate option.\n");
129  }
130 
131  TuneKey tuneKey() const {
132  std::stringstream vol;
133  vol << arg.X[0] << "x";
134  vol << arg.X[1] << "x";
135  vol << arg.X[2] << "x";
136  vol << arg.X[3];
137  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
138  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
139 
140  }
141 
142  long long flops() const {
143  return 0;
144  }
145  long long bytes() const {
146  return 4LL * sizeof(Float) * arg.threads;
147  }
148 
149  };
150 
151 
152  template <typename Float, typename Gauge>
153  struct GaugeFixQualityArg : public ReduceArg<double2> {
154  int threads; // number of active threads required
155  int X[4]; // grid dimensions
156  Gauge dataOr;
157  complex<Float> *delta;
158 
159  GaugeFixQualityArg(const Gauge &dataOr, const cudaGaugeField &data, complex<Float> * delta)
160  : ReduceArg<double2>(), dataOr(dataOr), delta(delta) {
161  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
162  threads = data.VolumeCB();
163  }
164  double getAction(){ return result_h[0].x; }
165  double getTheta(){ return result_h[0].y; }
166  };
167 
168  template<int blockSize, int Elems, typename Float, typename Gauge, int gauge_dir>
169  __global__ void computeFix_quality(GaugeFixQualityArg<Float, Gauge> argQ){
170  int idx_cb = threadIdx.x + blockIdx.x * blockDim.x;
171  int parity = threadIdx.y;
172 
173  double2 data = make_double2(0.0,0.0);
174  while (idx_cb < argQ.threads) {
175  typedef complex<Float> Cmplx;
176 
177  int x[4];
178  getCoords(x, idx_cb, argQ.X, parity);
179  Matrix<Cmplx,3> delta;
180  setZero(&delta);
181  //idx = linkIndex(x,X);
182  for ( int mu = 0; mu < gauge_dir; mu++ ) {
183  Matrix<Cmplx,3> U = argQ.dataOr(mu, idx_cb, parity);
184  delta -= U;
185  }
186  //18*gauge_dir
187  data.x += -delta(0, 0).x - delta(1, 1).x - delta(2, 2).x;
188  //2
189  for ( int mu = 0; mu < gauge_dir; mu++ ) {
190  Matrix<Cmplx,3> U = argQ.dataOr(mu, linkIndexM1(x,argQ.X,mu), 1 - parity);
191  delta += U;
192  }
193  //18*gauge_dir
194  delta -= conj(delta);
195  //18
196  //SAVE DELTA!!!!!
197  SubTraceUnit(delta);
198  int idx = getIndexFull(idx_cb, argQ.X, parity);
199  //Saving Delta
200  argQ.delta[idx] = delta(0,0);
201  argQ.delta[idx + 2 * argQ.threads] = delta(0,1);
202  argQ.delta[idx + 4 * argQ.threads] = delta(0,2);
203  argQ.delta[idx + 6 * argQ.threads] = delta(1,1);
204  argQ.delta[idx + 8 * argQ.threads] = delta(1,2);
205  argQ.delta[idx + 10 * argQ.threads] = delta(2,2);
206  //12
207  data.y += getRealTraceUVdagger(delta, delta);
208  //35
209  //T=36*gauge_dir+65
210 
211  idx_cb += blockDim.x * gridDim.x;
212  }
213 
214  reduce2d<blockSize,2>(argQ, data);
215  }
216 
217 
218 
219  template<int Elems, typename Float, typename Gauge, int gauge_dir>
220  class GaugeFixQuality : TunableLocalParity {
221  GaugeFixQualityArg<Float, Gauge> argQ;
222  mutable char aux_string[128]; // used as a label in the autotuner
223 
224  private:
225  bool tuneGridDim() const { return true; }
226 
227  public:
228  GaugeFixQuality(GaugeFixQualityArg<Float, Gauge> &argQ)
229  : argQ(argQ) {
230  }
231  ~GaugeFixQuality () { }
232 
233  void apply(const cudaStream_t &stream){
234  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
235  argQ.result_h[0] = make_double2(0.0,0.0);
236  LAUNCH_KERNEL_LOCAL_PARITY(computeFix_quality, tp, stream, argQ, Elems, Float, Gauge, gauge_dir);
238  argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads);
239  argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads);
240  }
241 
242  TuneKey tuneKey() const {
243  std::stringstream vol;
244  vol << argQ.X[0] << "x" << argQ.X[1] << "x" << argQ.X[2] << "x" << argQ.X[3];
245  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d", argQ.threads, sizeof(Float), gauge_dir);
246  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
247  }
248 
249  long long flops() const {
250  return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
251  } // Only correct if there is no link reconstruction, no cub reduction accounted also
252  long long bytes() const {
253  return (2LL * gauge_dir + 2LL) * Elems * 2 * argQ.threads * sizeof(Float);
254  } //Not accounting the reduction!!!
255 
256  };
257 
258 
259 
260  template <typename Float>
261  struct GaugeFixArg {
262  int threads; // number of active threads required
263  int X[4]; // grid dimensions
264  cudaGaugeField &data;
265  Float *invpsq;
266  complex<Float> *delta;
267  complex<Float> *gx;
268 
269  GaugeFixArg( cudaGaugeField & data, const int Elems) : data(data){
270  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
271  threads = X[0] * X[1] * X[2] * X[3];
272  invpsq = (Float*)device_malloc(sizeof(Float) * threads);
273  delta = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads * 6);
274 #ifdef GAUGEFIXING_DONT_USE_GX
275  gx = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads);
276 #else
277  gx = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads * Elems);
278 #endif
279  }
280  void free(){
281  device_free(invpsq);
282  device_free(delta);
283  device_free(gx);
284  }
285  };
286 
287 
288 
289 
290  template <typename Float>
291  __global__ void kernel_gauge_set_invpsq(GaugeFixArg<Float> arg){
292  int id = blockIdx.x * blockDim.x + threadIdx.x;
293  if ( id >= arg.threads ) return;
294  int x1 = id / (arg.X[2] * arg.X[3] * arg.X[0]);
295  int x0 = (id / (arg.X[2] * arg.X[3])) % arg.X[0];
296  int x3 = (id / arg.X[2]) % arg.X[3];
297  int x2 = id % arg.X[2];
298  //id = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
299  Float sx = sin( (Float)x0 * FL_UNITARIZE_PI / (Float)arg.X[0]);
300  Float sy = sin( (Float)x1 * FL_UNITARIZE_PI / (Float)arg.X[1]);
301  Float sz = sin( (Float)x2 * FL_UNITARIZE_PI / (Float)arg.X[2]);
302  Float st = sin( (Float)x3 * FL_UNITARIZE_PI / (Float)arg.X[3]);
303  Float sinsq = sx * sx + sy * sy + sz * sz + st * st;
304  Float prcfact = 0.0;
305  //The FFT normalization is done here
306  if ( sinsq > 0.00001 ) prcfact = 4.0 / (sinsq * (Float)arg.threads);
307  arg.invpsq[id] = prcfact;
308  }
309 
310 
311  template<typename Float>
312  class GaugeFixSETINVPSP : Tunable {
313  GaugeFixArg<Float> arg;
314  mutable char aux_string[128]; // used as a label in the autotuner
315  private:
316  unsigned int sharedBytesPerThread() const {
317  return 0;
318  }
319  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
320  return 0;
321  }
322  bool tuneSharedBytes() const {
323  return false;
324  } // Don't tune shared memory
325  bool tuneGridDim() const {
326  return false;
327  } // Don't tune the grid dimensions.
328  unsigned int minThreads() const {
329  return arg.threads;
330  }
331 
332  public:
333  GaugeFixSETINVPSP(GaugeFixArg<Float> &arg) : arg(arg) { }
334  ~GaugeFixSETINVPSP () { }
335 
336  void apply(const cudaStream_t &stream){
337  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
338  kernel_gauge_set_invpsq<Float> <<< tp.grid, tp.block, 0, stream >>> (arg);
339  }
340 
341  TuneKey tuneKey() const {
342  std::stringstream vol;
343  vol << arg.X[0] << "x";
344  vol << arg.X[1] << "x";
345  vol << arg.X[2] << "x";
346  vol << arg.X[3];
347  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
348  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
349 
350  }
351 
352  long long flops() const {
353  return 21 * arg.threads;
354  }
355  long long bytes() const {
356  return sizeof(Float) * arg.threads;
357  }
358 
359  };
360 
361  template<typename Float>
362  __global__ void kernel_gauge_mult_norm_2D(GaugeFixArg<Float> arg){
363  int id = blockIdx.x * blockDim.x + threadIdx.x;
364  if ( id < arg.threads ) arg.gx[id] = arg.gx[id] * arg.invpsq[id];
365  }
366 
367 
368  template<typename Float>
369  class GaugeFixINVPSP : Tunable {
370  GaugeFixArg<Float> arg;
371  mutable char aux_string[128]; // used as a label in the autotuner
372  private:
373  unsigned int sharedBytesPerThread() const {
374  return 0;
375  }
376  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
377  return 0;
378  }
379  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
380  bool tuneGridDim() const {
381  return false;
382  } // Don't tune the grid dimensions.
383  unsigned int minThreads() const {
384  return arg.threads;
385  }
386 
387  public:
388  GaugeFixINVPSP(GaugeFixArg<Float> &arg)
389  : arg(arg){
390  cudaFuncSetCacheConfig( kernel_gauge_mult_norm_2D<Float>, cudaFuncCachePreferL1);
391  }
392  ~GaugeFixINVPSP () {
393  }
394 
395  void apply(const cudaStream_t &stream){
396  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
397  kernel_gauge_mult_norm_2D<Float> <<< tp.grid, tp.block, 0, stream >>> (arg);
398  }
399 
400  TuneKey tuneKey() const {
401  std::stringstream vol;
402  vol << arg.X[0] << "x";
403  vol << arg.X[1] << "x";
404  vol << arg.X[2] << "x";
405  vol << arg.X[3];
406  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
407  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
408 
409  }
410 
411  void preTune(){
412  //since delta contents are irrelevant at this point, we can swap gx with delta
413  complex<Float> *tmp = arg.gx;
414  arg.gx = arg.delta;
415  arg.delta = tmp;
416  }
417  void postTune(){
418  arg.gx = arg.delta;
419  }
420  long long flops() const {
421  return 2LL * arg.threads;
422  }
423  long long bytes() const {
424  return 5LL * sizeof(Float) * arg.threads;
425  }
426 
427  };
428 
429 
430 
431  template <typename Float>
432  __host__ __device__ inline void reunit_link( Matrix<complex<Float>,3> &U ){
433 
434  complex<Float> t2((Float)0.0, (Float)0.0);
435  Float t1 = 0.0;
436  //first normalize first row
437  //sum of squares of row
438 #pragma unroll
439  for ( int c = 0; c < 3; c++ ) t1 += norm(U(0,c));
440  t1 = (Float)1.0 / sqrt(t1);
441  //14
442  //used to normalize row
443 #pragma unroll
444  for ( int c = 0; c < 3; c++ ) U(0,c) *= t1;
445  //6
446 #pragma unroll
447  for ( int c = 0; c < 3; c++ ) t2 += conj(U(0,c)) * U(1,c);
448  //24
449 #pragma unroll
450  for ( int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
451  //24
452  //normalize second row
453  //sum of squares of row
454  t1 = 0.0;
455 #pragma unroll
456  for ( int c = 0; c < 3; c++ ) t1 += norm(U(1,c));
457  t1 = (Float)1.0 / sqrt(t1);
458  //14
459  //used to normalize row
460 #pragma unroll
461  for ( int c = 0; c < 3; c++ ) U(1, c) *= t1;
462  //6
463  //Reconstruct lat row
464  U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
465  U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
466  U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
467  //42
468  //T=130
469  }
470 
471 #ifdef GAUGEFIXING_DONT_USE_GX
472 
473  template <typename Float, typename Gauge>
474  __global__ void kernel_gauge_fix_U_EO_NEW( GaugeFixArg<Float> arg, Gauge dataOr, Float half_alpha){
475  int id = threadIdx.x + blockIdx.x * blockDim.x;
476  int parity = threadIdx.y;
477 
478  if ( id >= arg.threads/2 ) return;
479 
480  typedef complex<Float> Cmplx;
481 
482  int x[4];
483  getCoords(x, id, arg.X, parity);
484  int idx = ((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0];
485  Matrix<Cmplx,3> de;
486  //Read Delta
487  de(0,0) = arg.delta[idx + 0 * arg.threads];
488  de(0,1) = arg.delta[idx + 1 * arg.threads];
489  de(0,2) = arg.delta[idx + 2 * arg.threads];
490  de(1,1) = arg.delta[idx + 3 * arg.threads];
491  de(1,2) = arg.delta[idx + 4 * arg.threads];
492  de(2,2) = arg.delta[idx + 5 * arg.threads];
493 
494  de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
495  de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
496  de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
497  Matrix<Cmplx,3> g;
498  setIdentity(&g);
499  g += de * half_alpha;
500  //36
501  reunit_link<Float>( g );
502  //130
503 
504 
505  for ( int mu = 0; mu < 4; mu++ ) {
506  Matrix<Cmplx,3> U = dataOr(mu, id, parity);
507  Matrix<Cmplx,3> g0;
508  U = g * U;
509  //198
510  idx = linkNormalIndexP1(x,arg.X,mu);
511  //Read Delta
512  de(0,0) = arg.delta[idx + 0 * arg.threads];
513  de(0,1) = arg.delta[idx + 1 * arg.threads];
514  de(0,2) = arg.delta[idx + 2 * arg.threads];
515  de(1,1) = arg.delta[idx + 3 * arg.threads];
516  de(1,2) = arg.delta[idx + 4 * arg.threads];
517  de(2,2) = arg.delta[idx + 5 * arg.threads];
518 
519  de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
520  de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
521  de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
522 
523  setIdentity(&g0);
524  g0 += de * half_alpha;
525  //36
526  reunit_link<Float>( g0 );
527  //130
528 
529  U = U * conj(g0);
530  //198
531  dataOr(mu, id, parity) = U;
532  }
533  }
534 
535 
536  template<typename Float, typename Gauge>
537  class GaugeFixNEW : TunableLocalParity {
538  GaugeFixArg<Float> arg;
539  Float half_alpha;
540  Gauge dataOr;
541  mutable char aux_string[128]; // used as a label in the autotuner
542  private:
543 
544  // since GaugeFixArg is used by other kernels that don't use
545  // tunableLocalParity, arg.threads stores Volume and not VolumeCB
546  // so we need to divide by two
547  unsigned int minThreads() const { return arg.threads/2; }
548 
549  public:
550  GaugeFixNEW(Gauge & dataOr, GaugeFixArg<Float> &arg, Float alpha)
551  : dataOr(dataOr), arg(arg) {
552  half_alpha = alpha * 0.5;
553  cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO_NEW<Float, Gauge>, cudaFuncCachePreferL1);
554  }
555  ~GaugeFixNEW () { }
556 
557  void setAlpha(Float alpha){ half_alpha = alpha * 0.5; }
558 
559  void apply(const cudaStream_t &stream){
560  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
561  kernel_gauge_fix_U_EO_NEW<Float, Gauge> <<< tp.grid, tp.block, 0, stream >>> (arg, dataOr, half_alpha);
562  }
563 
564  TuneKey tuneKey() const {
565  std::stringstream vol;
566  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
567  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
568  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
569 
570  }
571 
572  //need this
573  void preTune() {
574  arg.data.backup();
575  }
576  void postTune() {
577  arg.data.restore();
578  }
579  long long flops() const {
580  return 2414LL * arg.threads;
581  //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
582  }
583  long long bytes() const {
584  return ( dataOr.Bytes() * 4LL + 5 * 12LL * sizeof(Float)) * arg.threads;
585  }
586 
587  };
588 
589 
590 
591 #else
592  template <int Elems, typename Float>
593  __global__ void kernel_gauge_GX(GaugeFixArg<Float> arg, Float half_alpha){
594 
595  int id = blockIdx.x * blockDim.x + threadIdx.x;
596 
597  if ( id >= arg.threads ) return;
598 
599  typedef complex<Float> Cmplx;
600 
601  Matrix<Cmplx,3> de;
602  //Read Delta
603  de(0,0) = arg.delta[id];
604  de(0,1) = arg.delta[id + arg.threads];
605  de(0,2) = arg.delta[id + 2 * arg.threads];
606  de(1,1) = arg.delta[id + 3 * arg.threads];
607  de(1,2) = arg.delta[id + 4 * arg.threads];
608  de(2,2) = arg.delta[id + 5 * arg.threads];
609 
610  de(1,0) = makeComplex(-de(0,1).x, de(0,1).y);
611  de(2,0) = makeComplex(-de(0,2).x, de(0,2).y);
612  de(2,1) = makeComplex(-de(1,2).x, de(1,2).y);
613 
614 
615  Matrix<Cmplx,3> g;
616  setIdentity(&g);
617  g += de * half_alpha;
618  //36
619  reunit_link<Float>( g );
620  //130
621  //gx is represented in even/odd order
622  //normal lattice index to even/odd index
623  int x3 = id / (arg.X[0] * arg.X[1] * arg.X[2]);
624  int x2 = (id / (arg.X[0] * arg.X[1])) % arg.X[2];
625  int x1 = (id / arg.X[0]) % arg.X[1];
626  int x0 = id % arg.X[0];
627  id = (x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0]) >> 1;
628  id += ((x0 + x1 + x2 + x3) & 1 ) * arg.threads / 2;
629 
630  for ( int i = 0; i < Elems; i++ ) arg.gx[id + i * arg.threads] = g.data[i];
631  //T=166 for Elems 9
632  //T=208 for Elems 6
633  }
634 
635 
636 
637 
638  template<int Elems, typename Float>
639  class GaugeFix_GX : Tunable {
640  GaugeFixArg<Float> arg;
641  Float half_alpha;
642  mutable char aux_string[128]; // used as a label in the autotuner
643  private:
644  unsigned int sharedBytesPerThread() const {
645  return 0;
646  }
647  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
648  return 0;
649  }
650  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
651  bool tuneGridDim() const {
652  return false;
653  } // Don't tune the grid dimensions.
654  unsigned int minThreads() const {
655  return arg.threads;
656  }
657 
658  public:
659  GaugeFix_GX(GaugeFixArg<Float> &arg, Float alpha)
660  : arg(arg) {
661  half_alpha = alpha * 0.5;
662  cudaFuncSetCacheConfig( kernel_gauge_GX<Elems, Float>, cudaFuncCachePreferL1);
663  }
664  ~GaugeFix_GX () {
665  }
666 
667  void setAlpha(Float alpha){
668  half_alpha = alpha * 0.5;
669  }
670 
671 
672  void apply(const cudaStream_t &stream){
673  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
674  kernel_gauge_GX<Elems, Float> <<< tp.grid, tp.block, 0, stream >>> (arg, half_alpha);
675  }
676 
677  TuneKey tuneKey() const {
678  std::stringstream vol;
679  vol << arg.X[0] << "x";
680  vol << arg.X[1] << "x";
681  vol << arg.X[2] << "x";
682  vol << arg.X[3];
683  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
684  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
685 
686  }
687 
688  long long flops() const {
689  if ( Elems == 6 ) return 208LL * arg.threads;
690  else return 166LL * arg.threads;
691  }
692  long long bytes() const {
693  return 4LL * Elems * sizeof(Float) * arg.threads;
694  }
695 
696  };
697 
698 
699  template <int Elems, typename Float, typename Gauge>
700  __global__ void kernel_gauge_fix_U_EO( GaugeFixArg<Float> arg, Gauge dataOr){
701  int idd = threadIdx.x + blockIdx.x * blockDim.x;
702 
703  if ( idd >= arg.threads ) return;
704 
705  int parity = 0;
706  int id = idd;
707  if ( idd >= arg.threads / 2 ) {
708  parity = 1;
709  id -= arg.threads / 2;
710  }
711  typedef complex<Float> Cmplx;
712 
713  Matrix<Cmplx,3> g;
714  //for(int i = 0; i < Elems; i++) g.data[i] = arg.gx[idd + i * arg.threads];
715  for ( int i = 0; i < Elems; i++ ) {
716  g.data[i] = arg.gx[idd + i * arg.threads];
717  }
718  if ( Elems == 6 ) {
719  g(2,0) = conj(g(0,1) * g(1,2) - g(0,2) * g(1,1));
720  g(2,1) = conj(g(0,2) * g(1,0) - g(0,0) * g(1,2));
721  g(2,2) = conj(g(0,0) * g(1,1) - g(0,1) * g(1,0));
722  //42
723  }
724  int x[4];
725  getCoords(x, id, arg.X, parity);
726  for ( int mu = 0; mu < 4; mu++ ) {
727  Matrix<Cmplx,3> U = dataOr(mu, id, parity);
728  Matrix<Cmplx,3> g0;
729  U = g * U;
730  //198
731  int idm1 = linkIndexP1(x,arg.X,mu);
732  idm1 += (1 - parity) * arg.threads / 2;
733  //for(int i = 0; i < Elems; i++) g0.data[i] = arg.gx[idm1 + i * arg.threads];
734  for ( int i = 0; i < Elems; i++ ) {
735  g0.data[i] = arg.gx[idm1 + i * arg.threads];
736  }
737  if ( Elems == 6 ) {
738  g0(2,0) = conj(g0(0,1) * g0(1,2) - g0(0,2) * g0(1,1));
739  g0(2,1) = conj(g0(0,2) * g0(1,0) - g0(0,0) * g0(1,2));
740  g0(2,2) = conj(g0(0,0) * g0(1,1) - g0(0,1) * g0(1,0));
741  //42
742  }
743  U = U * conj(g0);
744  //198
745  dataOr.save(mu, id, parity) = U;
746  }
747  //T=42+4*(198*2+42) Elems=6
748  //T=4*(198*2) Elems=9
749  //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
750  }
751 
752 
753  template<int Elems, typename Float, typename Gauge>
754  class GaugeFix : Tunable {
755  GaugeFixArg<Float> arg;
756  Gauge dataOr;
757  mutable char aux_string[128]; // used as a label in the autotuner
758  private:
759  unsigned int sharedBytesPerThread() const {
760  return 0;
761  }
762  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
763  return 0;
764  }
765  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
766  bool tuneGridDim() const {
767  return false;
768  } // Don't tune the grid dimensions.
769  unsigned int minThreads() const {
770  return arg.threads;
771  }
772 
773  public:
774  GaugeFix(Gauge & dataOr, GaugeFixArg<Float> &arg)
775  : dataOr(dataOr), arg(arg) {
776  cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO<Elems, Float, Gauge>, cudaFuncCachePreferL1);
777  }
778  ~GaugeFix () { }
779 
780 
781  void apply(const cudaStream_t &stream){
782  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
783  kernel_gauge_fix_U_EO<Elems, Float, Gauge> <<< tp.grid, tp.block, 0, stream >>> (arg, dataOr);
784  }
785 
786  TuneKey tuneKey() const {
787  std::stringstream vol;
788  vol << arg.X[0] << "x";
789  vol << arg.X[1] << "x";
790  vol << arg.X[2] << "x";
791  vol << arg.X[3];
792  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
793  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
794 
795  }
796 
797  //need this
798  void preTune() {
799  arg.data.backup();
800  }
801  void postTune() {
802  arg.data.restore();
803  }
804  long long flops() const {
805  if ( Elems == 6 ) return 1794LL * arg.threads;
806  else return 1536LL * arg.threads;
807  //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
808  }
809  long long bytes() const {
810  return 26LL * Elems * sizeof(Float) * arg.threads;
811  }
812 
813  };
814 #endif
815 //GAUGEFIXING_DONT_USE_GX
816 
817 
818  template<int Elems, typename Float, typename Gauge, int gauge_dir>
819  void gaugefixingFFT( Gauge dataOr, cudaGaugeField& data, \
820  const int Nsteps, const int verbose_interval, \
821  const Float alpha0, const int autotune, const double tolerance, \
822  const int stopWtheta) {
823 
824  TimeProfile profileInternalGaugeFixFFT("InternalGaugeFixQudaFFT", false);
825 
826  profileInternalGaugeFixFFT.TPSTART(QUDA_PROFILE_COMPUTE);
827 
828  Float alpha = alpha0;
829  std::cout << "\tAlpha parameter of the Steepest Descent Method: " << alpha << std::endl;
830  if ( autotune ) std::cout << "\tAuto tune active: yes" << std::endl;
831  else std::cout << "\tAuto tune active: no" << std::endl;
832  std::cout << "\tStop criterium: " << tolerance << std::endl;
833  if ( stopWtheta ) std::cout << "\tStop criterium method: theta" << std::endl;
834  else std::cout << "\tStop criterium method: Delta" << std::endl;
835  std::cout << "\tMaximum number of iterations: " << Nsteps << std::endl;
836  std::cout << "\tPrint convergence results at every " << verbose_interval << " steps" << std::endl;
837 
838 
839  unsigned int delta_pad = data.X()[0] * data.X()[1] * data.X()[2] * data.X()[3];
840  int4 size = make_int4( data.X()[0], data.X()[1], data.X()[2], data.X()[3] );
841  cufftHandle plan_xy;
842  cufftHandle plan_zt;
843 
844  GaugeFixArg<Float> arg(data, Elems);
845  SetPlanFFT2DMany( plan_zt, size, 0, arg.delta); //for space and time ZT
846  SetPlanFFT2DMany( plan_xy, size, 1, arg.delta); //with space only XY
847 
848 
849  GaugeFixFFTRotateArg<Float> arg_rotate(data);
850  GaugeFixFFTRotate<Float> GFRotate(arg_rotate);
851 
852  GaugeFixSETINVPSP<Float> setinvpsp(arg);
853  setinvpsp.apply(0);
854  GaugeFixINVPSP<Float> invpsp(arg);
855 
856 
857 #ifdef GAUGEFIXING_DONT_USE_GX
858  //without using GX, gx will be created only for plane rotation but with less size
859  GaugeFixNEW<Float, Gauge> gfixNew(dataOr, arg, alpha);
860 #else
861  //using GX
862  GaugeFix_GX<Elems, Float> calcGX(arg, alpha);
863  GaugeFix<Elems, Float, Gauge> gfix(dataOr, arg);
864 #endif
865 
866  GaugeFixQualityArg<Float, Gauge> argQ(dataOr, data, arg.delta);
867  GaugeFixQuality<Elems, Float, Gauge, gauge_dir> gfixquality(argQ);
868 
869  gfixquality.apply(0);
870  double action0 = argQ.getAction();
871  printf("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
872 
873  double diff = 0.0;
874  int iter = 0;
875  for ( iter = 0; iter < Nsteps; iter++ ) {
876  for ( int k = 0; k < 6; k++ ) {
877  //------------------------------------------------------------------------
878  // Set a pointer do the element k in lattice volume
879  // each element is stored with stride lattice volume
880  // it uses gx as temporary array!!!!!!
881  //------------------------------------------------------------------------
882  complex<Float> *_array = arg.delta + k * delta_pad;
884  //------------------------------------------------------------------------
885  // Perform FFT on xy plane
886  //------------------------------------------------------------------------
887  ApplyFFT(plan_xy, _array, arg.gx, CUFFT_FORWARD);
888  //------------------------------------------------------------------------
889  // Rotate hypercube, xyzt -> ztxy
890  //------------------------------------------------------------------------
891  GFRotate.setDirection(0, arg.gx, _array);
892  GFRotate.apply(0);
893  //------------------------------------------------------------------------
894  // Perform FFT on zt plane
895  //------------------------------------------------------------------------
896  ApplyFFT(plan_zt, _array, arg.gx, CUFFT_FORWARD);
897  //------------------------------------------------------------------------
898  // Normalize FFT and apply pmax^2/p^2
899  //------------------------------------------------------------------------
900  invpsp.apply(0);
901  //------------------------------------------------------------------------
902  // Perform IFFT on zt plane
903  //------------------------------------------------------------------------
904  ApplyFFT(plan_zt, arg.gx, _array, CUFFT_INVERSE);
905  //------------------------------------------------------------------------
906  // Rotate hypercube, ztxy -> xyzt
907  //------------------------------------------------------------------------
908  GFRotate.setDirection(1, _array, arg.gx);
909  GFRotate.apply(0);
910  //------------------------------------------------------------------------
911  // Perform IFFT on xy plane
912  //------------------------------------------------------------------------
913  ApplyFFT(plan_xy, arg.gx, _array, CUFFT_INVERSE);
914  }
915  #ifdef GAUGEFIXING_DONT_USE_GX
916  //------------------------------------------------------------------------
917  // Apply gauge fix to current gauge field
918  //------------------------------------------------------------------------
919  gfixNew.apply(0);
920  #else
921  //------------------------------------------------------------------------
922  // Calculate g(x)
923  //------------------------------------------------------------------------
924  calcGX.apply(0);
925  //------------------------------------------------------------------------
926  // Apply gauge fix to current gauge field
927  //------------------------------------------------------------------------
928  gfix.apply(0);
929  #endif
930  //------------------------------------------------------------------------
931  // Measure gauge quality and recalculate new Delta(x)
932  //------------------------------------------------------------------------
933  gfixquality.apply(0);
934  double action = argQ.getAction();
935  diff = abs(action0 - action);
936  if ((iter % verbose_interval) == (verbose_interval - 1))
937  printf("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
938  if ( autotune && ((action - action0) < -1e-14) ) {
939  if ( alpha > 0.01 ) {
940  alpha = 0.95 * alpha;
941  #ifdef GAUGEFIXING_DONT_USE_GX
942  gfixNew.setAlpha(alpha);
943  #else
944  calcGX.setAlpha(alpha);
945  #endif
946  printf(">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", alpha );
947  }
948  }
949  //------------------------------------------------------------------------
950  // Check gauge fix quality criterium
951  //------------------------------------------------------------------------
952  if ( stopWtheta ) { if ( argQ.getTheta() < tolerance ) break; }
953  else { if ( diff < tolerance ) break; }
954 
955  action0 = action;
956  }
957  if ((iter % verbose_interval) != 0 )
958  printf("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter, argQ.getAction(), argQ.getTheta(), diff);
959 
960  // Reunitarize at end
961  const double unitarize_eps = 1e-14;
962  const double max_error = 1e-10;
963  const int reunit_allow_svd = 1;
964  const int reunit_svd_only = 0;
965  const double svd_rel_error = 1e-6;
966  const double svd_abs_error = 1e-6;
967  setUnitarizeLinksConstants(unitarize_eps, max_error,
968  reunit_allow_svd, reunit_svd_only,
969  svd_rel_error, svd_abs_error);
970  int num_failures = 0;
971  int* num_failures_dev = static_cast<int*>(pool_device_malloc(sizeof(int)));
972  cudaMemset(num_failures_dev, 0, sizeof(int));
973  unitarizeLinks(data, data, num_failures_dev);
974  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
975 
976  pool_device_free(num_failures_dev);
977  if ( num_failures > 0 ) {
978  errorQuda("Error in the unitarization\n");
979  exit(1);
980  }
981  // end reunitarize
982 
983 
984  arg.free();
985  CUFFT_SAFE_CALL(cufftDestroy(plan_zt));
986  CUFFT_SAFE_CALL(cufftDestroy(plan_xy));
987  checkCudaError();
989  profileInternalGaugeFixFFT.TPSTOP(QUDA_PROFILE_COMPUTE);
990 
991  if (getVerbosity() > QUDA_SUMMARIZE){
992  double secs = profileInternalGaugeFixFFT.Last(QUDA_PROFILE_COMPUTE);
993  double fftflop = 5.0 * (log2((double)( data.X()[0] * data.X()[1]) ) + log2( (double)(data.X()[2] * data.X()[3] )));
994  fftflop *= (double)( data.X()[0] * data.X()[1] * data.X()[2] * data.X()[3] );
995  double gflops = setinvpsp.flops() + gfixquality.flops();
996  double gbytes = setinvpsp.bytes() + gfixquality.bytes();
997  double flop = invpsp.flops() * Elems;
998  double byte = invpsp.bytes() * Elems;
999  flop += (GFRotate.flops() + fftflop) * Elems * 2;
1000  byte += GFRotate.bytes() * Elems * 4; //includes FFT reads, assuming 1 read and 1 write per site
1001  #ifdef GAUGEFIXING_DONT_USE_GX
1002  flop += gfixNew.flops();
1003  byte += gfixNew.bytes();
1004  #else
1005  flop += calcGX.flops();
1006  byte += calcGX.bytes();
1007  flop += gfix.flops();
1008  byte += gfix.bytes();
1009  #endif
1010  flop += gfixquality.flops();
1011  byte += gfixquality.bytes();
1012  gflops += flop * iter;
1013  gbytes += byte * iter;
1014  gflops += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3]; //Reunitarize at end
1015  gbytes += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes() ; //Reunitarize at end
1016 
1017  gflops = (gflops * 1e-9) / (secs);
1018  gbytes = gbytes / (secs * 1e9);
1019  printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1020  }
1021  }
1022 
1023  template<int Elems, typename Float, typename Gauge>
1024  void gaugefixingFFT( Gauge dataOr, cudaGaugeField& data, const int gauge_dir, \
1025  const int Nsteps, const int verbose_interval, const Float alpha, const int autotune, \
1026  const double tolerance, const int stopWtheta) {
1027  if ( gauge_dir != 3 ) {
1028  printf("Starting Landau gauge fixing with FFTs...\n");
1029  gaugefixingFFT<Elems, Float, Gauge, 4>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1030  }
1031  else {
1032  printf("Starting Coulomb gauge fixing with FFTs...\n");
1033  gaugefixingFFT<Elems, Float, Gauge, 3>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1034  }
1035  }
1036 
1037 
1038 
1039  template<typename Float>
1040  void gaugefixingFFT( cudaGaugeField& data, const int gauge_dir, \
1041  const int Nsteps, const int verbose_interval, const Float alpha, const int autotune, \
1042  const double tolerance, const int stopWtheta) {
1043 
1044  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
1045  // Need to fix this!!
1046  //9 and 6 means the number of complex elements used to store g(x) and Delta(x)
1047  if ( data.isNative() ) {
1048  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
1049  //printfQuda("QUDA_RECONSTRUCT_NO\n");
1050  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
1051  gaugefixingFFT<9, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1052  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
1053  //printfQuda("QUDA_RECONSTRUCT_12\n");
1054  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
1055  gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1056  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
1057  //printfQuda("QUDA_RECONSTRUCT_8\n");
1058  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
1059  gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1060 
1061  } else {
1062  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
1063  }
1064  } else {
1065  errorQuda("Invalid Gauge Order\n");
1066  }
1067  }
1068 
1069 #endif // GPU_GAUGE_ALG
1070 
1071 
1083  void gaugefixingFFT( cudaGaugeField& data, const int gauge_dir, \
1084  const int Nsteps, const int verbose_interval, const double alpha, const int autotune, \
1085  const double tolerance, const int stopWtheta) {
1086 
1087 #ifdef GPU_GAUGE_ALG
1088 #ifdef MULTI_GPU
1090  errorQuda("Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!\n");
1091 #endif
1092  if ( data.Precision() == QUDA_HALF_PRECISION ) {
1093  errorQuda("Half precision not supported\n");
1094  }
1095  if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
1096  gaugefixingFFT<float> (data, gauge_dir, Nsteps, verbose_interval, (float)alpha, autotune, tolerance, stopWtheta);
1097  } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
1098  gaugefixingFFT<double>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1099  } else {
1100  errorQuda("Precision %d not supported", data.Precision());
1101  }
1102 #else
1103  errorQuda("Gauge fixing has bot been built");
1104 #endif
1105  }
1106 
1107 
1108 
1109 }
static bool reunit_allow_svd
static __device__ __host__ int getIndexFull(int cb_index, const I X[4], int parity)
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:33
cudaColorSpinorField * tmp1
double mu
Definition: test_util.cpp:1648
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:702
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
int * num_failures_dev
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, float2 *data)
Creates a CUFFT plan supporting 4D (2D+2D) data layouts for single-precision complex-to-complex.
Definition: CUFFT_Plans.h:96
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:1131
int num_failures
QudaGaugeParam param
Definition: pack_test.cpp:17
static double svd_rel_error
#define qudaDeviceSynchronize()
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
static bool reunit_svd_only
T data[N *N]
Definition: quda_matrix.h:72
#define pool_device_malloc(size)
Definition: malloc_quda.h:125
constexpr int size
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
static double svd_abs_error
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:1125
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:653
void ApplyFFT(cufftHandle &plan, float2 *data_in, float2 *data_out, int direction)
Call CUFFT to perform a single-precision complex-to-complex transform plan in the transform direction...
Definition: CUFFT_Plans.h:29
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
static double unitarize_eps
#define printfQuda(...)
Definition: util_quda.h:115
int VolumeCB() const
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define device_malloc(size)
Definition: malloc_quda.h:64
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
#define pool_device_free(ptr)
Definition: malloc_quda.h:126
#define checkCudaError()
Definition: util_quda.h:161
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:54
static __device__ __host__ int linkNormalIndexP1(const int x[], const I X[4], const int mu)
#define CUFFT_SAFE_CALL(call)
Definition: CUFFT_Plans.h:10
unsigned long long bytes
Definition: blas_quda.cu:23
int comm_dim_partitioned(int dim)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
const int * X() const
#define device_free(ptr)
Definition: malloc_quda.h:69