QUDA  0.9.0
unitarize_links_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <iostream>
4 #include <iomanip>
5 #include <cuda.h>
6 #include <gauge_field.h>
7 #include <gauge_field_order.h>
8 
9 #include <tune_quda.h>
10 #include <quda_matrix.h>
11 #include <unitarization_links.h>
12 
13 #include <su3_project.cuh>
14 #include <index_helper.cuh>
15 
16 
17 namespace quda{
18 #ifdef GPU_UNITARIZE
19 
20 namespace{
21  #include <svd_quda.h>
22 }
23 
24 #ifndef FL_UNITARIZE_PI
25 #define FL_UNITARIZE_PI 3.14159265358979323846
26 #endif
27 #ifndef FL_UNITARIZE_PI23
28 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*0.66666666666666666666
29 #endif
30 
31  static const int max_iter_newton = 20;
32  static const int max_iter = 20;
33 
34  static double unitarize_eps = 1e-14;
35  static double max_error = 1e-10;
36  static int reunit_allow_svd = 1;
37  static int reunit_svd_only = 0;
38  static double svd_rel_error = 1e-6;
39  static double svd_abs_error = 1e-6;
40 
41  template <typename Out, typename In>
42  struct UnitarizeLinksArg {
43  int threads; // number of active threads required
44  int X[4]; // grid dimensions
45  Out output;
46  const In input;
47  int *fails;
48  const int max_iter;
49  const double unitarize_eps;
50  const double max_error;
51  const int reunit_allow_svd;
52  const int reunit_svd_only;
53  const double svd_rel_error;
54  const double svd_abs_error;
55  const static bool check_unitarization = true;
56 
57  UnitarizeLinksArg(Out &output, const In &input, const GaugeField &data, int* fails,
58  int max_iter, double unitarize_eps, double max_error,
60  double svd_abs_error)
61  : output(output), input(input), fails(fails), unitarize_eps(unitarize_eps),
62  max_iter(max_iter), max_error(max_error), reunit_allow_svd(reunit_allow_svd),
65  {
66  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
67  threads = X[0]*X[1]*X[2]*X[3];
68  }
69  };
70 
71  void setUnitarizeLinksConstants(double unitarize_eps_, double max_error_,
72  bool reunit_allow_svd_, bool reunit_svd_only_,
73  double svd_rel_error_, double svd_abs_error_) {
74  unitarize_eps = unitarize_eps_;
75  max_error = max_error_;
76  reunit_allow_svd = reunit_allow_svd_;
77  reunit_svd_only = reunit_svd_only_;
78  svd_rel_error = svd_rel_error_;
79  svd_abs_error = svd_abs_error_;
80  }
81 
82 
83  template<class Cmplx>
84  __device__ __host__
85  bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
86  const Matrix<Cmplx,3>& unitary_matrix,
87  double max_error)
88  {
89  Matrix<Cmplx,3> temporary;
90  temporary = conj(initial_matrix)*unitary_matrix;
91  temporary = temporary*temporary - conj(initial_matrix)*initial_matrix;
92 
93  for(int i=0; i<3; ++i){
94  for(int j=0; j<3; ++j){
95  if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
96  return false;
97  }
98  }
99  }
100  return true;
101  }
102 
103 
104  template<class T>
105  __device__ __host__
106  T getAbsMin(const T* const array, int size){
107  T min = fabs(array[0]);
108  for(int i=1; i<size; ++i){
109  T abs_val = fabs(array[i]);
110  if((abs_val) < min){ min = abs_val; }
111  }
112  return min;
113  }
114 
115 
116  template<class Real>
117  __device__ __host__
118  inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
119  {
120  if( fabs(a-b) < epsilon) return true;
121  return false;
122  }
123 
124 
125  template<class Real>
126  __device__ __host__
127  inline bool checkRelativeError(Real a, Real b, Real epsilon)
128  {
129  if( fabs((a-b)/b) < epsilon ) return true;
130  return false;
131  }
132 
133 
134 
135  // Compute the reciprocal square root of the matrix q
136  // Also modify q if the eigenvalues are dangerously small.
137  template<class Float, typename Arg>
138  __device__ __host__
139  bool reciprocalRoot(const Matrix<complex<Float>,3>& q, Matrix<complex<Float>,3>* res, Arg &arg){
140 
141  Matrix<complex<Float>,3> qsq, tempq;
142 
143  Float c[3];
144  Float g[3];
145 
146  const Float one_third = 0.333333333333333333333;
147  const Float one_ninth = 0.111111111111111111111;
148  const Float one_eighteenth = 0.055555555555555555555;
149 
150  qsq = q*q;
151  tempq = qsq*q;
152 
153  c[0] = getTrace(q).x;
154  c[1] = getTrace(qsq).x * 0.5;
155  c[2] = getTrace(tempq).x * one_third;;
156 
157  g[0] = g[1] = g[2] = c[0] * one_third;
158  Float r,s,theta;
159  s = c[1]*one_third - c[0]*c[0]*one_eighteenth;
160 
161  Float cosTheta;
162  if(fabs(s) >= arg.unitarize_eps){ // faster when this conditional is removed?
163  const Float rsqrt_s = rsqrt(s);
164  r = c[2]*0.5 - (c[0]*one_third)*(c[1] - c[0]*c[0]*one_ninth);
165  cosTheta = r*rsqrt_s*rsqrt_s*rsqrt_s;
166 
167  if(fabs(cosTheta) >= 1.0){
168  theta = (r > 0) ? 0.0 : FL_UNITARIZE_PI;
169  }else{
170  theta = acos(cosTheta); // this is the primary performance limiter
171  }
172 
173  const Float sqrt_s = s*rsqrt_s;
174 
175 #if 0 // experimental version
176  Float as, ac;
177  sincos( theta*one_third, &as, &ac );
178  g[0] = c[0]*one_third + 2*sqrt_s*ac;
179  //g[1] = c[0]*one_third + 2*sqrt_s*(ac*cos(1*FL_UNITARIZE_PI23) - as*sin(1*FL_UNITARIZE_PI23));
180  g[1] = c[0]*one_third - 2*sqrt_s*(0.5*ac + as*0.8660254037844386467637);
181  //g[2] = c[0]*one_third + 2*sqrt_s*(ac*cos(2*FL_UNITARIZE_PI23) - as*sin(2*FL_UNITARIZE_PI23));
182  g[2] = c[0]*one_third + 2*sqrt_s*(-0.5*ac + as*0.8660254037844386467637);
183 #else
184  g[0] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third );
185  g[1] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + FL_UNITARIZE_PI23 );
186  g[2] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + 2*FL_UNITARIZE_PI23 );
187 #endif
188  }
189 
190  // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
191  // return false. Then call SVD instead.
192  Float det = getDeterminant(q).x;
193  if( fabs(det) < arg.svd_abs_error) return false;
194  if( checkRelativeError(g[0]*g[1]*g[2],det,arg.svd_rel_error) == false ) return false;
195 
196 
197  // At this point we have finished with the c's
198  // use these to store sqrt(g)
199  for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
200 
201  // done with the g's, use these to store u, v, w
202  g[0] = c[0]+c[1]+c[2];
203  g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
204  g[2] = c[0]*c[1]*c[2];
205 
206  const Float denominator = 1.0 / ( g[2]*(g[0]*g[1]-g[2]) );
207  c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1])) * denominator;
208  c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1]) * denominator;
209  c[2] = g[0] * denominator;
210 
211  tempq = c[1]*q + c[2]*qsq;
212  // Add a real scalar
213  tempq(0,0).x += c[0];
214  tempq(1,1).x += c[0];
215  tempq(2,2).x += c[0];
216 
217  *res = tempq;
218 
219  return true;
220  }
221 
222 
223 
224 
225  template<class Float, typename Arg>
226  __host__ __device__
227  bool unitarizeLinkMILC(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, Arg &arg)
228  {
230  if( !arg.reunit_svd_only ){
231  if( reciprocalRoot<Float>(conj(in)*in,&u,arg) ){
232  *result = in*u;
233  return true;
234  }
235  }
236 
237  // If we've got this far, then the Caley-Hamilton unitarization
238  // has failed. If SVD is not allowed, the unitarization has failed.
239  if( !arg.reunit_allow_svd ) return false;
240 
242  Float singular_values[3];
243  computeSVD<Float>(in, u, v, singular_values);
244  *result = u*conj(v);
245  return true;
246  } // unitarizeMILC
247 
248 
249  template<class Float>
250  __host__ __device__
251  bool unitarizeLinkSVD(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result,
252  const double max_error)
253  {
254  Matrix<complex<Float>,3> u, v;
255  Float singular_values[3];
256  computeSVD<Float>(in, u, v, singular_values); // should pass pointers to u,v I guess
257 
258  *result = u*conj(v);
259 
260  if (isUnitary(*result,max_error)==false)
261  {
262  printf("ERROR: Link unitarity test failed\n");
263  printf("TOLERANCE: %g\n", max_error);
264  return false;
265  }
266  return true;
267  }
268 
269 
270  template<class Float>
271  __host__ __device__
272  bool unitarizeLinkNewton(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, int max_iter)
273  {
274  Matrix<complex<Float>,3> u, uinv;
275  u = in;
276 
277  for(int i=0; i<max_iter; ++i){
278  computeMatrixInverse(u, &uinv);
279  u = 0.5*(u + conj(uinv));
280  }
281 
282  if(isUnitarizedLinkConsistent(in,u,0.0000001)==false)
283  {
284  printf("ERROR: Unitarized link is not consistent with incoming link\n");
285  return false;
286  }
287  *result = u;
288 
289  return true;
290  }
291 
292  void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField& infield)
293  {
294  if (infield.Precision() != outfield.Precision())
295  errorQuda("Precisions must match (out=%d != in=%d)", outfield.Precision(), infield.Precision());
296 
297  int num_failures = 0;
298  Matrix<complex<double>,3> inlink, outlink;
299 
300  for (int i=0; i<infield.Volume(); ++i){
301  for (int dir=0; dir<4; ++dir){
302  if (infield.Precision() == QUDA_SINGLE_PRECISION){
303  copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
304  if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ) num_failures++;
305  copyLinkToArray(((float*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
306  } else if (infield.Precision() == QUDA_DOUBLE_PRECISION){
307  copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
308  if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ) num_failures++;
309  copyLinkToArray(((double*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
310  } // precision?
311  } // dir
312  } // loop over volume
313  return;
314  }
315 
316  // CPU function which checks that the gauge field is unitary
317  bool isUnitary(const cpuGaugeField& field, double max_error)
318  {
319  Matrix<complex<double>,3> link, identity;
320 
321  for(int i=0; i<field.Volume(); ++i){
322  for(int dir=0; dir<4; ++dir){
323  if(field.Precision() == QUDA_SINGLE_PRECISION){
324  copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
325  }else if(field.Precision() == QUDA_DOUBLE_PRECISION){
326  copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
327  }else{
328  errorQuda("Unsupported precision\n");
329  }
330  if(isUnitary(link,max_error) == false){
331  printf("Unitarity failure\n");
332  printf("site index = %d,\t direction = %d\n", i, dir);
333  printLink(link);
334  identity = conj(link)*link;
335  printLink(identity);
336  return false;
337  }
338  } // dir
339  } // i
340  return true;
341  } // is unitary
342 
343 
344  template<typename Float, typename Out, typename In>
345  __global__ void DoUnitarizedLink(UnitarizeLinksArg<Out,In> arg){
346  int idx = threadIdx.x + blockIdx.x*blockDim.x;
347  if(idx >= arg.threads) return;
348  int parity = 0;
349  if(idx >= arg.threads/2) {
350  parity = 1;
351  idx -= arg.threads/2;
352  }
353  int X[4];
354  for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
355  int x[4];
356  getCoords(x, idx, X, parity);
357 
358  idx = linkIndex(x,X);
359  // result is always in double precision
360  Matrix<complex<double>,3> v, result;
362  for (int mu = 0; mu < 4; mu++) {
363  arg.input.load((Float*)(tmp.data),idx, mu, parity);
364 
365  v = tmp;
366  unitarizeLinkMILC(v, &result, arg);
367  if (arg.check_unitarization) {
368  if (isUnitary(result,arg.max_error) == false) atomicAdd(arg.fails, 1);
369  }
370  //WRITE BACK IF FAIL??????????
371  tmp = result;
372 
373  arg.output.save((Float*)(tmp.data),idx, mu, parity);
374  }
375  }
376 
377 
378 
379  template<typename Float, typename Out, typename In>
380  class UnitarizeLinks : Tunable {
381  UnitarizeLinksArg<Out,In> arg;
382 
383  unsigned int sharedBytesPerThread() const { return 0; }
384  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
385 
386  // don't tune the grid dimension
387  bool tuneGridDim() const { return false; }
388  unsigned int minThreads() const { return arg.threads; }
389 
390  public:
391  UnitarizeLinks(UnitarizeLinksArg<Out,In> &arg) : arg(arg) { }
392 
393 
394  void apply(const cudaStream_t &stream){
395  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
396  DoUnitarizedLink<Float,Out,In><<<tp.grid, tp.block, 0, stream>>>(arg);
397  }
398  void preTune() { if (arg.input.gauge == arg.output.gauge) arg.output.save(); }
399  void postTune() {
400  if (arg.input.gauge == arg.output.gauge) arg.output.load();
401  cudaMemset(arg.fails, 0, sizeof(int)); // reset fails counter
402  }
403 
404  long long flops() const {
405  // Accounted only the minimum flops for the case reunitarize_svd_only=0
406  return 4588LL*arg.threads;
407  }
408  long long bytes() const { return 4ll * arg.threads * (arg.input.Bytes() + arg.output.Bytes()); }
409 
410  TuneKey tuneKey() const {
411  std::stringstream vol, aux;
412  vol << arg.X[0] << "x";
413  vol << arg.X[1] << "x";
414  vol << arg.X[2] << "x";
415  vol << arg.X[3];
416  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
417  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
418  }
419  };
420 
421 
422  template<typename Float, typename Out, typename In>
423  void unitarizeLinks(Out output, const In input, const cudaGaugeField& meta, int* fails) {
424  UnitarizeLinksArg<Out,In> arg(output, input, meta, fails, max_iter, unitarize_eps, max_error,
426  UnitarizeLinks<Float, Out, In> unitlinks(arg) ;
427  unitlinks.apply(0);
428  qudaDeviceSynchronize(); // need to synchronize to ensure failure write has completed
429  }
430 
431 template<typename Float>
432 void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
433 
434  if( output.isNative() && input.isNative() ) {
435  if(output.Reconstruct() == QUDA_RECONSTRUCT_NO) {
436  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Out;
437 
438  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
439  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
440  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
441  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
442  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
443  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
444  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
445  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
446  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
447  } else {
448  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
449  }
450 
451  } else if(output.Reconstruct() == QUDA_RECONSTRUCT_12){
452  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Out;
453 
454  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
455  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
456  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
457  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
458  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
459  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
460  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
461  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
462  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
463  } else {
464  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
465  }
466 
467 
468  } else if(output.Reconstruct() == QUDA_RECONSTRUCT_8){
469  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Out;
470 
471  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
472  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type In;
473  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
474  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
475  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type In;
476  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
477  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
478  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type In;
479  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
480  } else {
481  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
482  }
483 
484 
485  } else {
486  errorQuda("Reconstruction type %d of gauge field not supported", output.Reconstruct());
487  }
488  } else {
489  errorQuda("Invalid Gauge Order (output=%d, input=%d)", output.Order(), input.Order());
490  }
491 }
492 
493 #endif
494 
495  void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
496 #ifdef GPU_UNITARIZE
497  if (input.Precision() != output.Precision())
498  errorQuda("input (%d) and output (%d) precisions must match", output.Precision(), input.Precision());
499 
500  if (input.Precision() == QUDA_SINGLE_PRECISION) {
501  unitarizeLinks<float>(output, input, fails);
502  } else if(input.Precision() == QUDA_DOUBLE_PRECISION) {
503  unitarizeLinks<double>(output, input, fails);
504  } else {
505  errorQuda("Precision %d not supported", input.Precision());
506  }
507 #else
508  errorQuda("Unitarization has not been built");
509 #endif
510  }
511 
512  void unitarizeLinks(cudaGaugeField &links, int* fails) {
513  unitarizeLinks(links, links, fails);
514  }
515 
516 
517  template <typename Float, typename G>
518  struct ProjectSU3Arg {
519  int threads; // number of active threads required
520  G u;
521  Float tol;
522  int *fails;
523  int X[4];
524  ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
525  : u(u), tol(tol), fails(fails) {
526  for(int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
527  threads = meta.VolumeCB();
528  }
529  };
530 
531  template<typename Float, typename G>
533  int idx = threadIdx.x + blockIdx.x*blockDim.x;
534  int parity = blockIdx.y;
535  if(idx >= arg.threads) return;
536 
538 
539  for (int mu = 0; mu < 4; mu++) {
540  arg.u.load((Float*)(u.data),idx, mu, parity);
541  polarSu3<Float>(u, arg.tol);
542 
543  // count number of failures
544  if(isUnitary(u, arg.tol) == false) atomicAdd(arg.fails, 1);
545 
546  arg.u.save((Float*)(u.data),idx, mu, parity);
547  }
548  }
549 
550  template<typename Float, typename G>
551  class ProjectSU3 : Tunable {
553 
554  unsigned int sharedBytesPerThread() const { return 0; }
555  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
556 
557  // don't tune the grid dimension
558  bool tuneGridDim() const { return false; }
559  unsigned int minThreads() const { return arg.threads; }
560 
561  public:
563 
564  void apply(const cudaStream_t &stream){
565  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
566  ProjectSU3kernel<Float,G><<<tp.grid, tp.block, 0, stream>>>(arg);
567  }
568  void preTune() { arg.u.save(); }
569  void postTune() { arg.u.load(); }
570 
571  long long flops() const { return 0; } // depends on number of iterations
572  long long bytes() const { return 4ll * arg.threads * arg.u.Bytes(); }
573 
574  TuneKey tuneKey() const {
575  std::stringstream vol, aux;
576  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
577  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
578  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
579  }
580  };
581 
582 
583  template <typename Float>
584  void projectSU3(cudaGaugeField &u, double tol, int *fails) {
585  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
587  ProjectSU3Arg<Float,G> arg(G(u), u, static_cast<Float>(tol), fails);
588  ProjectSU3<Float,G> project(arg);
589  project.apply(0);
591  checkCudaError();
592  } else {
593  errorQuda("Reconstruct %d not supported", u.Reconstruct());
594  }
595  }
596 
597  void projectSU3(cudaGaugeField &u, double tol, int *fails) {
598 #ifdef GPU_UNITARIZE
599  // check the the field doesn't have staggered phases applied
600  if (u.StaggeredPhaseApplied())
601  errorQuda("Cannot project gauge field with staggered phases applied");
602 
603  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
604  projectSU3<double>(u, tol, fails);
605  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
606  projectSU3<float>(u, tol, fails);
607  } else {
608  errorQuda("Precision %d not supported", u.Precision());
609  }
610 #else
611  errorQuda("Unitarization has not been built");
612 #endif
613  }
614 
615 } // namespace quda
616 
unsigned int sharedBytesPerThread() const
dim3 dim3 blockDim
void apply(const cudaStream_t &stream)
double mu
Definition: test_util.cpp:1643
unsigned int minThreads() const
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
TuneKey tuneKey() const
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
void * links[4]
Definition: covdev_test.cpp:47
ProjectSU3(ProjectSU3Arg< Float, G > &arg)
cudaStream_t * stream
__global__ void ProjectSU3kernel(ProjectSU3Arg< Float, G > arg)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
ProjectSU3Arg< Float, G > arg
long long bytes() const
int num_failures
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1039
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:978
bool isUnitary(const cpuGaugeField &field, double max_error)
#define b
double tol
Definition: test_util.cpp:1647
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
int printf(const char *,...) __attribute__((__format__(__printf__
cpuColorSpinorField * in
T data[N *N]
Definition: quda_matrix.h:74
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:951
long long flops() const
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)
int VolumeCB() const
double fabs(double)
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:501
const void * c
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:50
#define checkCudaError()
Definition: util_quda.h:129
struct cudaExtent unsigned int cudaArray_t array
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:312
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
unsigned long long bytes
Definition: blas_quda.cu:43
const int * X() const
unsigned int sharedBytesPerBlock(const TuneParam &) const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)