QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
unitarize_links_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <iostream>
4 #include <iomanip>
5 #include <cuda.h>
6 #include <gauge_field.h>
7 #include <gauge_field_order.h>
8 
9 #include <tune_quda.h>
10 #include <quda_matrix.h>
11 #include <unitarization_links.h>
12 
13 #include <su3_project.cuh>
14 #include <index_helper.cuh>
15 
16 
17 namespace quda{
18 #ifdef GPU_UNITARIZE
19 
20 namespace{
21  #include <svd_quda.h>
22 }
23 
24 #ifndef FL_UNITARIZE_PI
25 #define FL_UNITARIZE_PI 3.14159265358979323846
26 #endif
27 #ifndef FL_UNITARIZE_PI23
28 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*0.66666666666666666666
29 #endif
30 
31  static const int max_iter_newton = 20;
32  static const int max_iter = 20;
33 
34  static double unitarize_eps = 1e-14;
35  static double max_error = 1e-10;
36  static int reunit_allow_svd = 1;
37  static int reunit_svd_only = 0;
38  static double svd_rel_error = 1e-6;
39  static double svd_abs_error = 1e-6;
40 
41  template <typename Out, typename In>
42  struct UnitarizeLinksArg {
43  int threads; // number of active threads required
44  int X[4]; // grid dimensions
45  Out output;
46  const In input;
47  int *fails;
48  const int max_iter;
49  const double unitarize_eps;
50  const double max_error;
51  const int reunit_allow_svd;
52  const int reunit_svd_only;
53  const double svd_rel_error;
54  const double svd_abs_error;
55  const static bool check_unitarization = true;
56 
57  UnitarizeLinksArg(Out &output, const In &input, const GaugeField &data, int* fails,
58  int max_iter, double unitarize_eps, double max_error,
59  int reunit_allow_svd, int reunit_svd_only, double svd_rel_error,
60  double svd_abs_error)
61  : threads(data.VolumeCB()), output(output), input(input), fails(fails), unitarize_eps(unitarize_eps),
62  max_iter(max_iter), max_error(max_error), reunit_allow_svd(reunit_allow_svd),
63  reunit_svd_only(reunit_svd_only), svd_rel_error(svd_rel_error),
64  svd_abs_error(svd_abs_error)
65  {
66  for (int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
67  }
68  };
69 
70 #endif // GPU_UNITARIZE
71 
72  void setUnitarizeLinksConstants(double unitarize_eps_, double max_error_,
73  bool reunit_allow_svd_, bool reunit_svd_only_,
74  double svd_rel_error_, double svd_abs_error_) {
75 #ifdef GPU_UNITARIZE
76  unitarize_eps = unitarize_eps_;
77  max_error = max_error_;
78  reunit_allow_svd = reunit_allow_svd_;
79  reunit_svd_only = reunit_svd_only_;
80  svd_rel_error = svd_rel_error_;
81  svd_abs_error = svd_abs_error_;
82 #else
83  errorQuda("Unitarization has not been built");
84 #endif
85  }
86 
87 #ifdef GPU_UNITARIZE
88  template<class Cmplx>
89  __device__ __host__
90  bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
91  const Matrix<Cmplx,3>& unitary_matrix,
92  double max_error)
93  {
94  Matrix<Cmplx,3> temporary;
95  temporary = conj(initial_matrix)*unitary_matrix;
96  temporary = temporary*temporary - conj(initial_matrix)*initial_matrix;
97 
98  for(int i=0; i<3; ++i){
99  for(int j=0; j<3; ++j){
100  if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
101  return false;
102  }
103  }
104  }
105  return true;
106  }
107 
108 
109  template<class T>
110  __device__ __host__
111  T getAbsMin(const T* const array, int size){
112  T min = fabs(array[0]);
113  for(int i=1; i<size; ++i){
114  T abs_val = fabs(array[i]);
115  if((abs_val) < min){ min = abs_val; }
116  }
117  return min;
118  }
119 
120 
121  template<class Real>
122  __device__ __host__
123  inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
124  {
125  if( fabs(a-b) < epsilon) return true;
126  return false;
127  }
128 
129 
130  template<class Real>
131  __device__ __host__
132  inline bool checkRelativeError(Real a, Real b, Real epsilon)
133  {
134  if( fabs((a-b)/b) < epsilon ) return true;
135  return false;
136  }
137 
138 
139 
140  // Compute the reciprocal square root of the matrix q
141  // Also modify q if the eigenvalues are dangerously small.
142  template<class Float, typename Arg>
143  __device__ __host__
144  bool reciprocalRoot(const Matrix<complex<Float>,3>& q, Matrix<complex<Float>,3>* res, Arg &arg){
145 
146  Matrix<complex<Float>,3> qsq, tempq;
147 
148  Float c[3];
149  Float g[3];
150 
151  const Float one_third = 0.333333333333333333333;
152  const Float one_ninth = 0.111111111111111111111;
153  const Float one_eighteenth = 0.055555555555555555555;
154 
155  qsq = q*q;
156  tempq = qsq*q;
157 
158  c[0] = getTrace(q).x;
159  c[1] = getTrace(qsq).x * 0.5;
160  c[2] = getTrace(tempq).x * one_third;;
161 
162  g[0] = g[1] = g[2] = c[0] * one_third;
163  Float r,s,theta;
164  s = c[1]*one_third - c[0]*c[0]*one_eighteenth;
165 
166  Float cosTheta;
167  if(fabs(s) >= arg.unitarize_eps){ // faster when this conditional is removed?
168  const Float rsqrt_s = rsqrt(s);
169  r = c[2]*0.5 - (c[0]*one_third)*(c[1] - c[0]*c[0]*one_ninth);
170  cosTheta = r*rsqrt_s*rsqrt_s*rsqrt_s;
171 
172  if(fabs(cosTheta) >= 1.0){
173  theta = (r > 0) ? 0.0 : FL_UNITARIZE_PI;
174  }else{
175  theta = acos(cosTheta); // this is the primary performance limiter
176  }
177 
178  const Float sqrt_s = s*rsqrt_s;
179 
180 #if 0 // experimental version
181  Float as, ac;
182  sincos( theta*one_third, &as, &ac );
183  g[0] = c[0]*one_third + 2*sqrt_s*ac;
184  //g[1] = c[0]*one_third + 2*sqrt_s*(ac*cos(1*FL_UNITARIZE_PI23) - as*sin(1*FL_UNITARIZE_PI23));
185  g[1] = c[0]*one_third - 2*sqrt_s*(0.5*ac + as*0.8660254037844386467637);
186  //g[2] = c[0]*one_third + 2*sqrt_s*(ac*cos(2*FL_UNITARIZE_PI23) - as*sin(2*FL_UNITARIZE_PI23));
187  g[2] = c[0]*one_third + 2*sqrt_s*(-0.5*ac + as*0.8660254037844386467637);
188 #else
189  g[0] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third );
190  g[1] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + FL_UNITARIZE_PI23 );
191  g[2] = c[0]*one_third + 2*sqrt_s*cos( theta*one_third + 2*FL_UNITARIZE_PI23 );
192 #endif
193  }
194 
195  // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
196  // return false. Then call SVD instead.
197  Float det = getDeterminant(q).x;
198  if( fabs(det) < arg.svd_abs_error) return false;
199  if( checkRelativeError(g[0]*g[1]*g[2],det,arg.svd_rel_error) == false ) return false;
200 
201 
202  // At this point we have finished with the c's
203  // use these to store sqrt(g)
204  for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
205 
206  // done with the g's, use these to store u, v, w
207  g[0] = c[0]+c[1]+c[2];
208  g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
209  g[2] = c[0]*c[1]*c[2];
210 
211  const Float denominator = 1.0 / ( g[2]*(g[0]*g[1]-g[2]) );
212  c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1])) * denominator;
213  c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1]) * denominator;
214  c[2] = g[0] * denominator;
215 
216  tempq = c[1]*q + c[2]*qsq;
217  // Add a real scalar
218  tempq(0,0).x += c[0];
219  tempq(1,1).x += c[0];
220  tempq(2,2).x += c[0];
221 
222  *res = tempq;
223 
224  return true;
225  }
226 
227 
228 
229 
230  template<class Float, typename Arg>
231  __host__ __device__
232  bool unitarizeLinkMILC(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, Arg &arg)
233  {
235  if( !arg.reunit_svd_only ){
236  if( reciprocalRoot<Float>(conj(in)*in,&u,arg) ){
237  *result = in*u;
238  return true;
239  }
240  }
241 
242  // If we've got this far, then the Caley-Hamilton unitarization
243  // has failed. If SVD is not allowed, the unitarization has failed.
244  if( !arg.reunit_allow_svd ) return false;
245 
247  Float singular_values[3];
248  computeSVD<Float>(in, u, v, singular_values);
249  *result = u*conj(v);
250  return true;
251  } // unitarizeMILC
252 
253 
254  template<class Float>
255  __host__ __device__
256  bool unitarizeLinkSVD(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result,
257  const double max_error)
258  {
259  Matrix<complex<Float>,3> u, v;
260  Float singular_values[3];
261  computeSVD<Float>(in, u, v, singular_values); // should pass pointers to u,v I guess
262 
263  *result = u*conj(v);
264 
265  if (result->isUnitary(max_error)==false)
266  {
267  printf("ERROR: Link unitarity test failed\n");
268  printf("TOLERANCE: %g\n", max_error);
269  return false;
270  }
271  return true;
272  }
273 
274 
275  template<class Float>
276  __host__ __device__
277  bool unitarizeLinkNewton(const Matrix<complex<Float>,3>& in, Matrix<complex<Float>,3>* const result, int max_iter)
278  {
279  Matrix<complex<Float>,3> u, uinv;
280  u = in;
281 
282  for(int i=0; i<max_iter; ++i){
283  uinv = inverse(u);
284  u = 0.5*(u + conj(uinv));
285  }
286 
287  if(isUnitarizedLinkConsistent(in,u,0.0000001)==false)
288  {
289  printf("ERROR: Unitarized link is not consistent with incoming link\n");
290  return false;
291  }
292  *result = u;
293 
294  return true;
295  }
296 
297 #endif // GPU_UNITARIZE
298 
299  void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField& infield)
300  {
301 #ifdef GPU_UNITARIZE
302  if (infield.Precision() != outfield.Precision())
303  errorQuda("Precisions must match (out=%d != in=%d)", outfield.Precision(), infield.Precision());
304 
305  int num_failures = 0;
306  Matrix<complex<double>,3> inlink, outlink;
307 
308  for (int i=0; i<infield.Volume(); ++i){
309  for (int dir=0; dir<4; ++dir){
310  if (infield.Precision() == QUDA_SINGLE_PRECISION){
311  copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
312  if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ) num_failures++;
313  copyLinkToArray(((float*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
314  } else if (infield.Precision() == QUDA_DOUBLE_PRECISION){
315  copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
316  if( unitarizeLinkNewton<double>(inlink, &outlink, max_iter_newton) == false ) num_failures++;
317  copyLinkToArray(((double*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
318  } // precision?
319  } // dir
320  } // loop over volume
321  return;
322 #else
323  errorQuda("Unitarization has not been built");
324 #endif
325  }
326 
327 
328  // CPU function which checks that the gauge field is unitary
329  bool isUnitary(const cpuGaugeField& field, double max_error)
330  {
331 #ifdef GPU_UNITARIZE
332  Matrix<complex<double>,3> link, identity;
333 
334  for(int i=0; i<field.Volume(); ++i){
335  for(int dir=0; dir<4; ++dir){
336  if(field.Precision() == QUDA_SINGLE_PRECISION){
337  copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
338  }else if(field.Precision() == QUDA_DOUBLE_PRECISION){
339  copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
340  }else{
341  errorQuda("Unsupported precision\n");
342  }
343  if (link.isUnitary(max_error) == false) {
344  printf("Unitarity failure\n");
345  printf("site index = %d,\t direction = %d\n", i, dir);
346  printLink(link);
347  identity = conj(link)*link;
348  printLink(identity);
349  return false;
350  }
351  } // dir
352  } // i
353  return true;
354 #else
355  errorQuda("Unitarization has not been built");
356  return false;
357 #endif
358  } // is unitary
359 
360 
361 #ifdef GPU_UNITARIZE
362 
363  template<typename Float, typename Out, typename In>
364  __global__ void DoUnitarizedLink(UnitarizeLinksArg<Out,In> arg){
365  int idx = threadIdx.x + blockIdx.x*blockDim.x;
366  int parity = threadIdx.y + blockIdx.y*blockDim.y;
367  int mu = threadIdx.z + blockIdx.z*blockDim.z;
368  if (idx >= arg.threads) return;
369  if (mu >= 4) return;
370 
371  // result is always in double precision
372  Matrix<complex<double>,3> v, result;
373  Matrix<complex<Float>,3> tmp = arg.input(mu, idx, parity);
374 
375  v = tmp;
376  unitarizeLinkMILC(v, &result, arg);
377  if (arg.check_unitarization) {
378  if (result.isUnitary(arg.max_error) == false) atomicAdd(arg.fails, 1);
379  }
380  tmp = result;
381 
382  arg.output(mu, idx, parity) = tmp;
383  }
384 
385 
386 
387  template<typename Float, typename Out, typename In>
388  class UnitarizeLinks : TunableVectorYZ {
389  UnitarizeLinksArg<Out,In> arg;
390  const GaugeField &meta;
391 
392  unsigned int sharedBytesPerThread() const { return 0; }
393  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
394 
395  // don't tune the grid dimension
396  bool tuneGridDim() const { return false; }
397  unsigned int minThreads() const { return arg.threads; }
398 
399  public:
400  UnitarizeLinks(UnitarizeLinksArg<Out,In> &arg, const GaugeField &meta)
401  : TunableVectorYZ(2,4), arg(arg), meta(meta) { }
402 
403  void apply(const cudaStream_t &stream){
404  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
405  DoUnitarizedLink<Float,Out,In><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
406  }
407  void preTune() { if (arg.input.gauge == arg.output.gauge) arg.output.save(); }
408  void postTune() {
409  if (arg.input.gauge == arg.output.gauge) arg.output.load();
410  cudaMemset(arg.fails, 0, sizeof(int)); // reset fails counter
411  }
412 
413  long long flops() const {
414  // Accounted only the minimum flops for the case reunitarize_svd_only=0
415  return 4ll * 2 * arg.threads * 1147;
416  }
417  long long bytes() const { return 4ll * 2 * arg.threads * (arg.input.Bytes() + arg.output.Bytes()); }
418 
419  TuneKey tuneKey() const {
420  std::stringstream aux;
421  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
422  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
423  }
424  };
425 
426 
427  template<typename Float, typename Out, typename In>
428  void unitarizeLinks(Out output, const In input, const cudaGaugeField& meta, int* fails) {
429  UnitarizeLinksArg<Out,In> arg(output, input, meta, fails, max_iter, unitarize_eps, max_error,
430  reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error);
431  UnitarizeLinks<Float, Out, In> unitlinks(arg, meta);
432  unitlinks.apply(0);
433  qudaDeviceSynchronize(); // need to synchronize to ensure failure write has completed
434  }
435 
436 template<typename Float>
437 void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
438 
439  if( output.isNative() && input.isNative() ) {
440  if(output.Reconstruct() == QUDA_RECONSTRUCT_NO) {
442 
443  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
445  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
446  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
448  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
449  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
451  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
452  } else {
453  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
454  }
455 
456  } else if(output.Reconstruct() == QUDA_RECONSTRUCT_12){
458 
459  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
461  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
462  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
464  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
465  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
467  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
468  } else {
469  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
470  }
471 
472 
473  } else if(output.Reconstruct() == QUDA_RECONSTRUCT_8){
475 
476  if(input.Reconstruct() == QUDA_RECONSTRUCT_NO) {
478  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
479  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_12) {
481  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
482  } else if(input.Reconstruct() == QUDA_RECONSTRUCT_8) {
484  unitarizeLinks<Float>(Out(output), In(input), input, fails) ;
485  } else {
486  errorQuda("Reconstruction type %d of gauge field not supported", input.Reconstruct());
487  }
488 
489 
490  } else {
491  errorQuda("Reconstruction type %d of gauge field not supported", output.Reconstruct());
492  }
493  } else {
494  errorQuda("Invalid Gauge Order (output=%d, input=%d)", output.Order(), input.Order());
495  }
496 }
497 
498 #endif // GPU_UNITARIZE
499 
500  void unitarizeLinks(cudaGaugeField& output, const cudaGaugeField &input, int* fails) {
501 #ifdef GPU_UNITARIZE
502  if (input.Precision() != output.Precision())
503  errorQuda("input (%d) and output (%d) precisions must match", output.Precision(), input.Precision());
504 
505  if (input.Precision() == QUDA_SINGLE_PRECISION) {
506  unitarizeLinks<float>(output, input, fails);
507  } else if(input.Precision() == QUDA_DOUBLE_PRECISION) {
508  unitarizeLinks<double>(output, input, fails);
509  } else {
510  errorQuda("Precision %d not supported", input.Precision());
511  }
512 #else
513  errorQuda("Unitarization has not been built");
514 #endif
515  }
516 
517  void unitarizeLinks(cudaGaugeField &links, int* fails) {
518  unitarizeLinks(links, links, fails);
519  }
520 
521 
522  template <typename Float, typename G>
523  struct ProjectSU3Arg {
524  int threads; // number of active threads required
525  G u;
526  Float tol;
527  int *fails;
528  ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
529  : threads(meta.VolumeCB()), u(u), tol(tol), fails(fails) { }
530  };
531 
532  template<typename Float, typename G>
534  int idx = threadIdx.x + blockIdx.x*blockDim.x;
535  int parity = threadIdx.y + blockIdx.y*blockDim.y;
536  int mu = threadIdx.z + blockIdx.z*blockDim.z;
537  if (idx >= arg.threads) return;
538  if (mu >= 4) return;
539 
540  Matrix<complex<Float>,3> u = arg.u(mu, idx, parity);
541 
542  polarSu3<Float>(u, arg.tol);
543 
544  // count number of failures
545  if (u.isUnitary(arg.tol) == false) {
546  atomicAdd(arg.fails, 1);
547  }
548 
549  arg.u(mu, idx, parity) = u;
550  }
551 
552  template<typename Float, typename G>
555  const GaugeField &meta;
556 
557  unsigned int sharedBytesPerThread() const { return 0; }
558  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
559 
560  // don't tune the grid dimension
561  bool tuneGridDim() const { return false; }
562  unsigned int minThreads() const { return arg.threads; }
563 
564  public:
566  : TunableVectorYZ(2, 4), arg(arg), meta(meta) { }
567 
568  void apply(const cudaStream_t &stream){
569  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
570  ProjectSU3kernel<Float,G><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
571  }
572  void preTune() { arg.u.save(); }
573  void postTune() {
574  arg.u.load();
575  cudaMemset(arg.fails, 0, sizeof(int)); // reset fails counter
576  }
577 
578  long long flops() const { return 0; } // depends on number of iterations
579  long long bytes() const { return 4ll * 2 * arg.threads * 2 * arg.u.Bytes(); }
580 
581  TuneKey tuneKey() const {
582  std::stringstream aux;
583  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
584  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
585  }
586  };
587 
588 
589  template <typename Float>
590  void projectSU3(cudaGaugeField &u, double tol, int *fails) {
591  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
593  ProjectSU3Arg<Float,G> arg(G(u), u, static_cast<Float>(tol), fails);
594  ProjectSU3<Float,G> project(arg, u);
595  project.apply(0);
597  checkCudaError();
598  } else {
599  errorQuda("Reconstruct %d not supported", u.Reconstruct());
600  }
601  }
602 
603  void projectSU3(cudaGaugeField &u, double tol, int *fails) {
604 
605 #ifdef GPU_UNITARIZE
606  // check the the field doesn't have staggered phases applied
607  if (u.StaggeredPhaseApplied())
608  errorQuda("Cannot project gauge field with staggered phases applied");
609 
610  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
611  projectSU3<double>(u, tol, fails);
612  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
613  projectSU3<float>(u, tol, fails);
614  } else {
615  errorQuda("Precision %d not supported", u.Precision());
616  }
617 #else
618  errorQuda("Unitarization has not been built");
619 #endif
620 
621  }
622 
623 } // namespace quda
624 
static bool reunit_allow_svd
unsigned int sharedBytesPerThread() const
__device__ __host__ bool isUnitary(double max_error) const
Definition: quda_matrix.h:209
void apply(const cudaStream_t &stream)
double mu
Definition: test_util.cpp:1648
unsigned int minThreads() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
TuneKey tuneKey() const
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void * links[4]
Definition: covdev_test.cpp:47
double epsilon
Definition: test_util.cpp:1649
cudaStream_t * stream
__global__ void ProjectSU3kernel(ProjectSU3Arg< Float, G > arg)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
const char * VolString() const
const GaugeField & meta
ProjectSU3Arg(G u, const GaugeField &meta, Float tol, int *fails)
ProjectSU3Arg< Float, G > arg
long long bytes() const
int num_failures
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1149
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:1088
bool isUnitary(const cpuGaugeField &field, double max_error)
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
Definition: atomic.cuh:51
static double svd_rel_error
#define qudaDeviceSynchronize()
double tol
Definition: test_util.cpp:1656
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:260
static bool reunit_svd_only
cpuColorSpinorField * in
constexpr int size
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
int Volume() const
static double svd_abs_error
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:1061
long long flops() const
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:415
ProjectSU3(ProjectSU3Arg< Float, G > &arg, const GaugeField &meta)
void unitarizeLinksCPU(cpuGaugeField &outfield, const cpuGaugeField &infield)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611
static double unitarize_eps
__shared__ float s[]
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:251
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:61
#define checkCudaError()
Definition: util_quda.h:161
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
unsigned int sharedBytesPerBlock(const TuneParam &) const