QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unitarize_links_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <iostream>
4 #include <iomanip>
5 #include <cuda.h>
6 #include <gauge_field.h>
7 
8 #include <tune_quda.h>
9 #include <quda_matrix.h>
10 #include <hisq_links_quda.h>
11 
12 #ifdef GPU_UNITARIZE
13 
14 namespace quda{
15 
16 namespace{
17  #include <svd_quda.h>
18 }
19 
20 #ifndef FL_UNITARIZE_PI
21 #define FL_UNITARIZE_PI 3.14159265358979323846
22 #endif
23 #ifndef FL_UNITARIZE_PI23
24 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0
25 #endif
26 
27  __constant__ int INPUT_PADDING=0;
28  __constant__ int OUTPUT_PADDING=0;
29  __constant__ int DEV_MAX_ITER = 20;
30 
31  static int HOST_MAX_ITER = 20;
32 
33  __constant__ double DEV_FL_MAX_ERROR;
34  __constant__ double DEV_FL_UNITARIZE_EPS;
35  __constant__ bool DEV_FL_REUNIT_ALLOW_SVD;
36  __constant__ bool DEV_FL_REUNIT_SVD_ONLY;
37  __constant__ double DEV_FL_REUNIT_SVD_REL_ERROR;
38  __constant__ double DEV_FL_REUNIT_SVD_ABS_ERROR;
39  __constant__ bool DEV_FL_CHECK_UNITARIZATION;
40 
41  static double HOST_FL_MAX_ERROR;
42  static double HOST_FL_UNITARIZE_EPS;
43  static bool HOST_FL_REUNIT_ALLOW_SVD;
44  static bool HOST_FL_REUNIT_SVD_ONLY;
45  static double HOST_FL_REUNIT_SVD_REL_ERROR;
46  static double HOST_FL_REUNIT_SVD_ABS_ERROR;
47  static bool HOST_FL_CHECK_UNITARIZATION;
48 
49  void setUnitarizeLinksPadding(int input_padding_h, int output_padding_h)
50  {
51  cudaMemcpyToSymbol(INPUT_PADDING, &input_padding_h, sizeof(int));
52  cudaMemcpyToSymbol(OUTPUT_PADDING, &output_padding_h, sizeof(int));
53  return;
54  }
55 
56 
57  template<class Cmplx>
58  __device__ __host__
59  bool isUnitary(const Matrix<Cmplx,3>& matrix, double max_error)
60  {
61  const Matrix<Cmplx,3> identity = conj(matrix)*matrix;
62 
63  for(int i=0; i<3; ++i){
64  if( fabs(identity(i,i).x - 1.0) > max_error || fabs(identity(i,i).y) > max_error) return false;
65  for(int j=i+1; j<3; ++j){
66  if( fabs(identity(i,j).x) > max_error || fabs(identity(i,j).y) > max_error
67  || fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){
68  return false;
69  }
70  }
71  }
72  return true;
73  }
74 
75 
76 
77  template<class Cmplx>
78  __device__ __host__
79  bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
80  const Matrix<Cmplx,3>& unitary_matrix,
81  double max_error)
82  {
83  Matrix<Cmplx,3> temporary;
84  temporary = conj(initial_matrix)*unitary_matrix;
85  temporary = temporary*temporary - conj(initial_matrix)*initial_matrix;
86 
87  for(int i=0; i<3; ++i){
88  for(int j=0; j<3; ++j){
89  if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
90  return false;
91  }
92  }
93  }
94  return true;
95  }
96 
97 
98 
99  void setUnitarizeLinksConstants(double unitarize_eps_h, double max_error_h,
100  bool allow_svd_h, bool svd_only_h,
101  double svd_rel_error_h, double svd_abs_error_h,
102  bool check_unitarization_h)
103  {
104 
105  // not_set is only initialised once
106  static bool not_set=true;
107 
108  if(not_set){
109  cudaMemcpyToSymbol(DEV_FL_UNITARIZE_EPS, &unitarize_eps_h, sizeof(double));
110  cudaMemcpyToSymbol(DEV_FL_REUNIT_ALLOW_SVD, &allow_svd_h, sizeof(bool));
111  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ONLY, &svd_only_h, sizeof(bool));
112  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_REL_ERROR, &svd_rel_error_h, sizeof(double));
113  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ABS_ERROR, &svd_abs_error_h, sizeof(double));
114  cudaMemcpyToSymbol(DEV_FL_MAX_ERROR, &max_error_h, sizeof(double));
115  cudaMemcpyToSymbol(DEV_FL_CHECK_UNITARIZATION, &check_unitarization_h, sizeof(bool));
116 
117 
118  HOST_FL_UNITARIZE_EPS = unitarize_eps_h;
119  HOST_FL_REUNIT_ALLOW_SVD = allow_svd_h;
120  HOST_FL_REUNIT_SVD_ONLY = svd_only_h;
121  HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error_h;
122  HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error_h;
123  HOST_FL_MAX_ERROR = max_error_h;
124  HOST_FL_CHECK_UNITARIZATION = check_unitarization_h;
125 
126  not_set = false;
127  }
128  checkCudaError();
129  return;
130  }
131 
132 
133  template<class T>
134  __device__ __host__
135  T getAbsMin(const T* const array, int size){
136  T min = fabs(array[0]);
137  for(int i=1; i<size; ++i){
138  T abs_val = fabs(array[i]);
139  if((abs_val) < min){ min = abs_val; }
140  }
141  return min;
142  }
143 
144 
145  template<class Real>
146  __device__ __host__
147  inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
148  {
149  if( fabs(a-b) < epsilon) return true;
150  return false;
151  }
152 
153 
154  template<class Real>
155  __device__ __host__
156  inline bool checkRelativeError(Real a, Real b, Real epsilon)
157  {
158  if( fabs((a-b)/b) < epsilon ) return true;
159  return false;
160  }
161 
162 
163 
164 
165  // Compute the reciprocal square root of the matrix q
166  // Also modify q if the eigenvalues are dangerously small.
167  template<class Cmplx>
168  __device__ __host__
169  bool reciprocalRoot(const Matrix<Cmplx,3>& q, Matrix<Cmplx,3>* res){
170 
171  Matrix<Cmplx,3> qsq, tempq;
172 
173 
174  typename RealTypeId<Cmplx>::Type c[3];
175  typename RealTypeId<Cmplx>::Type g[3];
176 
177  qsq = q*q;
178  tempq = qsq*q;
179 
180  c[0] = getTrace(q).x;
181  c[1] = getTrace(qsq).x/2.0;
182  c[2] = getTrace(tempq).x/3.0;
183 
184  g[0] = g[1] = g[2] = c[0]/3.;
185  typename RealTypeId<Cmplx>::Type r,s,theta;
186  s = c[1]/3. - c[0]*c[0]/18;
187 
188 #ifdef __CUDA_ARCH__
189 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS
190 #else
191 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS
192 #endif
193 
194 
195 #ifdef __CUDA_ARCH__
196 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR
197 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR
198 #else // cpu
199 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR
200 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR
201 #endif
202 
203 
204  typename RealTypeId<Cmplx>::Type cosTheta;
205  if(fabs(s) >= FL_UNITARIZE_EPS){
206  const typename RealTypeId<Cmplx>::Type sqrt_s = sqrt(s);
207  r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
208  cosTheta = r/(sqrt_s*sqrt_s*sqrt_s);
209  if(fabs(cosTheta) >= 1.0){
210  if( r > 0 ){
211  theta = 0.0;
212  }else{
213  theta = FL_UNITARIZE_PI;
214  }
215  }else{
216  theta = acos(cosTheta);
217  }
218  g[0] = c[0]/3 + 2*sqrt_s*cos( theta/3 );
219  g[1] = c[0]/3 + 2*sqrt_s*cos( theta/3 + FL_UNITARIZE_PI23 );
220  g[2] = c[0]/3 + 2*sqrt_s*cos( theta/3 + 2*FL_UNITARIZE_PI23 );
221  }
222 
223  // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
224  // return false. Then call SVD instead.
225  typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x;
226  if( fabs(det) < FL_REUNIT_SVD_ABS_ERROR ){
227  return false;
228  }
229  if( checkRelativeError(g[0]*g[1]*g[2],det,FL_REUNIT_SVD_REL_ERROR) == false ) return false;
230 
231 
232  // At this point we have finished with the c's
233  // use these to store sqrt(g)
234  for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
235 
236  // done with the g's, use these to store u, v, w
237  g[0] = c[0]+c[1]+c[2];
238  g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
239  g[2] = c[0]*c[1]*c[2];
240 
241  const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]);
242  c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
243  c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
244  c[2] = g[0]/denominator;
245 
246  tempq = c[1]*q + c[2]*qsq;
247  // Add a real scalar
248  tempq(0,0).x += c[0];
249  tempq(1,1).x += c[0];
250  tempq(2,2).x += c[0];
251 
252  *res = tempq;
253 
254  return true;
255  }
256 
257 
258 
259 
260  template<class Cmplx>
261  __host__ __device__
262  bool unitarizeLinkMILC(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
263  {
264  Matrix<Cmplx,3> u;
265 #ifdef __CUDA_ARCH__
266 #define FL_REUNIT_SVD_ONLY DEV_FL_REUNIT_SVD_ONLY
267 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD
268 #else
269 #define FL_REUNIT_SVD_ONLY HOST_FL_REUNIT_SVD_ONLY
270 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD
271 #endif
272  if( !FL_REUNIT_SVD_ONLY ){
273  if( reciprocalRoot<Cmplx>(conj(in)*in,&u) ){
274  *result = in*u;
275  return true;
276  }
277  }
278 
279  // If we've got this far, then the Caley-Hamilton unitarization
280  // has failed. If SVD is not allowed, the unitarization has failed.
281  if( !FL_REUNIT_ALLOW_SVD ) return false;
282 
283  Matrix<Cmplx,3> v;
284  typename RealTypeId<Cmplx>::Type singular_values[3];
285  computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u, v I guess
286  *result = u*conj(v);
287  return true;
288  } // unitarizeMILC
289 
290 
291  template<class Cmplx>
292  __host__ __device__
293  bool unitarizeLinkSVD(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
294  {
295  Matrix<Cmplx,3> u, v;
296  typename RealTypeId<Cmplx>::Type singular_values[3];
297  computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u,v I guess
298 
299  *result = u*conj(v);
300 
301 #ifdef __CUDA_ARCH__
302 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
303 #else
304 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
305 #endif
306  if(isUnitary(*result,FL_MAX_ERROR)==false)
307  {
308 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
309  printf("ERROR: Link unitarity test failed\n");
310  printf("TOLERANCE: %g\n", FL_MAX_ERROR);
311 #endif
312  return false;
313  }
314  return true;
315  }
316 #undef FL_MAX_ERROR
317 
318 
319  template<class Cmplx>
320  __host__ __device__
321  bool unitarizeLinkNewton(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
322  {
323  Matrix<Cmplx,3> u, uinv;
324  u = in;
325 
326 #ifdef __CUDA_ARCH__
327 #define MAX_ITER DEV_MAX_ITER
328 #else
329 #define MAX_ITER HOST_MAX_ITER
330 #endif
331  for(int i=0; i<MAX_ITER; ++i){
332  computeMatrixInverse(u, &uinv);
333  u = 0.5*(u + conj(uinv));
334  }
335 
336 #undef MAX_ITER
337  if(isUnitarizedLinkConsistent(in,u,0.0000001)==false)
338  {
339 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
340  printf("ERROR: Unitarized link is not consistent with incoming link\n");
341 #endif
342  return false;
343  }
344  *result = u;
345 
346  return true;
347  }
348 
349 
350 
351 
352 
353 
354 
355  template<class Cmplx>
356  __global__ void getUnitarizedField(const Cmplx* inlink_even, const Cmplx* inlink_odd,
357  Cmplx* outlink_even, Cmplx* outlink_odd,
358  int* num_failures, const int threads)
359  {
360  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
361  if (mem_idx >= threads) return;
362 
363  const Cmplx* inlink;
364  Cmplx* outlink;
365 
366  inlink = inlink_even;
367  outlink = outlink_even;
368 
369  if(mem_idx >= threads/2){
370  mem_idx = mem_idx - (threads/2);
371  inlink = inlink_odd;
372  outlink = outlink_odd;
373  }
374 
375  // Unitarization is always done in double precision
376  Matrix<double2,3> v, result;
377  for(int dir=0; dir<4; ++dir){
378  loadLinkVariableFromArray(inlink, dir, mem_idx, (threads/2)+INPUT_PADDING, &v);
379  unitarizeLinkMILC(v, &result);
380 #ifdef __CUDA_ARCH__
381 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
382 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION
383 #else
384 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
385 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION
386 #endif
387  if(FL_CHECK_UNITARIZATION){
388  if(isUnitary(result,FL_MAX_ERROR) == false)
389  {
390 #ifdef __CUDA_ARCH__
391  atomicAdd(num_failures, 1);
392 #else
393  (*num_failures)++;
394 #endif
395  }
396  }
397  writeLinkVariableToArray(result, dir, mem_idx, (threads/2)+OUTPUT_PADDING, outlink);
398  }
399  return;
400  }
401 
402  class UnitarizeLinksCuda : public Tunable {
403  private:
404  const cudaGaugeField &inField;
405  cudaGaugeField &outField;
406  int *fails;
407 
408  unsigned int sharedBytesPerThread() const { return 0; }
409  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
410 
411  // don't tune the grid dimension
412  bool tuneGridDim() const { return false; }
413  unsigned int minThreads() const { return inField.Volume(); }
414 
415  public:
416  UnitarizeLinksCuda(const cudaGaugeField& inField, cudaGaugeField& outField, int* fails) :
417  inField(inField), outField(outField), fails(fails) { ; }
418  virtual ~UnitarizeLinksCuda() { ; }
419 
420  void apply(const cudaStream_t &stream) {
421  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
422 
423  if(inField.Precision() == QUDA_SINGLE_PRECISION){
424  getUnitarizedField<<<tp.grid,tp.block>>>((float2*)inField.Even_p(), (float2*)inField.Odd_p(),
425  (float2*)outField.Even_p(), (float2*)outField.Odd_p(),
426  fails, inField.Volume());
427  }else if(inField.Precision() == QUDA_DOUBLE_PRECISION){
428  getUnitarizedField<<<tp.grid,tp.block>>>((double2*)inField.Even_p(), (double2*)inField.Odd_p(),
429  (double2*)outField.Even_p(), (double2*)outField.Odd_p(),
430  fails, inField.Volume());
431  } else {
432  errorQuda("UnitarizeLinks not implemented for precision %d", inField.Precision());
433  }
434 
435  }
436  void preTune() { ; }
437  void postTune() { cudaMemset(fails, 0, sizeof(int)); } // reset fails counter
438 
439  long long flops() const { return 0; } // FIXME: add flops counter
440 
441  TuneKey tuneKey() const {
442  std::stringstream vol, aux;
443  vol << inField.X()[0] << "x";
444  vol << inField.X()[1] << "x";
445  vol << inField.X()[2] << "x";
446  vol << inField.X()[3] << "x";
447  aux << "threads=" << inField.Volume() << ",prec=" << inField.Precision();
448  aux << "stride=" << inField.Stride();
449  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
450  }
451  }; // UnitarizeLinksCuda
452 
454  cudaGaugeField& inField,
455  cudaGaugeField* outField,
456  int* fails) {
457  UnitarizeLinksCuda unitarizeLinks(inField, *outField, fails);
458  unitarizeLinks.apply(0);
459  }
460 
461  void unitarizeLinksCPU(const QudaGaugeParam& param, cpuGaugeField& infield, cpuGaugeField* outfield)
462  {
463  int num_failures = 0;
464  Matrix<double2,3> inlink, outlink;
465 
466  for(int i=0; i<infield.Volume(); ++i){
467  for(int dir=0; dir<4; ++dir){
468  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
469  copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
470  if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++;
471  copyLinkToArray(((float*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
472  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
473  copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
474  if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++;
475  copyLinkToArray(((double*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
476  } // precision?
477  } // dir
478  } // loop over volume
479  return;
480  }
481 
482  // CPU function which checks that the gauge field is unitary
483  bool isUnitary(const QudaGaugeParam& param, cpuGaugeField& field, double max_error)
484  {
485  Matrix<double2,3> link, identity;
486 
487  for(int i=0; i<field.Volume(); ++i){
488  for(int dir=0; dir<4; ++dir){
489  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
490  copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
491  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
492  copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
493  }else{
494  errorQuda("Unsupported precision\n");
495  }
496  if(isUnitary(link,max_error) == false){
497  printf("Unitarity failure\n");
498  printf("site index = %d,\t direction = %d\n", i, dir);
499  printLink(link);
500  identity = conj(link)*link;
501  printLink(identity);
502  return false;
503  }
504  } // dir
505  } // i
506  return true;
507  } // is unitary
508 
509 } // namespace quda
510 
511 #endif
int y[4]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
void unitarizeLinksCPU(const QudaGaugeParam &param, cpuGaugeField &infield, cpuGaugeField *outfield)
#define errorQuda(...)
Definition: util_quda.h:73
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error, bool check_unitarization=true)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaStream_t * stream
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:385
void setUnitarizeLinksPadding(int input_padding, int output_padding)
__host__ __device__ void printLink(const Matrix< Cmplx, 3 > &link)
Definition: quda_matrix.h:1012
void copyLinkToArray(float *array, const Matrix< float2, 3 > &link)
Definition: quda_matrix.h:985
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
bool isUnitary(const QudaGaugeParam &param, cpuGaugeField &field, double max_error)
void unitarizeLinksCuda(const QudaGaugeParam &param, cudaGaugeField &infield, cudaGaugeField *outfield, int *num_failures)
int mem_idx
Definition: llfat_core.h:812
int x[4]
void copyArrayToLink(Matrix< float2, 3 > *link, float *array)
Definition: quda_matrix.h:962
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:378
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
Definition: quda_matrix.h:767
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:555
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
__host__ __device__ ValueType acos(ValueType x)
Definition: complex_quda.h:50
#define checkCudaError()
Definition: util_quda.h:110
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
Definition: quda_matrix.h:830
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Definition: util_quda.cpp:32
VOLATILE spinorFloat * s
QudaPrecision cpu_prec
Definition: quda.h:40