QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unitarize_links_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <iostream>
4 #include <iomanip>
5 #include <cuda.h>
6 #include <gauge_field.h>
7 
8 #include <quda_matrix.h>
9 #include <svd_quda.h>
10 #include <hisq_links_quda.h>
11 
12 namespace quda{
13 
14 #ifndef FL_UNITARIZE_PI
15 #define FL_UNITARIZE_PI 3.14159265358979323846
16 #endif
17 #ifndef FL_UNITARIZE_PI23
18 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0
19 #endif
20 
21  __constant__ int INPUT_PADDING=0;
22  __constant__ int OUTPUT_PADDING=0;
23  __constant__ int DEV_MAX_ITER = 20;
24 
25  static int HOST_MAX_ITER = 20;
26 
27  __constant__ double DEV_FL_MAX_ERROR;
28  __constant__ double DEV_FL_UNITARIZE_EPS;
29  __constant__ bool DEV_FL_REUNIT_ALLOW_SVD;
30  __constant__ bool DEV_FL_REUNIT_SVD_ONLY;
31  __constant__ double DEV_FL_REUNIT_SVD_REL_ERROR;
32  __constant__ double DEV_FL_REUNIT_SVD_ABS_ERROR;
33  __constant__ bool DEV_FL_CHECK_UNITARIZATION;
34 
35  static double HOST_FL_MAX_ERROR;
36  static double HOST_FL_UNITARIZE_EPS;
37  static bool HOST_FL_REUNIT_ALLOW_SVD;
38  static bool HOST_FL_REUNIT_SVD_ONLY;
39  static double HOST_FL_REUNIT_SVD_REL_ERROR;
40  static double HOST_FL_REUNIT_SVD_ABS_ERROR;
41  static bool HOST_FL_CHECK_UNITARIZATION;
42 
43  void setUnitarizeLinksPadding(int input_padding_h, int output_padding_h)
44  {
45  cudaMemcpyToSymbol(INPUT_PADDING, &input_padding_h, sizeof(int));
46  cudaMemcpyToSymbol(OUTPUT_PADDING, &output_padding_h, sizeof(int));
47  return;
48  }
49 
50 
51  template<class Cmplx>
52  __device__ __host__
53  bool isUnitary(const Matrix<Cmplx,3>& matrix, double max_error)
54  {
55  const Matrix<Cmplx,3> identity = conj(matrix)*matrix;
56 
57  for(int i=0; i<3; ++i){
58  if( fabs(identity(i,i).x - 1.0) > max_error || fabs(identity(i,i).y) > max_error) return false;
59  for(int j=i+1; j<3; ++j){
60  if( fabs(identity(i,j).x) > max_error || fabs(identity(i,j).y) > max_error
61  || fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){
62  return false;
63  }
64  }
65  }
66  return true;
67  }
68 
69 
70 
71  template<class Cmplx>
72  __device__ __host__
73  bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
74  const Matrix<Cmplx,3>& unitary_matrix,
75  double max_error)
76  {
77  Matrix<Cmplx,3> temporary;
78  temporary = conj(initial_matrix)*unitary_matrix;
79  temporary = temporary*temporary - conj(initial_matrix)*initial_matrix;
80 
81  for(int i=0; i<3; ++i){
82  for(int j=0; j<3; ++j){
83  if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
84  return false;
85  }
86  }
87  }
88  return true;
89  }
90 
91 
92 
93  void setUnitarizeLinksConstants(double unitarize_eps_h, double max_error_h,
94  bool allow_svd_h, bool svd_only_h,
95  double svd_rel_error_h, double svd_abs_error_h,
96  bool check_unitarization_h)
97  {
98 
99  // not_set is only initialised once
100  static bool not_set=true;
101 
102  if(not_set){
103  cudaMemcpyToSymbol(DEV_FL_UNITARIZE_EPS, &unitarize_eps_h, sizeof(double));
104  cudaMemcpyToSymbol(DEV_FL_REUNIT_ALLOW_SVD, &allow_svd_h, sizeof(bool));
105  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ONLY, &svd_only_h, sizeof(bool));
106  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_REL_ERROR, &svd_rel_error_h, sizeof(double));
107  cudaMemcpyToSymbol(DEV_FL_REUNIT_SVD_ABS_ERROR, &svd_abs_error_h, sizeof(double));
108  cudaMemcpyToSymbol(DEV_FL_MAX_ERROR, &max_error_h, sizeof(double));
109  cudaMemcpyToSymbol(DEV_FL_CHECK_UNITARIZATION, &check_unitarization_h, sizeof(bool));
110 
111 
112  HOST_FL_UNITARIZE_EPS = unitarize_eps_h;
113  HOST_FL_REUNIT_ALLOW_SVD = allow_svd_h;
114  HOST_FL_REUNIT_SVD_ONLY = svd_only_h;
115  HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error_h;
116  HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error_h;
117  HOST_FL_MAX_ERROR = max_error_h;
118  HOST_FL_CHECK_UNITARIZATION = check_unitarization_h;
119 
120  not_set = false;
121  }
122  checkCudaError();
123  return;
124  }
125 
126 
127  template<class T>
128  __device__ __host__
129  T getAbsMin(const T* const array, int size){
130  T min = fabs(array[0]);
131  for(int i=1; i<size; ++i){
132  T abs_val = fabs(array[i]);
133  if((abs_val) < min){ min = abs_val; }
134  }
135  return min;
136  }
137 
138 
139  template<class Real>
140  __device__ __host__
141  inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
142  {
143  if( fabs(a-b) < epsilon) return true;
144  return false;
145  }
146 
147 
148  template<class Real>
149  __device__ __host__
150  inline bool checkRelativeError(Real a, Real b, Real epsilon)
151  {
152  if( fabs((a-b)/b) < epsilon ) return true;
153  return false;
154  }
155 
156 
157 
158 
159  // Compute the reciprocal square root of the matrix q
160  // Also modify q if the eigenvalues are dangerously small.
161  template<class Cmplx>
162  __device__ __host__
164 
165  Matrix<Cmplx,3> qsq, tempq;
166 
167 
168  typename RealTypeId<Cmplx>::Type c[3];
169  typename RealTypeId<Cmplx>::Type g[3];
170 
171  qsq = q*q;
172  tempq = qsq*q;
173 
174  c[0] = getTrace(q).x;
175  c[1] = getTrace(qsq).x/2.0;
176  c[2] = getTrace(tempq).x/3.0;
177 
178  g[0] = g[1] = g[2] = c[0]/3.;
179  typename RealTypeId<Cmplx>::Type r,s,theta;
180  s = c[1]/3. - c[0]*c[0]/18;
181 
182 #ifdef __CUDA_ARCH__
183 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS
184 #else
185 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS
186 #endif
187 
188 
189 #ifdef __CUDA_ARCH__
190 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR
191 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR
192 #else // cpu
193 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR
194 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR
195 #endif
196 
197 
198  typename RealTypeId<Cmplx>::Type cosTheta;
199  if(fabs(s) >= FL_UNITARIZE_EPS){
200  const typename RealTypeId<Cmplx>::Type sqrt_s = sqrt(s);
201  r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
202  cosTheta = r/(sqrt_s*sqrt_s*sqrt_s);
203  if(fabs(cosTheta) >= 1.0){
204  if( r > 0 ){
205  theta = 0.0;
206  }else{
207  theta = FL_UNITARIZE_PI;
208  }
209  }else{
210  theta = acos(cosTheta);
211  }
212  g[0] = c[0]/3 + 2*sqrt_s*cos( theta/3 );
213  g[1] = c[0]/3 + 2*sqrt_s*cos( theta/3 + FL_UNITARIZE_PI23 );
214  g[2] = c[0]/3 + 2*sqrt_s*cos( theta/3 + 2*FL_UNITARIZE_PI23 );
215  }
216 
217  // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
218  // return false. Then call SVD instead.
219  typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x;
220  if( fabs(det) < FL_REUNIT_SVD_ABS_ERROR ){
221  return false;
222  }
223  if( checkRelativeError(g[0]*g[1]*g[2],det,FL_REUNIT_SVD_REL_ERROR) == false ) return false;
224 
225 
226  // At this point we have finished with the c's
227  // use these to store sqrt(g)
228  for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
229 
230  // done with the g's, use these to store u, v, w
231  g[0] = c[0]+c[1]+c[2];
232  g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
233  g[2] = c[0]*c[1]*c[2];
234 
235  const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]);
236  c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
237  c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
238  c[2] = g[0]/denominator;
239 
240  tempq = c[1]*q + c[2]*qsq;
241  // Add a real scalar
242  tempq(0,0).x += c[0];
243  tempq(1,1).x += c[0];
244  tempq(2,2).x += c[0];
245 
246  *res = tempq;
247 
248  return true;
249  }
250 
251 
252 
253 
254  template<class Cmplx>
255  __host__ __device__
257  {
258  Matrix<Cmplx,3> u;
259 #ifdef __CUDA_ARCH__
260 #define FL_REUNIT_SVD_ONLY DEV_FL_REUNIT_SVD_ONLY
261 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD
262 #else
263 #define FL_REUNIT_SVD_ONLY HOST_FL_REUNIT_SVD_ONLY
264 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD
265 #endif
266  if( !FL_REUNIT_SVD_ONLY ){
267  if( reciprocalRoot<Cmplx>(conj(in)*in,&u) ){
268  *result = in*u;
269  return true;
270  }
271  }
272 
273  // If we've got this far, then the Caley-Hamilton unitarization
274  // has failed. If SVD is not allowed, the unitarization has failed.
275  if( !FL_REUNIT_ALLOW_SVD ) return false;
276 
277  Matrix<Cmplx,3> v;
278  typename RealTypeId<Cmplx>::Type singular_values[3];
279  computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u, v I guess
280  *result = u*conj(v);
281  return true;
282  } // unitarizeMILC
283 
284 
285  template<class Cmplx>
286  __host__ __device__
288  {
289  Matrix<Cmplx,3> u, v;
290  typename RealTypeId<Cmplx>::Type singular_values[3];
291  computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u,v I guess
292 
293  *result = u*conj(v);
294 
295 #ifdef __CUDA_ARCH__
296 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
297 #else
298 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
299 #endif
300  if(isUnitary(*result,FL_MAX_ERROR)==false)
301  {
302 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
303  printf("ERROR: Link unitarity test failed\n");
304  printf("TOLERANCE: %g\n", FL_MAX_ERROR);
305 #endif
306  return false;
307  }
308  return true;
309  }
310 #undef FL_MAX_ERROR
311 
312 
313  template<class Cmplx>
314  __host__ __device__
316  {
317  Matrix<Cmplx,3> u, uinv;
318  u = in;
319 
320 #ifdef __CUDA_ARCH__
321 #define MAX_ITER DEV_MAX_ITER
322 #else
323 #define MAX_ITER HOST_MAX_ITER
324 #endif
325  for(int i=0; i<MAX_ITER; ++i){
326  computeMatrixInverse(u, &uinv);
327  u = 0.5*(u + conj(uinv));
328  }
329 
330 #undef MAX_ITER
331  if(isUnitarizedLinkConsistent(in,u,0.0000001)==false)
332  {
333 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
334  printf("ERROR: Unitarized link is not consistent with incoming link\n");
335 #endif
336  return false;
337  }
338  *result = u;
339 
340  return true;
341  }
342 
343 
344 
345 
346 
347 
348 
349  template<class Cmplx>
350  __global__ void getUnitarizedField(const Cmplx* inlink_even, const Cmplx* inlink_odd,
351  Cmplx* outlink_even, Cmplx* outlink_odd,
352  int* num_failures, const int threads)
353  {
354  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
355  if (mem_idx >= threads) return;
356 
357  const Cmplx* inlink;
358  Cmplx* outlink;
359 
360  inlink = inlink_even;
361  outlink = outlink_even;
362 
363  if(mem_idx >= Vh){
364  mem_idx = mem_idx - Vh;
365  inlink = inlink_odd;
366  outlink = outlink_odd;
367  }
368 
369  // Unitarization is always done in double precision
370  Matrix<double2,3> v, result;
371  for(int dir=0; dir<4; ++dir){
372  loadLinkVariableFromArray(inlink, dir, mem_idx, Vh+INPUT_PADDING, &v);
373  unitarizeLinkMILC(v, &result);
374 #ifdef __CUDA_ARCH__
375 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
376 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION
377 #else
378 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
379 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION
380 #endif
382  if(isUnitary(result,FL_MAX_ERROR) == false)
383  {
384 #ifdef __CUDA_ARCH__
385  atomicAdd(num_failures, 1);
386 #else
387  (*num_failures)++;
388 #endif
389  }
390  }
391  writeLinkVariableToArray(result, dir, mem_idx, Vh+OUTPUT_PADDING, outlink);
392  }
393  return;
394  }
395 
396  class UnitarizeLinksCuda : public Tunable {
397  private:
398  const cudaGaugeField &inField;
399  cudaGaugeField &outField;
400  int *fails;
401 
402  int sharedBytesPerThread() const { return 0; }
403  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
404 
405  // don't tune the grid dimension
406  bool advanceGridDim(TuneParam &param) const { return false; }
407 
408  // generalize Tunable::advanceBlockDim() to also set gridDim, with extra checking to ensure that gridDim isn't too large for the device
409  bool advanceBlockDim(TuneParam &param) const
410  {
411  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
412  const unsigned int max_blocks = deviceProp.maxGridSize[0];
413  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
414  const int step = deviceProp.warpSize;
415  const int threads = inField.Volume();
416  bool ret;
417  param.block.x += step;
418  if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
419  param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
420  param.block.x = ((param.block.x+step-1) / step) * step; // round up to the nearest "step"
421  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
422  ret = false;
423  } else {
424  ret = true;
425  }
426  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
427  return ret;
428  }
429 
430  public:
431  UnitarizeLinksCuda(const cudaGaugeField& inField, cudaGaugeField& outField, int* fails) :
432  inField(inField), outField(outField), fails(fails) { ; }
433  virtual ~UnitarizeLinksCuda() { ; }
434 
435  void apply(const cudaStream_t &stream) {
436  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
437 
438  if(inField.Precision() == QUDA_SINGLE_PRECISION){
439  getUnitarizedField<<<tp.grid,tp.block>>>((float2*)inField.Even_p(), (float2*)inField.Odd_p(),
440  (float2*)outField.Even_p(), (float2*)outField.Odd_p(),
441  fails, inField.Volume());
442  }else if(inField.Precision() == QUDA_DOUBLE_PRECISION){
443  getUnitarizedField<<<tp.grid,tp.block>>>((double2*)inField.Even_p(), (double2*)inField.Odd_p(),
444  (double2*)outField.Even_p(), (double2*)outField.Odd_p(),
445  fails, inField.Volume());
446  } else {
447  errorQuda("UnitarizeLinks not implemented for precision %d", inField.Precision());
448  }
449 
450  }
451  void preTune() { ; }
452  void postTune() { cudaMemset(fails, 0, sizeof(int)); } // reset fails counter
453 
454  virtual void initTuneParam(TuneParam &param) const
455  {
456  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
457  const unsigned int max_blocks = deviceProp.maxGridSize[0];
458  const int threads = inField.Volume();
459  const int step = deviceProp.warpSize;
460  param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
461  param.block.x = ((param.block.x+step-1) / step) * step; // round up to the nearest "step"
462  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
463  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
464  param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
465  sharedBytesPerThread()*param.block.x : sharedBytesPerBlock(param);
466  }
467 
469  void defaultTuneParam(TuneParam &param) const {
470  initTuneParam(param);
471  }
472 
473  long long flops() const { return 0; } // FIXME: add flops counter
474 
475  TuneKey tuneKey() const {
476  std::stringstream vol, aux;
477  vol << inField.X()[0] << "x";
478  vol << inField.X()[1] << "x";
479  vol << inField.X()[2] << "x";
480  vol << inField.X()[3] << "x";
481  aux << "threads=" << inField.Volume() << ",prec=" << inField.Precision();
482  aux << "stride=" << inField.Stride();
483  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
484  }
485  }; // UnitarizeLinksCuda
486 
488  cudaGaugeField& inField,
489  cudaGaugeField* outField,
490  int* fails) {
491  UnitarizeLinksCuda unitarizeLinks(inField, *outField, fails);
492  unitarizeLinks.apply(0);
493  }
494 
496  {
497  int num_failures = 0;
498  Matrix<double2,3> inlink, outlink;
499 
500  for(int i=0; i<infield.Volume(); ++i){
501  for(int dir=0; dir<4; ++dir){
502  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
503  copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
504  if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++;
505  copyLinkToArray(((float*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
506  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
507  copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
508  if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++;
509  copyLinkToArray(((double*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink);
510  } // precision?
511  } // dir
512  } // loop over volume
513  return;
514  }
515 
516  // CPU function which checks that the gauge field is unitary
517  bool isUnitary(const QudaGaugeParam& param, cpuGaugeField& field, double max_error)
518  {
519  Matrix<double2,3> link, identity;
520 
521  for(int i=0; i<field.Volume(); ++i){
522  for(int dir=0; dir<4; ++dir){
523  if(param.cpu_prec == QUDA_SINGLE_PRECISION){
524  copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
525  }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
526  copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
527  }else{
528  errorQuda("Unsupported precision\n");
529  }
530  if(isUnitary(link,max_error) == false){
531  printf("Unitarity failure\n");
532  printf("site index = %d,\t direction = %d\n", i, dir);
533  printLink(link);
534  identity = conj(link)*link;
535  printLink(identity);
536  return false;
537  }
538  } // dir
539  } // i
540  return true;
541  } // is unitary
542 
543 } // namespace quda