QUDA: quda/lib/unitarize_links

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <cstdlib>
00002 #include <cstdio>
00003 #include <iostream>
00004 #include <iomanip>
00005 #include <cuda.h>
00006 #include <gauge_field.h>
00007 
00008 #include "quda_matrix.h"
00009 #include "svd_quda.h"
00010 #include <hisq_links_quda.h>
00011 
00012 #ifndef FL_UNITARIZE_PI
00013 #define FL_UNITARIZE_PI 3.14159265358979323846
00014 #endif
00015 #ifndef FL_UNITARIZE_PI23
00016 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0
00017 #endif 
00018  
00019 __constant__ int INPUT_PADDING=0;
00020 __constant__ int OUTPUT_PADDING=0;
00021 __constant__ int DEV_MAX_ITER = 20;
00022 
00023 static int HOST_MAX_ITER = 20;
00024 
00025 __constant__ double DEV_FL_MAX_ERROR;
00026 __constant__ double DEV_FL_UNITARIZE_EPS;
00027 __constant__ bool   DEV_FL_REUNIT_ALLOW_SVD;
00028 __constant__ bool   DEV_FL_REUNIT_SVD_ONLY;
00029 __constant__ double DEV_FL_REUNIT_SVD_REL_ERROR;
00030 __constant__ double DEV_FL_REUNIT_SVD_ABS_ERROR;
00031 __constant__ bool   DEV_FL_CHECK_UNITARIZATION;
00032 
00033 static double HOST_FL_MAX_ERROR;
00034 static double HOST_FL_UNITARIZE_EPS;
00035 static bool   HOST_FL_REUNIT_ALLOW_SVD;
00036 static bool   HOST_FL_REUNIT_SVD_ONLY;
00037 static double HOST_FL_REUNIT_SVD_REL_ERROR;
00038 static double HOST_FL_REUNIT_SVD_ABS_ERROR;
00039 static bool   HOST_FL_CHECK_UNITARIZATION;
00040 namespace hisq{
00041 
00042   void setUnitarizeLinksPadding(int input_padding, int output_padding)
00043   {
00044     cudaMemcpyToSymbol("INPUT_PADDING", &input_padding, sizeof(int));
00045     cudaMemcpyToSymbol("OUTPUT_PADDING", &output_padding, sizeof(int));
00046     return;
00047   }
00048 
00049 
00050   template<class Cmplx>
00051     __device__ __host__
00052     bool isUnitary(const Matrix<Cmplx,3>& matrix, double max_error)
00053     {
00054       const Matrix<Cmplx,3> identity = conj(matrix)*matrix;
00055 
00056       for(int i=0; i<3; ++i){
00057         if( fabs(identity(i,i).x - 1.0) > max_error || fabs(identity(i,i).y) > max_error) return false;
00058           for(int j=i+1; j<3; ++j){
00059             if( fabs(identity(i,j).x) > max_error || fabs(identity(i,j).y) > max_error
00060             ||  fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){
00061               return false;
00062             }
00063           }
00064         }
00065         return true;
00066     }
00067 
00068 
00069 
00070 template<class Cmplx>
00071 __device__ __host__
00072 bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix,
00073                                 const Matrix<Cmplx,3>& unitary_matrix,
00074                                 double max_error)       
00075 {
00076   Matrix<Cmplx,3> temporary; 
00077   temporary = conj(initial_matrix)*unitary_matrix;
00078   temporary = temporary*temporary - conj(initial_matrix)*initial_matrix;
00079    
00080   for(int i=0; i<3; ++i){
00081     for(int j=0; j<3; ++j){
00082         if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){
00083          return false;
00084         }
00085     }
00086   }
00087   return true;
00088 }
00089 
00090 
00091 
00092  void setUnitarizeLinksConstants(double unitarize_eps, double max_error, 
00093                                  bool allow_svd, bool svd_only,
00094                                  double svd_rel_error, double svd_abs_error, bool check_unitarization)
00095       {
00096 
00097         // not_set is only initialised once
00098         static bool not_set=true;
00099                 
00100         if(not_set){
00101           cudaMemcpyToSymbol("DEV_FL_UNITARIZE_EPS", &unitarize_eps, sizeof(double));
00102           cudaMemcpyToSymbol("DEV_FL_REUNIT_ALLOW_SVD", &allow_svd, sizeof(bool));
00103           cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_ONLY", &svd_only, sizeof(bool));
00104           cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_REL_ERROR", &svd_rel_error, sizeof(double));
00105           cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_ABS_ERROR", &svd_abs_error, sizeof(double));
00106           cudaMemcpyToSymbol("DEV_FL_MAX_ERROR", &max_error, sizeof(double));
00107           cudaMemcpyToSymbol("DEV_FL_CHECK_UNITARIZATION", &check_unitarization, sizeof(bool));
00108           
00109 
00110           HOST_FL_UNITARIZE_EPS = unitarize_eps;
00111           HOST_FL_REUNIT_ALLOW_SVD = allow_svd;
00112           HOST_FL_REUNIT_SVD_ONLY = svd_only;
00113           HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error;
00114           HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error;
00115           HOST_FL_MAX_ERROR = max_error;     
00116           HOST_FL_CHECK_UNITARIZATION = check_unitarization;
00117 
00118           not_set = false;
00119         }
00120         checkCudaError();
00121         return;
00122       }
00123 
00124 
00125     template<class T>
00126       __device__ __host__
00127     T getAbsMin(const T* const array, int size){
00128       T min = fabs(array[0]);
00129       for(int i=1; i<size; ++i){
00130         T abs_val = fabs(array[i]);
00131         if((abs_val) < min){ min = abs_val; }   
00132       }
00133       return min;
00134     }
00135 
00136 
00137           template<class Real>
00138     __device__ __host__
00139     inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
00140           {
00141                         if( fabs(a-b) <  epsilon) return true;
00142                         return false;
00143                 }
00144 
00145 
00146     template<class Real>
00147     __device__ __host__ 
00148     inline bool checkRelativeError(Real a, Real b, Real epsilon)
00149     {
00150       if( fabs((a-b)/b)  < epsilon ) return true;
00151       printf("Relative error = %g\n", fabs((a-b)/b));
00152       return false;
00153     }
00154     
00155 
00156 
00157 
00158     // Compute the reciprocal square root of the matrix q
00159     // Also modify q if the eigenvalues are dangerously small.
00160     template<class Cmplx> 
00161       __device__  __host__ 
00162       bool reciprocalRoot(const Matrix<Cmplx,3>& q, Matrix<Cmplx,3>* res){
00163 
00164                 Matrix<Cmplx,3> qsq, tempq;
00165 
00166 
00167                 typename RealTypeId<Cmplx>::Type c[3];
00168                 typename RealTypeId<Cmplx>::Type g[3];
00169 
00170                 qsq = q*q;
00171                 tempq = qsq*q;
00172 
00173                 c[0] = getTrace(q).x;
00174                 c[1] = getTrace(qsq).x/2.0;
00175                 c[2] = getTrace(tempq).x/3.0;
00176 
00177                 g[0] = g[1] = g[2] = c[0]/3.;
00178                 typename RealTypeId<Cmplx>::Type r,s,theta;
00179                 s = c[1]/3. - c[0]*c[0]/18;
00180 
00181 #ifdef __CUDA_ARCH__
00182 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS
00183 #else
00184 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS
00185 #endif
00186 
00187 
00188 #ifdef __CUDA_ARCH__
00189 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR
00190 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR
00191 #else // cpu
00192 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR
00193 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR
00194 #endif
00195 
00196 
00197                 typename RealTypeId<Cmplx>::Type cosTheta; 
00198                 if(fabs(s) >= FL_UNITARIZE_EPS){
00199                   r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
00200                   cosTheta = r/sqrt(s*s*s);
00201                   if(fabs(cosTheta) >= 1.0){
00202                     if( r > 0 ){ 
00203                         theta = 0.0;
00204                     }else{
00205                         theta = FL_UNITARIZE_PI;
00206                     }
00207                   }else{ 
00208                         theta = acos(cosTheta);
00209                   }
00210                   g[0] = c[0]/3 + 2*sqrt(s)*cos( theta/3 );
00211                   g[1] = c[0]/3 + 2*sqrt(s)*cos( theta/3 + FL_UNITARIZE_PI23 );
00212                   g[2] = c[0]/3 + 2*sqrt(s)*cos( theta/3 + 2*FL_UNITARIZE_PI23 );
00213                 }
00214                 
00215                 // Check the eigenvalues, if the determinant does not match the product of the eigenvalues
00216                 // return false. Then call SVD instead.
00217                 typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x;
00218                 if( fabs(det) < FL_REUNIT_SVD_ABS_ERROR ){ 
00219                         return false;
00220                 }
00221                 if( checkRelativeError(g[0]*g[1]*g[2],det,FL_REUNIT_SVD_REL_ERROR) == false ) return false;
00222 
00223 
00224                 // At this point we have finished with the c's 
00225                 // use these to store sqrt(g)
00226                 for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
00227 
00228                 // done with the g's, use these to store u, v, w
00229                 g[0] = c[0]+c[1]+c[2];
00230                 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
00231                 g[2] = c[0]*c[1]*c[2];
00232         
00233                 const typename RealTypeId<Cmplx>::Type & denominator  = g[2]*(g[0]*g[1]-g[2]); 
00234                 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
00235                 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
00236                 c[2] =  g[0]/denominator;
00237 
00238                 tempq = c[1]*q + c[2]*qsq;
00239                 // Add a real scalar
00240                 tempq(0,0).x += c[0];
00241                 tempq(1,1).x += c[0];
00242                 tempq(2,2).x += c[0];
00243 
00244                 *res = tempq;
00245                 
00246                 return true;
00247       }
00248 
00249 
00250 
00251 
00252   template<class Cmplx>
00253   __host__ __device__
00254   bool unitarizeLinkMILC(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
00255   {
00256     Matrix<Cmplx,3> u;
00257 #ifdef __CUDA_ARCH__
00258 #define FL_REUNIT_SVD_ONLY  DEV_FL_REUNIT_SVD_ONLY
00259 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD
00260 #else
00261 #define FL_REUNIT_SVD_ONLY  HOST_FL_REUNIT_SVD_ONLY
00262 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD
00263 #endif
00264     if( !FL_REUNIT_SVD_ONLY ){
00265       if( reciprocalRoot<Cmplx>(conj(in)*in,&u) ){
00266         *result = in*u;
00267         return true;
00268       }
00269     }
00270 
00271    // If we've got this far, then the Caley-Hamilton unitarization 
00272    // has failed. If SVD is not allowed, the unitarization has failed.
00273    if( !FL_REUNIT_ALLOW_SVD ) return false;
00274 
00275     Matrix<Cmplx,3> v;
00276     typename RealTypeId<Cmplx>::Type singular_values[3];
00277     computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u, v I guess
00278     *result = u*conj(v);
00279     return true;
00280   } // unitarizeMILC
00281     
00282 
00283    template<class Cmplx>
00284    __host__ __device__
00285    bool unitarizeLinkSVD(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
00286    {
00287         Matrix<Cmplx,3> u, v;
00288         typename RealTypeId<Cmplx>::Type singular_values[3];
00289         computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u,v I guess    
00290 
00291         *result = u*conj(v);
00292 
00293 #ifdef __CUDA_ARCH__ 
00294 #define FL_MAX_ERROR  DEV_FL_MAX_ERROR
00295 #else 
00296 #define FL_MAX_ERROR  HOST_FL_MAX_ERROR
00297 #endif
00298         if(isUnitary(*result,FL_MAX_ERROR)==false)
00299         {
00300 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
00301           printf("ERROR: Unitarized link is not consistent with incoming link\n");
00302 #endif
00303           return false;
00304         }
00305         return true;
00306    }
00307 #undef FL_MAX_ERROR
00308 
00309 
00310    template<class Cmplx>
00311    __host__ __device__
00312    bool unitarizeLinkNewton(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result)
00313    {
00314       Matrix<Cmplx,3> u, uinv;
00315       u = in;
00316 
00317 #ifdef __CUDA_ARCH__
00318 #define MAX_ITER DEV_MAX_ITER
00319 #else
00320 #define MAX_ITER HOST_MAX_ITER
00321 #endif
00322       for(int i=0; i<MAX_ITER; ++i){
00323          computeMatrixInverse(u, &uinv);
00324          u = 0.5*(u + conj(uinv));
00325       }
00326 
00327 #undef MAX_ITER 
00328       if(isUnitarizedLinkConsistent(in,u,0.0000001)==false)
00329       {
00330 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
00331         printf("ERROR: Unitarized link is not consistent with incoming link\n");
00332 #endif
00333         return false;
00334       }
00335       *result = u;
00336 
00337       return true;
00338    }   
00339 
00340 
00341   
00342 
00343 
00344 
00345 
00346   template<class Cmplx>
00347   __global__ void getUnitarizedField(const Cmplx* inlink_even, const Cmplx*  inlink_odd,
00348                                     Cmplx*  outlink_even, Cmplx*  outlink_odd,
00349                                     int* num_failures)
00350   {
00351     int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00352     const Cmplx* inlink;
00353     Cmplx* outlink;
00354 
00355     inlink  = inlink_even;
00356     outlink = outlink_even;
00357     
00358     if(mem_idx >= Vh){
00359       mem_idx = mem_idx - Vh;
00360       inlink  = inlink_odd;
00361       outlink = outlink_odd;
00362     }
00363 
00364     // Unitarization is always done in double precision
00365     Matrix<double2,3> v, result;
00366     for(int dir=0; dir<4; ++dir){
00367        loadLinkVariableFromArray(inlink, dir, mem_idx, Vh+INPUT_PADDING, &v); 
00368        unitarizeLinkMILC(v, &result);
00369 #ifdef __CUDA_ARCH__
00370 #define FL_MAX_ERROR DEV_FL_MAX_ERROR
00371 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION
00372 #else
00373 #define FL_MAX_ERROR HOST_FL_MAX_ERROR
00374 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION
00375 #endif
00376      if(FL_CHECK_UNITARIZATION){
00377         if(isUnitary(result,FL_MAX_ERROR) == false)
00378         {
00379 #ifdef __CUDA_ARCH__
00380           atomicAdd(num_failures, 1);
00381 #else 
00382           (*num_failures)++;
00383 #endif
00384         }
00385       }
00386       writeLinkVariableToArray(result, dir, mem_idx, Vh+OUTPUT_PADDING, outlink); 
00387     }
00388     return;
00389   }
00390 
00391 
00392   void unitarizeLinksCuda(const QudaGaugeParam& param, cudaGaugeField& infield, cudaGaugeField* outfield, int* num_failures)
00393   {
00394         
00395     dim3 gridDim(infield.Volume()/BLOCK_DIM,1,1);
00396     dim3 blockDim(BLOCK_DIM,1,1); 
00397 
00398 
00399     checkCudaError();
00400 
00401     if(param.cuda_prec == QUDA_SINGLE_PRECISION){
00402       getUnitarizedField<<<gridDim,blockDim>>>((float2*)infield.Even_p(), (float2*)infield.Odd_p(),
00403                                                (float2*)outfield->Even_p(), (float2*)outfield->Odd_p(),
00404                                                num_failures);
00405                                                 
00406     }else if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
00407       getUnitarizedField<<<gridDim,blockDim>>>((double2*)infield.Even_p(), (double2*)infield.Odd_p(),
00408                                                (double2*)outfield->Even_p(), (double2*)outfield->Odd_p(),
00409                                                num_failures);
00410     }
00411 
00412     checkCudaError();
00413     return;
00414   } // unitarize_links_cuda
00415 
00416 
00417  
00418   void unitarizeLinksCPU(const QudaGaugeParam& param, cpuGaugeField& infield, cpuGaugeField* outfield)
00419   {
00420     int num_failures = 0;
00421     Matrix<double2,3> inlink, outlink;
00422 
00423     for(int i=0; i<infield.Volume(); ++i){
00424         for(int dir=0; dir<4; ++dir){
00425           if(param.cpu_prec == QUDA_SINGLE_PRECISION){
00426             copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
00427             if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++; 
00428             copyLinkToArray(((float*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink); 
00429           }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
00430             copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
00431             if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++; 
00432             copyLinkToArray(((double*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink); 
00433           } // precision?
00434         } // dir
00435     }  // loop over volume
00436     return;
00437   }
00438 
00439   // CPU function which checks that the gauge field is unitary
00440   bool isUnitary(const QudaGaugeParam& param, cpuGaugeField& field, double max_error)
00441   {
00442     Matrix<double2,3> link, identity;
00443 
00444     for(int i=0; i<field.Volume(); ++i){
00445        for(int dir=0; dir<4; ++dir){
00446          if(param.cpu_prec == QUDA_SINGLE_PRECISION){
00447             copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
00448          }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){     
00449             copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
00450          }else{
00451           errorQuda("Unsupported precision\n");
00452          }
00453          if(isUnitary(link,max_error) == false){ 
00454             printf("Unitarity failure\n");
00455             printf("site index = %d,\t direction = %d\n", i, dir);
00456             printLink(link);
00457             identity = conj(link)*link;
00458             printLink(identity);
00459             return false;
00460          }
00461        } // dir
00462     } // i        
00463     return true;
00464   } // is unitary
00465 
00466 
00467 
00468 } // namespace hisq