QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <cstdlib> 00002 #include <cstdio> 00003 #include <iostream> 00004 #include <iomanip> 00005 #include <cuda.h> 00006 #include <gauge_field.h> 00007 00008 #include "quda_matrix.h" 00009 #include "svd_quda.h" 00010 #include <hisq_links_quda.h> 00011 00012 #ifndef FL_UNITARIZE_PI 00013 #define FL_UNITARIZE_PI 3.14159265358979323846 00014 #endif 00015 #ifndef FL_UNITARIZE_PI23 00016 #define FL_UNITARIZE_PI23 FL_UNITARIZE_PI*2.0/3.0 00017 #endif 00018 00019 __constant__ int INPUT_PADDING=0; 00020 __constant__ int OUTPUT_PADDING=0; 00021 __constant__ int DEV_MAX_ITER = 20; 00022 00023 static int HOST_MAX_ITER = 20; 00024 00025 __constant__ double DEV_FL_MAX_ERROR; 00026 __constant__ double DEV_FL_UNITARIZE_EPS; 00027 __constant__ bool DEV_FL_REUNIT_ALLOW_SVD; 00028 __constant__ bool DEV_FL_REUNIT_SVD_ONLY; 00029 __constant__ double DEV_FL_REUNIT_SVD_REL_ERROR; 00030 __constant__ double DEV_FL_REUNIT_SVD_ABS_ERROR; 00031 __constant__ bool DEV_FL_CHECK_UNITARIZATION; 00032 00033 static double HOST_FL_MAX_ERROR; 00034 static double HOST_FL_UNITARIZE_EPS; 00035 static bool HOST_FL_REUNIT_ALLOW_SVD; 00036 static bool HOST_FL_REUNIT_SVD_ONLY; 00037 static double HOST_FL_REUNIT_SVD_REL_ERROR; 00038 static double HOST_FL_REUNIT_SVD_ABS_ERROR; 00039 static bool HOST_FL_CHECK_UNITARIZATION; 00040 namespace hisq{ 00041 00042 void setUnitarizeLinksPadding(int input_padding, int output_padding) 00043 { 00044 cudaMemcpyToSymbol("INPUT_PADDING", &input_padding, sizeof(int)); 00045 cudaMemcpyToSymbol("OUTPUT_PADDING", &output_padding, sizeof(int)); 00046 return; 00047 } 00048 00049 00050 template<class Cmplx> 00051 __device__ __host__ 00052 bool isUnitary(const Matrix<Cmplx,3>& matrix, double max_error) 00053 { 00054 const Matrix<Cmplx,3> identity = conj(matrix)*matrix; 00055 00056 for(int i=0; i<3; ++i){ 00057 if( fabs(identity(i,i).x - 1.0) > max_error || fabs(identity(i,i).y) > max_error) return false; 00058 for(int j=i+1; j<3; ++j){ 00059 if( fabs(identity(i,j).x) > max_error || fabs(identity(i,j).y) > max_error 00060 || fabs(identity(j,i).x) > max_error || fabs(identity(j,i).y) > max_error ){ 00061 return false; 00062 } 00063 } 00064 } 00065 return true; 00066 } 00067 00068 00069 00070 template<class Cmplx> 00071 __device__ __host__ 00072 bool isUnitarizedLinkConsistent(const Matrix<Cmplx,3>& initial_matrix, 00073 const Matrix<Cmplx,3>& unitary_matrix, 00074 double max_error) 00075 { 00076 Matrix<Cmplx,3> temporary; 00077 temporary = conj(initial_matrix)*unitary_matrix; 00078 temporary = temporary*temporary - conj(initial_matrix)*initial_matrix; 00079 00080 for(int i=0; i<3; ++i){ 00081 for(int j=0; j<3; ++j){ 00082 if( fabs(temporary(i,j).x) > max_error || fabs(temporary(i,j).y) > max_error){ 00083 return false; 00084 } 00085 } 00086 } 00087 return true; 00088 } 00089 00090 00091 00092 void setUnitarizeLinksConstants(double unitarize_eps, double max_error, 00093 bool allow_svd, bool svd_only, 00094 double svd_rel_error, double svd_abs_error, bool check_unitarization) 00095 { 00096 00097 // not_set is only initialised once 00098 static bool not_set=true; 00099 00100 if(not_set){ 00101 cudaMemcpyToSymbol("DEV_FL_UNITARIZE_EPS", &unitarize_eps, sizeof(double)); 00102 cudaMemcpyToSymbol("DEV_FL_REUNIT_ALLOW_SVD", &allow_svd, sizeof(bool)); 00103 cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_ONLY", &svd_only, sizeof(bool)); 00104 cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_REL_ERROR", &svd_rel_error, sizeof(double)); 00105 cudaMemcpyToSymbol("DEV_FL_REUNIT_SVD_ABS_ERROR", &svd_abs_error, sizeof(double)); 00106 cudaMemcpyToSymbol("DEV_FL_MAX_ERROR", &max_error, sizeof(double)); 00107 cudaMemcpyToSymbol("DEV_FL_CHECK_UNITARIZATION", &check_unitarization, sizeof(bool)); 00108 00109 00110 HOST_FL_UNITARIZE_EPS = unitarize_eps; 00111 HOST_FL_REUNIT_ALLOW_SVD = allow_svd; 00112 HOST_FL_REUNIT_SVD_ONLY = svd_only; 00113 HOST_FL_REUNIT_SVD_REL_ERROR = svd_rel_error; 00114 HOST_FL_REUNIT_SVD_ABS_ERROR = svd_abs_error; 00115 HOST_FL_MAX_ERROR = max_error; 00116 HOST_FL_CHECK_UNITARIZATION = check_unitarization; 00117 00118 not_set = false; 00119 } 00120 checkCudaError(); 00121 return; 00122 } 00123 00124 00125 template<class T> 00126 __device__ __host__ 00127 T getAbsMin(const T* const array, int size){ 00128 T min = fabs(array[0]); 00129 for(int i=1; i<size; ++i){ 00130 T abs_val = fabs(array[i]); 00131 if((abs_val) < min){ min = abs_val; } 00132 } 00133 return min; 00134 } 00135 00136 00137 template<class Real> 00138 __device__ __host__ 00139 inline bool checkAbsoluteError(Real a, Real b, Real epsilon) 00140 { 00141 if( fabs(a-b) < epsilon) return true; 00142 return false; 00143 } 00144 00145 00146 template<class Real> 00147 __device__ __host__ 00148 inline bool checkRelativeError(Real a, Real b, Real epsilon) 00149 { 00150 if( fabs((a-b)/b) < epsilon ) return true; 00151 printf("Relative error = %g\n", fabs((a-b)/b)); 00152 return false; 00153 } 00154 00155 00156 00157 00158 // Compute the reciprocal square root of the matrix q 00159 // Also modify q if the eigenvalues are dangerously small. 00160 template<class Cmplx> 00161 __device__ __host__ 00162 bool reciprocalRoot(const Matrix<Cmplx,3>& q, Matrix<Cmplx,3>* res){ 00163 00164 Matrix<Cmplx,3> qsq, tempq; 00165 00166 00167 typename RealTypeId<Cmplx>::Type c[3]; 00168 typename RealTypeId<Cmplx>::Type g[3]; 00169 00170 qsq = q*q; 00171 tempq = qsq*q; 00172 00173 c[0] = getTrace(q).x; 00174 c[1] = getTrace(qsq).x/2.0; 00175 c[2] = getTrace(tempq).x/3.0; 00176 00177 g[0] = g[1] = g[2] = c[0]/3.; 00178 typename RealTypeId<Cmplx>::Type r,s,theta; 00179 s = c[1]/3. - c[0]*c[0]/18; 00180 00181 #ifdef __CUDA_ARCH__ 00182 #define FL_UNITARIZE_EPS DEV_FL_UNITARIZE_EPS 00183 #else 00184 #define FL_UNITARIZE_EPS HOST_FL_UNITARIZE_EPS 00185 #endif 00186 00187 00188 #ifdef __CUDA_ARCH__ 00189 #define FL_REUNIT_SVD_REL_ERROR DEV_FL_REUNIT_SVD_REL_ERROR 00190 #define FL_REUNIT_SVD_ABS_ERROR DEV_FL_REUNIT_SVD_ABS_ERROR 00191 #else // cpu 00192 #define FL_REUNIT_SVD_REL_ERROR HOST_FL_REUNIT_SVD_REL_ERROR 00193 #define FL_REUNIT_SVD_ABS_ERROR HOST_FL_REUNIT_SVD_ABS_ERROR 00194 #endif 00195 00196 00197 typename RealTypeId<Cmplx>::Type cosTheta; 00198 if(fabs(s) >= FL_UNITARIZE_EPS){ 00199 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.); 00200 cosTheta = r/sqrt(s*s*s); 00201 if(fabs(cosTheta) >= 1.0){ 00202 if( r > 0 ){ 00203 theta = 0.0; 00204 }else{ 00205 theta = FL_UNITARIZE_PI; 00206 } 00207 }else{ 00208 theta = acos(cosTheta); 00209 } 00210 g[0] = c[0]/3 + 2*sqrt(s)*cos( theta/3 ); 00211 g[1] = c[0]/3 + 2*sqrt(s)*cos( theta/3 + FL_UNITARIZE_PI23 ); 00212 g[2] = c[0]/3 + 2*sqrt(s)*cos( theta/3 + 2*FL_UNITARIZE_PI23 ); 00213 } 00214 00215 // Check the eigenvalues, if the determinant does not match the product of the eigenvalues 00216 // return false. Then call SVD instead. 00217 typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x; 00218 if( fabs(det) < FL_REUNIT_SVD_ABS_ERROR ){ 00219 return false; 00220 } 00221 if( checkRelativeError(g[0]*g[1]*g[2],det,FL_REUNIT_SVD_REL_ERROR) == false ) return false; 00222 00223 00224 // At this point we have finished with the c's 00225 // use these to store sqrt(g) 00226 for(int i=0; i<3; ++i) c[i] = sqrt(g[i]); 00227 00228 // done with the g's, use these to store u, v, w 00229 g[0] = c[0]+c[1]+c[2]; 00230 g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2]; 00231 g[2] = c[0]*c[1]*c[2]; 00232 00233 const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]); 00234 c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator; 00235 c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator; 00236 c[2] = g[0]/denominator; 00237 00238 tempq = c[1]*q + c[2]*qsq; 00239 // Add a real scalar 00240 tempq(0,0).x += c[0]; 00241 tempq(1,1).x += c[0]; 00242 tempq(2,2).x += c[0]; 00243 00244 *res = tempq; 00245 00246 return true; 00247 } 00248 00249 00250 00251 00252 template<class Cmplx> 00253 __host__ __device__ 00254 bool unitarizeLinkMILC(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result) 00255 { 00256 Matrix<Cmplx,3> u; 00257 #ifdef __CUDA_ARCH__ 00258 #define FL_REUNIT_SVD_ONLY DEV_FL_REUNIT_SVD_ONLY 00259 #define FL_REUNIT_ALLOW_SVD DEV_FL_REUNIT_ALLOW_SVD 00260 #else 00261 #define FL_REUNIT_SVD_ONLY HOST_FL_REUNIT_SVD_ONLY 00262 #define FL_REUNIT_ALLOW_SVD HOST_FL_REUNIT_ALLOW_SVD 00263 #endif 00264 if( !FL_REUNIT_SVD_ONLY ){ 00265 if( reciprocalRoot<Cmplx>(conj(in)*in,&u) ){ 00266 *result = in*u; 00267 return true; 00268 } 00269 } 00270 00271 // If we've got this far, then the Caley-Hamilton unitarization 00272 // has failed. If SVD is not allowed, the unitarization has failed. 00273 if( !FL_REUNIT_ALLOW_SVD ) return false; 00274 00275 Matrix<Cmplx,3> v; 00276 typename RealTypeId<Cmplx>::Type singular_values[3]; 00277 computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u, v I guess 00278 *result = u*conj(v); 00279 return true; 00280 } // unitarizeMILC 00281 00282 00283 template<class Cmplx> 00284 __host__ __device__ 00285 bool unitarizeLinkSVD(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result) 00286 { 00287 Matrix<Cmplx,3> u, v; 00288 typename RealTypeId<Cmplx>::Type singular_values[3]; 00289 computeSVD<Cmplx>(in, u, v, singular_values); // should pass pointers to u,v I guess 00290 00291 *result = u*conj(v); 00292 00293 #ifdef __CUDA_ARCH__ 00294 #define FL_MAX_ERROR DEV_FL_MAX_ERROR 00295 #else 00296 #define FL_MAX_ERROR HOST_FL_MAX_ERROR 00297 #endif 00298 if(isUnitary(*result,FL_MAX_ERROR)==false) 00299 { 00300 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200)) 00301 printf("ERROR: Unitarized link is not consistent with incoming link\n"); 00302 #endif 00303 return false; 00304 } 00305 return true; 00306 } 00307 #undef FL_MAX_ERROR 00308 00309 00310 template<class Cmplx> 00311 __host__ __device__ 00312 bool unitarizeLinkNewton(const Matrix<Cmplx,3>& in, Matrix<Cmplx,3>* const result) 00313 { 00314 Matrix<Cmplx,3> u, uinv; 00315 u = in; 00316 00317 #ifdef __CUDA_ARCH__ 00318 #define MAX_ITER DEV_MAX_ITER 00319 #else 00320 #define MAX_ITER HOST_MAX_ITER 00321 #endif 00322 for(int i=0; i<MAX_ITER; ++i){ 00323 computeMatrixInverse(u, &uinv); 00324 u = 0.5*(u + conj(uinv)); 00325 } 00326 00327 #undef MAX_ITER 00328 if(isUnitarizedLinkConsistent(in,u,0.0000001)==false) 00329 { 00330 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200)) 00331 printf("ERROR: Unitarized link is not consistent with incoming link\n"); 00332 #endif 00333 return false; 00334 } 00335 *result = u; 00336 00337 return true; 00338 } 00339 00340 00341 00342 00343 00344 00345 00346 template<class Cmplx> 00347 __global__ void getUnitarizedField(const Cmplx* inlink_even, const Cmplx* inlink_odd, 00348 Cmplx* outlink_even, Cmplx* outlink_odd, 00349 int* num_failures) 00350 { 00351 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x; 00352 const Cmplx* inlink; 00353 Cmplx* outlink; 00354 00355 inlink = inlink_even; 00356 outlink = outlink_even; 00357 00358 if(mem_idx >= Vh){ 00359 mem_idx = mem_idx - Vh; 00360 inlink = inlink_odd; 00361 outlink = outlink_odd; 00362 } 00363 00364 // Unitarization is always done in double precision 00365 Matrix<double2,3> v, result; 00366 for(int dir=0; dir<4; ++dir){ 00367 loadLinkVariableFromArray(inlink, dir, mem_idx, Vh+INPUT_PADDING, &v); 00368 unitarizeLinkMILC(v, &result); 00369 #ifdef __CUDA_ARCH__ 00370 #define FL_MAX_ERROR DEV_FL_MAX_ERROR 00371 #define FL_CHECK_UNITARIZATION DEV_FL_CHECK_UNITARIZATION 00372 #else 00373 #define FL_MAX_ERROR HOST_FL_MAX_ERROR 00374 #define FL_CHECK_UNITARIZATION HOST_FL_CHECK_UNITARIZATION 00375 #endif 00376 if(FL_CHECK_UNITARIZATION){ 00377 if(isUnitary(result,FL_MAX_ERROR) == false) 00378 { 00379 #ifdef __CUDA_ARCH__ 00380 atomicAdd(num_failures, 1); 00381 #else 00382 (*num_failures)++; 00383 #endif 00384 } 00385 } 00386 writeLinkVariableToArray(result, dir, mem_idx, Vh+OUTPUT_PADDING, outlink); 00387 } 00388 return; 00389 } 00390 00391 00392 void unitarizeLinksCuda(const QudaGaugeParam& param, cudaGaugeField& infield, cudaGaugeField* outfield, int* num_failures) 00393 { 00394 00395 dim3 gridDim(infield.Volume()/BLOCK_DIM,1,1); 00396 dim3 blockDim(BLOCK_DIM,1,1); 00397 00398 00399 checkCudaError(); 00400 00401 if(param.cuda_prec == QUDA_SINGLE_PRECISION){ 00402 getUnitarizedField<<<gridDim,blockDim>>>((float2*)infield.Even_p(), (float2*)infield.Odd_p(), 00403 (float2*)outfield->Even_p(), (float2*)outfield->Odd_p(), 00404 num_failures); 00405 00406 }else if(param.cuda_prec == QUDA_DOUBLE_PRECISION){ 00407 getUnitarizedField<<<gridDim,blockDim>>>((double2*)infield.Even_p(), (double2*)infield.Odd_p(), 00408 (double2*)outfield->Even_p(), (double2*)outfield->Odd_p(), 00409 num_failures); 00410 } 00411 00412 checkCudaError(); 00413 return; 00414 } // unitarize_links_cuda 00415 00416 00417 00418 void unitarizeLinksCPU(const QudaGaugeParam& param, cpuGaugeField& infield, cpuGaugeField* outfield) 00419 { 00420 int num_failures = 0; 00421 Matrix<double2,3> inlink, outlink; 00422 00423 for(int i=0; i<infield.Volume(); ++i){ 00424 for(int dir=0; dir<4; ++dir){ 00425 if(param.cpu_prec == QUDA_SINGLE_PRECISION){ 00426 copyArrayToLink(&inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? 00427 if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++; 00428 copyLinkToArray(((float*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink); 00429 }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){ 00430 copyArrayToLink(&inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? 00431 if( unitarizeLinkNewton<double2>(inlink, &outlink) == false ) num_failures++; 00432 copyLinkToArray(((double*)(outfield->Gauge_p()) + (i*4 + dir)*18), outlink); 00433 } // precision? 00434 } // dir 00435 } // loop over volume 00436 return; 00437 } 00438 00439 // CPU function which checks that the gauge field is unitary 00440 bool isUnitary(const QudaGaugeParam& param, cpuGaugeField& field, double max_error) 00441 { 00442 Matrix<double2,3> link, identity; 00443 00444 for(int i=0; i<field.Volume(); ++i){ 00445 for(int dir=0; dir<4; ++dir){ 00446 if(param.cpu_prec == QUDA_SINGLE_PRECISION){ 00447 copyArrayToLink(&link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? 00448 }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){ 00449 copyArrayToLink(&link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? 00450 }else{ 00451 errorQuda("Unsupported precision\n"); 00452 } 00453 if(isUnitary(link,max_error) == false){ 00454 printf("Unitarity failure\n"); 00455 printf("site index = %d,\t direction = %d\n", i, dir); 00456 printLink(link); 00457 identity = conj(link)*link; 00458 printLink(identity); 00459 return false; 00460 } 00461 } // dir 00462 } // i 00463 return true; 00464 } // is unitary 00465 00466 00467 00468 } // namespace hisq