QUDA v0.4.0
A library for QCD on GPUs
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <iomanip>
#include <cuda.h>
#include <gauge_field.h>

#include "quda_matrix.h"
#include "svd_quda.h"


#define HISQ_UNITARIZE_PI 3.14159265358979323846
#define HISQ_UNITARIZE_PI23 HISQ_UNITARIZE_PI*2.0/3.0

// constants - File scope only
__constant__ double DEV_HISQ_UNITARIZE_EPS;
__constant__ double DEV_HISQ_FORCE_FILTER;
__constant__ double DEV_MAX_DET_ERROR;
__constant__ bool   DEV_REUNIT_ALLOW_SVD;
__constant__ bool   DEV_REUNIT_SVD_ONLY;
__constant__ double DEV_REUNIT_SVD_REL_ERROR;
__constant__ double DEV_REUNIT_SVD_ABS_ERROR;

static double HOST_HISQ_UNITARIZE_EPS;
static double HOST_HISQ_FORCE_FILTER;
static double HOST_MAX_DET_ERROR;
static bool   HOST_REUNIT_ALLOW_SVD;
static bool   HOST_REUNIT_SVD_ONLY;
static double HOST_REUNIT_SVD_REL_ERROR;
static double HOST_REUNIT_SVD_ABS_ERROR;


namespace hisq{
namespace fermion_force{


void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error,
                                bool allow_svd, bool svd_only,
                                double svd_rel_error, double svd_abs_error)
{
  // not_set is only initialised once
  static bool not_set = true;

  if(not_set){
    cudaMemcpyToSymbol("DEV_HISQ_UNITARIZE_EPS", &unitarize_eps, sizeof(double));
    cudaMemcpyToSymbol("DEV_HISQ_FORCE_FILTER", &hisq_force_filter, sizeof(double));
    cudaMemcpyToSymbol("DEV_MAX_DET_ERROR", &max_det_error, sizeof(double));
    cudaMemcpyToSymbol("DEV_REUNIT_ALLOW_SVD", &allow_svd, sizeof(bool));
    cudaMemcpyToSymbol("DEV_REUNIT_SVD_ONLY", &svd_only, sizeof(bool));
    cudaMemcpyToSymbol("DEV_REUNIT_SVD_REL_ERROR", &svd_rel_error, sizeof(double));
    cudaMemcpyToSymbol("DEV_REUNIT_SVD_ABS_ERROR", &svd_abs_error, sizeof(double));

    HOST_HISQ_UNITARIZE_EPS = unitarize_eps;
    HOST_HISQ_FORCE_FILTER = hisq_force_filter;
    HOST_MAX_DET_ERROR = max_det_error;
    HOST_REUNIT_ALLOW_SVD = allow_svd;
    HOST_REUNIT_SVD_ONLY = svd_only;
    HOST_REUNIT_SVD_REL_ERROR = svd_rel_error;
    HOST_REUNIT_SVD_ABS_ERROR = svd_abs_error;

    not_set = false;
  }
  checkCudaError();
  return;
}
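
// Illustrative call (added for clarity; not part of the original source, and the
// numerical tolerances below are placeholders rather than values prescribed by QUDA):
//
//   hisq::fermion_force::setUnitarizeForceConstants(1e-14,  // unitarize_eps
//                                                   5e-5,   // hisq_force_filter
//                                                   1e-10,  // max_det_error
//                                                   true,   // allow_svd
//                                                   false,  // svd_only
//                                                   1e-6,   // svd_rel_error
//                                                   1e-6);  // svd_abs_error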

template<class Real>
class DerivativeCoefficients{
  private:
    Real b[6];
    __device__ __host__
    Real computeC00(const Real &, const Real &, const Real &);
    __device__ __host__
    Real computeC01(const Real &, const Real &, const Real &);
    __device__ __host__
    Real computeC02(const Real &, const Real &, const Real &);
    __device__ __host__
    Real computeC11(const Real &, const Real &, const Real &);
    __device__ __host__
    Real computeC12(const Real &, const Real &, const Real &);
    __device__ __host__
    Real computeC22(const Real &, const Real &, const Real &);

  public:
    __device__ __host__ void set(const Real & u, const Real & v, const Real & w);
    __device__ __host__
    Real getB00() const { return b[0]; }
    __device__ __host__
    Real getB01() const { return b[1]; }
    __device__ __host__
    Real getB02() const { return b[2]; }
    __device__ __host__
    Real getB11() const { return b[3]; }
    __device__ __host__
    Real getB12() const { return b[4]; }
    __device__ __host__
    Real getB22() const { return b[5]; }
};

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC00(const Real & u, const Real & v, const Real & w){
  Real result = - pow(w,3)*pow(u,6)
              + 3*v*pow(w,3)*pow(u,4)
              + 3*pow(v,4)*w*pow(u,4)
              - pow(v,6)*pow(u,3)
              - 4*pow(w,4)*pow(u,3)
              - 12*pow(v,3)*pow(w,2)*pow(u,3)
              + 16*pow(v,2)*pow(w,3)*pow(u,2)
              + 3*pow(v,5)*w*pow(u,2)
              - 8*v*pow(w,4)*u
              - 3*pow(v,4)*pow(w,2)*u
              + pow(w,5)
              + pow(v,3)*pow(w,3);

  return result;
}

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC01(const Real & u, const Real & v, const Real & w){
  Real result = - pow(w,2)*pow(u,7)
              - pow(v,2)*w*pow(u,6)
              + pow(v,4)*pow(u,5)   // This was corrected!
              + 6*v*pow(w,2)*pow(u,5)
              - 5*pow(w,3)*pow(u,4) // This was corrected!
              - pow(v,3)*w*pow(u,4)
              - 2*pow(v,5)*pow(u,3)
              - 6*pow(v,2)*pow(w,2)*pow(u,3)
              + 10*v*pow(w,3)*pow(u,2)
              + 6*pow(v,4)*w*pow(u,2)
              - 3*pow(w,4)*u
              - 6*pow(v,3)*pow(w,2)*u
              + 2*pow(v,2)*pow(w,3);
  return result;
}

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC02(const Real & u, const Real & v, const Real & w){
  Real result = pow(w,2)*pow(u,5)
              + pow(v,2)*w*pow(u,4)
              - pow(v,4)*pow(u,3)
              - 4*v*pow(w,2)*pow(u,3)
              + 4*pow(w,3)*pow(u,2)
              + 3*pow(v,3)*w*pow(u,2)
              - 3*pow(v,2)*pow(w,2)*u
              + v*pow(w,3);
  return result;
}

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC11(const Real & u, const Real & v, const Real & w){
  Real result = - w*pow(u,8)
              - pow(v,2)*pow(u,7)
              + 7*v*w*pow(u,6)
              + 4*pow(v,3)*pow(u,5)
              - 5*pow(w,2)*pow(u,5)
              - 16*pow(v,2)*w*pow(u,4)
              - 4*pow(v,4)*pow(u,3)
              + 16*v*pow(w,2)*pow(u,3)
              - 3*pow(w,3)*pow(u,2)
              + 12*pow(v,3)*w*pow(u,2)
              - 12*pow(v,2)*pow(w,2)*u
              + 3*v*pow(w,3);
  return result;
}

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC12(const Real & u, const Real & v, const Real & w){
  Real result = w*pow(u,6)
              + pow(v,2)*pow(u,5) // Fixed this!
              - 5*v*w*pow(u,4)    // Fixed this!
              - 2*pow(v,3)*pow(u,3)
              + 4*pow(w,2)*pow(u,3)
              + 6*pow(v,2)*w*pow(u,2)
              - 6*v*pow(w,2)*u
              + pow(w,3);
  return result;
}

template<class Real>
__device__ __host__
Real DerivativeCoefficients<Real>::computeC22(const Real & u, const Real & v, const Real & w){
  Real result = - w*pow(u,4)
              - pow(v,2)*pow(u,3)
              + 3*v*w*pow(u,2)
              - 3*pow(w,2)*u;
  return result;
}

template <class Real>
__device__ __host__
void DerivativeCoefficients<Real>::set(const Real & u, const Real & v, const Real & w){
  const Real & denominator = 2.0*pow(w*(u*v-w),3);
  b[0] = computeC00(u,v,w)/denominator;
  b[1] = computeC01(u,v,w)/denominator;
  b[2] = computeC02(u,v,w)/denominator;
  b[3] = computeC11(u,v,w)/denominator;
  b[4] = computeC12(u,v,w)/denominator;
  b[5] = computeC22(u,v,w)/denominator;
  return;
}


template<class Cmplx>
__device__ __host__
void accumBothDerivatives(Matrix<Cmplx,3>* result, const Matrix<Cmplx,3> & left, const Matrix<Cmplx,3> & right, const Matrix<Cmplx,3> & outer_prod)
{
  const typename RealTypeId<Cmplx>::Type temp = 2.0*getTrace(left*outer_prod).x;
  for(int k=0; k<3; ++k){
    for(int l=0; l<3; ++l){
      // Need to write it this way to get it to work
      // on the CPU. Not sure why.
      result->operator()(k,l).x += temp*right(k,l).x;
      result->operator()(k,l).y += temp*right(k,l).y;
    }
  }
  return;
}


template<class Cmplx>
__device__ __host__
void accumDerivatives(Matrix<Cmplx,3>* result, const Matrix<Cmplx,3> & left, const Matrix<Cmplx,3> & right, const Matrix<Cmplx,3> & outer_prod)
{
  Cmplx temp = getTrace(left*outer_prod);
  for(int k=0; k<3; ++k){
    for(int l=0; l<3; ++l){
      result->operator()(k,l) = temp*right(k,l);
    }
  }
  return;
}


template<class T>
__device__ __host__
T getAbsMin(const T* const array, int size){
  T min = fabs(array[0]);
  for(int i=1; i<size; ++i){
    T abs_val = fabs(array[i]);
    if((abs_val) < min){ min = abs_val; }
  }
  return min;
}


template<class Real>
__device__ __host__
inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
{
  if( fabs(a-b) < epsilon ) return true;
  return false;
}


template<class Real>
__device__ __host__
inline bool checkRelativeError(Real a, Real b, Real epsilon)
{
  if( fabs((a-b)/b) < epsilon ) return true;
  return false;
}


// Compute the reciprocal square root of the matrix q
// Also modify q if the eigenvalues are dangerously small.
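//
// Outline of the method (added for clarity, reconstructed from the code below):
// with Q = V^dagger V Hermitian and positive, the traces of Q, Q^2 and Q^3 fix the
// coefficients of its characteristic polynomial; the resulting cubic is solved in
// closed (trigonometric) form for the three eigenvalues g[i]. From c[i] = sqrt(g[i])
// the elementary symmetric functions u, v, w are formed, and the Cayley-Hamilton
// theorem gives the closed form
//   Q^{-1/2} = f[0]*I + f[1]*Q + f[2]*Q^2.
// If the analytic eigenvalues look untrustworthy (tiny determinant, or eigenvalue
// product disagreeing with det Q), the routine falls back to an explicit SVD.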
template<class Cmplx>
__device__ __host__
void reciprocalRoot(Matrix<Cmplx,3>* res, DerivativeCoefficients<typename RealTypeId<Cmplx>::Type>* deriv_coeffs,
                    typename RealTypeId<Cmplx>::Type f[3], Matrix<Cmplx,3> & q, int *unitarization_failed){

  Matrix<Cmplx,3> qsq, tempq;

  typename RealTypeId<Cmplx>::Type c[3];
  typename RealTypeId<Cmplx>::Type g[3];

#ifdef __CUDA_ARCH__
#define REUNIT_SVD_ONLY DEV_REUNIT_SVD_ONLY
#else
#define REUNIT_SVD_ONLY HOST_REUNIT_SVD_ONLY
#endif
  if(!REUNIT_SVD_ONLY){
    qsq = q*q;
    tempq = qsq*q;

    c[0] = getTrace(q).x;
    c[1] = getTrace(qsq).x/2.0;
    c[2] = getTrace(tempq).x/3.0;

    g[0] = g[1] = g[2] = c[0]/3.;
    typename RealTypeId<Cmplx>::Type r,s,theta;
    s = c[1]/3. - c[0]*c[0]/18;
    r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);

#ifdef __CUDA_ARCH__
#define HISQ_UNITARIZE_EPS DEV_HISQ_UNITARIZE_EPS
#else
#define HISQ_UNITARIZE_EPS HOST_HISQ_UNITARIZE_EPS
#endif

    typename RealTypeId<Cmplx>::Type cosTheta = r/sqrt(s*s*s);
    if(fabs(s) < HISQ_UNITARIZE_EPS){
      cosTheta = 1.;
      s = 0.0;
    }
    if(fabs(cosTheta)>1.0){ r>0 ? theta=0.0 : theta=HISQ_UNITARIZE_PI/3.0; }
    else{ theta = acos(cosTheta)/3.0; }

    s = 2.0*sqrt(s);
    for(int i=0; i<3; ++i){
      g[i] += s*cos(theta + (i-1)*HISQ_UNITARIZE_PI23);
    }

  } // !REUNIT_SVD_ONLY?

  //
  // Compare the product of the eigenvalues computed thus far to the
  // absolute value of the determinant.
  // If the determinant is very small or the relative error is greater than some predefined value
  // then recompute the eigenvalues using a singular-value decomposition.
  // Note that this particular calculation contains multiple branches,
  // so it doesn't appear to be particularly well-suited to the GPU
  // programming model. However, the analytic calculation of the
  // unitarization is extremely fast, and if the SVD routine is not called
  // too often, we expect pretty good performance.
  //

#ifdef __CUDA_ARCH__
#define REUNIT_ALLOW_SVD DEV_REUNIT_ALLOW_SVD
#define REUNIT_SVD_REL_ERROR DEV_REUNIT_SVD_REL_ERROR
#define REUNIT_SVD_ABS_ERROR DEV_REUNIT_SVD_ABS_ERROR
#else // cpu
#define REUNIT_ALLOW_SVD HOST_REUNIT_ALLOW_SVD
#define REUNIT_SVD_REL_ERROR HOST_REUNIT_SVD_REL_ERROR
#define REUNIT_SVD_ABS_ERROR HOST_REUNIT_SVD_ABS_ERROR
#endif

  if(REUNIT_ALLOW_SVD){
    bool perform_svd = true;
    if(!REUNIT_SVD_ONLY){
      const typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x;
      if( fabs(det) >= REUNIT_SVD_ABS_ERROR){
        if( checkRelativeError(g[0]*g[1]*g[2],det,REUNIT_SVD_REL_ERROR) ) perform_svd = false;
      }
    }

    if(perform_svd){
      printf("Performing SVD in force\n");
      Matrix<Cmplx,3> tmp2;
      // compute the eigenvalues using the singular value decomposition
      computeSVD<Cmplx>(q,tempq,tmp2,g);
      // The array g contains the eigenvalues of the matrix q
      // The determinant is the product of the eigenvalues, and I can use this
      // to check the SVD
      const typename RealTypeId<Cmplx>::Type determinant = getDeterminant(q).x;
      const typename RealTypeId<Cmplx>::Type gprod = g[0]*g[1]*g[2];
      // Check the svd result for errors
#ifdef __CUDA_ARCH__
#define MAX_DET_ERROR DEV_MAX_DET_ERROR
#else
#define MAX_DET_ERROR HOST_MAX_DET_ERROR
#endif
      if(fabs(gprod - determinant) > MAX_DET_ERROR){
#if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__ >= 200))
        printf("Warning: Error in determinant computed by SVD : %g > %g\n", fabs(gprod-determinant), MAX_DET_ERROR);
        printLink(q);
#endif

#ifdef __CUDA_ARCH__
        atomicAdd(unitarization_failed,1);
#else
        (*unitarization_failed)++;
#endif
      }
    } // perform_svd?

  } // REUNIT_ALLOW_SVD?

#ifdef __CUDA_ARCH__
#define HISQ_FORCE_FILTER DEV_HISQ_FORCE_FILTER
#else
#define HISQ_FORCE_FILTER HOST_HISQ_FORCE_FILTER
#endif
  typename RealTypeId<Cmplx>::Type delta = getAbsMin(g,3);
  if(delta < HISQ_FORCE_FILTER){
    for(int i=0; i<3; ++i){
      g[i] += HISQ_FORCE_FILTER;
      q(i,i).x += HISQ_FORCE_FILTER;
    }
    qsq = q*q; // recalculate Q^2
  }


  // At this point we have finished with the c's
  // use these to store sqrt(g)
  for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);

  // done with the g's, use these to store u, v, w
  g[0] = c[0]+c[1]+c[2];
  g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
  g[2] = c[0]*c[1]*c[2];

  // set the derivative coefficients!
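  // Note (added for clarity): g[0], g[1], g[2] now hold the elementary symmetric
  // functions u, v, w of the square roots of the eigenvalues of Q. The derivative
  // coefficients B_ij built from them are consumed by getUnitarizeForceSite() below,
  // and the closed-form coefficients computed next give
  //   Q^{-1/2} = f[0]*I + f[1]*Q + f[2]*Q^2.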
  deriv_coeffs->set(g[0], g[1], g[2]);

  const typename RealTypeId<Cmplx>::Type & denominator = g[2]*(g[0]*g[1]-g[2]);
  c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
  c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
  c[2] = g[0]/denominator;

  tempq = c[1]*q + c[2]*qsq;
  // Add a real scalar
  tempq(0,0).x += c[0];
  tempq(1,1).x += c[0];
  tempq(2,2).x += c[0];

  f[0] = c[0];
  f[1] = c[1];
  f[2] = c[2];

  *res = tempq;
  return;
}


// "v" denotes a "fattened" link variable
template<class Cmplx>
__device__ __host__
void getUnitarizeForceSite(const Matrix<Cmplx,3> & v, const Matrix<Cmplx,3> & outer_prod, Matrix<Cmplx,3>* result, int *unitarization_failed)
{
  typename RealTypeId<Cmplx>::Type f[3];
  typename RealTypeId<Cmplx>::Type b[6];

  Matrix<Cmplx,3> v_dagger = conj(v);  // okay!
  Matrix<Cmplx,3> q = v_dagger*v;      // okay!

  Matrix<Cmplx,3> rsqrt_q;

  DerivativeCoefficients<typename RealTypeId<Cmplx>::Type> deriv_coeffs;

  reciprocalRoot<Cmplx>(&rsqrt_q, &deriv_coeffs, f, q, unitarization_failed);

  // Pure hack here
  b[0] = deriv_coeffs.getB00();
  b[1] = deriv_coeffs.getB01();
  b[2] = deriv_coeffs.getB02();
  b[3] = deriv_coeffs.getB11();
  b[4] = deriv_coeffs.getB12();
  b[5] = deriv_coeffs.getB22();


  Matrix<Cmplx,3> & local_result = *result;

  local_result = rsqrt_q*outer_prod;

  // We are now finished with rsqrt_q
  Matrix<Cmplx,3> qv_dagger  = q*v_dagger;
  Matrix<Cmplx,3> vv_dagger  = v*v_dagger;
  Matrix<Cmplx,3> vqv_dagger = v*qv_dagger;
  Matrix<Cmplx,3> temp = f[1]*vv_dagger + f[2]*vqv_dagger;


  temp = f[1]*v_dagger + f[2]*qv_dagger;
  Matrix<Cmplx,3> conj_outer_prod = conj(outer_prod);


  temp = f[1]*v + f[2]*v*q;
  local_result = local_result + outer_prod*temp*v_dagger + f[2]*q*outer_prod*vv_dagger;

  local_result = local_result + v_dagger*conj_outer_prod*conj(temp) + f[2]*qv_dagger*conj_outer_prod*v_dagger;


  // now done with vv_dagger, I think
  Matrix<Cmplx,3> qsqv_dagger = q*qv_dagger;
  Matrix<Cmplx,3> pv_dagger   = b[0]*v_dagger + b[1]*qv_dagger + b[2]*qsqv_dagger;
  accumBothDerivatives(&local_result, v, pv_dagger, outer_prod);

  Matrix<Cmplx,3> rv_dagger = b[1]*v_dagger + b[3]*qv_dagger + b[4]*qsqv_dagger;
  Matrix<Cmplx,3> vq = v*q;
  accumBothDerivatives(&local_result, vq, rv_dagger, outer_prod);

  Matrix<Cmplx,3> sv_dagger = b[2]*v_dagger + b[4]*qv_dagger + b[5]*qsqv_dagger;
  Matrix<Cmplx,3> vqsq = vq*q;
  accumBothDerivatives(&local_result, vqsq, sv_dagger, outer_prod);
  return;
} // get unit force term



template<class Cmplx>
__global__ void getUnitarizeForceField(Cmplx* link_even, Cmplx* link_odd,
                                       Cmplx* old_force_even, Cmplx* old_force_odd,
                                       Cmplx* force_even, Cmplx* force_odd,
                                       int* unitarization_failed)
{
  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  Cmplx* force;
  Cmplx* link;
  Cmplx* old_force;

  force = force_even;
  link = link_even;
  old_force = old_force_even;
  if(mem_idx >= Vh){
    mem_idx = mem_idx - Vh;
    force = force_odd;
    link = link_odd;
    old_force = old_force_odd;
  }


  // This part of the calculation is always done in double precision
  Matrix<double2,3> v, result, oprod;

  for(int dir=0; dir<4; ++dir){
    loadLinkVariableFromArray(old_force, dir, mem_idx, Vh, &oprod);
    loadLinkVariableFromArray(link, dir, mem_idx, Vh, &v);

    getUnitarizeForceSite<double2>(v, oprod, &result, unitarization_failed);

    writeLinkVariableToArray(result, dir, mem_idx, Vh, force);
  }
  return;
} // getUnitarizeForceField


void unitarizeForceCPU(const QudaGaugeParam& param, cpuGaugeField& cpuOldForce, cpuGaugeField& cpuGauge, cpuGaugeField* cpuNewForce)
{
  int num_failures = 0;
  Matrix<double2,3> old_force, new_force, v;
  for(int i=0; i<cpuGauge.Volume(); ++i){
    for(int dir=0; dir<4; ++dir){
      if(param.cpu_prec == QUDA_SINGLE_PRECISION){
        copyArrayToLink(&old_force, ((float*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18));
        copyArrayToLink(&v, ((float*)(cpuGauge.Gauge_p()) + (i*4 + dir)*18));
        getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
        copyLinkToArray(((float*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force);
      }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
        copyArrayToLink(&old_force, ((double*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18));
        copyArrayToLink(&v, ((double*)(cpuGauge.Gauge_p()) + (i*4 + dir)*18));
        getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
        copyLinkToArray(((double*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force);
      } // precision?
    } // dir
  } // i
  return;
} // unitarize_force_cpu


void unitarizeForceCuda(const QudaGaugeParam& param, cudaGaugeField& cudaOldForce, cudaGaugeField& cudaGauge, cudaGaugeField* cudaNewForce, int* unitarization_failed)
{
  dim3 gridDim(cudaGauge.Volume()/BLOCK_DIM,1,1);
  dim3 blockDim(BLOCK_DIM,1,1);

  if(param.cuda_prec == QUDA_SINGLE_PRECISION){
    getUnitarizeForceField<<<gridDim,blockDim>>>((float2*)cudaGauge.Even_p(), (float2*)cudaGauge.Odd_p(),
                                                 (float2*)cudaOldForce.Even_p(), (float2*)cudaOldForce.Odd_p(),
                                                 (float2*)cudaNewForce->Even_p(), (float2*)cudaNewForce->Odd_p(),
                                                 unitarization_failed);
  }else if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
    getUnitarizeForceField<<<gridDim,blockDim>>>((double2*)cudaGauge.Even_p(), (double2*)cudaGauge.Odd_p(),
                                                 (double2*)cudaOldForce.Even_p(), (double2*)cudaOldForce.Odd_p(),
                                                 (double2*)cudaNewForce->Even_p(), (double2*)cudaNewForce->Odd_p(),
                                                 unitarization_failed);
  } // precision?
  return;
} // unitarize_force_cuda


} // namespace fermion_force
} // namespace hisq
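
For orientation, the sketch below shows one plausible way the entry points above could be driven from host code. The wrapper name exampleUnitarizeForce, the choice of tolerances, and the handling of the device-side failure counter are illustrative assumptions, not part of QUDA; only setUnitarizeForceConstants, unitarizeForceCuda and standard CUDA runtime calls come from the listing or the CUDA API.

// Hedged usage sketch (not part of the file above); assumes the QUDA gauge-field
// objects have already been constructed elsewhere.
void exampleUnitarizeForce(const QudaGaugeParam &param,
                           cudaGaugeField &oldForce,   // un-unitarized force accumulated so far
                           cudaGaugeField &gauge,      // fattened links V
                           cudaGaugeField *newForce)   // output force
{
  using namespace hisq::fermion_force;

  // Tolerances are placeholders; see the illustrative call after
  // setUnitarizeForceConstants() above.
  setUnitarizeForceConstants(1e-14, 5e-5, 1e-10, true, false, 1e-6, 1e-6);

  // The kernel bumps this counter (via atomicAdd) whenever the SVD determinant
  // check fails, so it must live in device memory.
  int *num_failures_dev;
  cudaMalloc((void**)&num_failures_dev, sizeof(int));
  cudaMemset(num_failures_dev, 0, sizeof(int));

  unitarizeForceCuda(param, oldForce, gauge, newForce, num_failures_dev);

  int num_failures = 0;
  cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
  cudaFree(num_failures_dev);

  if(num_failures > 0){
    printf("unitarizeForceCuda: %d site(s) failed the unitarization check\n", num_failures);
  }
}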