QUDA: quda/lib/unitarize_force

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <cstdlib>
00002 #include <cstdio>
00003 #include <iostream>
00004 #include <iomanip>
00005 #include <cuda.h>
00006 #include <gauge_field.h>
00007 
00008 #include "quda_matrix.h"
00009 #include "svd_quda.h"
00010 
00011 
00012 
00013 #define HISQ_UNITARIZE_PI 3.14159265358979323846
00014 #define HISQ_UNITARIZE_PI23 HISQ_UNITARIZE_PI*2.0/3.0
00015 
00016 // constants - File scope only
00017 __constant__ double DEV_HISQ_UNITARIZE_EPS;
00018 __constant__ double DEV_HISQ_FORCE_FILTER;
00019 __constant__ double DEV_MAX_DET_ERROR;
00020 __constant__ bool DEV_REUNIT_ALLOW_SVD;
00021 __constant__ bool DEV_REUNIT_SVD_ONLY;
00022 __constant__ double DEV_REUNIT_SVD_REL_ERROR;
00023 __constant__ double DEV_REUNIT_SVD_ABS_ERROR;
00024 
00025 static double HOST_HISQ_UNITARIZE_EPS;
00026 static double HOST_HISQ_FORCE_FILTER;
00027 static double HOST_MAX_DET_ERROR;
00028 static bool   HOST_REUNIT_ALLOW_SVD;
00029 static bool   HOST_REUNIT_SVD_ONLY;
00030 static double HOST_REUNIT_SVD_REL_ERROR;
00031 static double HOST_REUNIT_SVD_ABS_ERROR;
00032 
00033  
00034 namespace hisq{
00035   namespace fermion_force{
00036 
00037 
00038 
00039       void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, 
00040                                       bool allow_svd, bool svd_only,
00041                                       double svd_rel_error, double svd_abs_error)
00042       {
00043 
00044         // not_set is only initialised once
00045         static bool not_set=true;
00046                 
00047         if(not_set){
00048           cudaMemcpyToSymbol("DEV_HISQ_UNITARIZE_EPS", &unitarize_eps, sizeof(double));
00049           cudaMemcpyToSymbol("DEV_HISQ_FORCE_FILTER", &hisq_force_filter, sizeof(double));
00050           cudaMemcpyToSymbol("DEV_MAX_DET_ERROR", &max_det_error, sizeof(double));
00051           cudaMemcpyToSymbol("DEV_REUNIT_ALLOW_SVD", &allow_svd, sizeof(bool));
00052           cudaMemcpyToSymbol("DEV_REUNIT_SVD_ONLY", &svd_only, sizeof(bool));
00053           cudaMemcpyToSymbol("DEV_REUNIT_SVD_REL_ERROR", &svd_rel_error, sizeof(double));
00054           cudaMemcpyToSymbol("DEV_REUNIT_SVD_ABS_ERROR", &svd_abs_error, sizeof(double));
00055         
00056           HOST_HISQ_UNITARIZE_EPS = unitarize_eps;
00057           HOST_HISQ_FORCE_FILTER = hisq_force_filter;
00058           HOST_MAX_DET_ERROR = max_det_error;     
00059           HOST_REUNIT_ALLOW_SVD = allow_svd;
00060           HOST_REUNIT_SVD_ONLY = svd_only;
00061           HOST_REUNIT_SVD_REL_ERROR = svd_rel_error;
00062           HOST_REUNIT_SVD_ABS_ERROR = svd_abs_error;
00063 
00064           not_set = false;
00065         }
00066         checkCudaError();
00067         return;
00068       }
00069 
00070 
00071     template<class Real>
00072      class DerivativeCoefficients{
00073        private:
00074          Real b[6]; 
00075          __device__ __host__       
00076          Real computeC00(const Real &, const Real &, const Real &);
00077          __device__ __host__
00078          Real computeC01(const Real &, const Real &, const Real &);
00079          __device__ __host__
00080          Real computeC02(const Real &, const Real &, const Real &);
00081          __device__ __host__
00082          Real computeC11(const Real &, const Real &, const Real &);
00083          __device__ __host__
00084          Real computeC12(const Real &, const Real &, const Real &);
00085          __device__ __host__
00086          Real computeC22(const Real &, const Real &, const Real &);
00087 
00088        public:
00089          __device__ __host__ void set(const Real & u, const Real & v, const Real & w);
00090          __device__ __host__
00091          Real getB00() const { return b[0]; }
00092          __device__ __host__
00093          Real getB01() const { return b[1]; }
00094          __device__ __host__
00095          Real getB02() const { return b[2]; }
00096          __device__ __host__
00097          Real getB11() const { return b[3]; }
00098          __device__ __host__
00099          Real getB12() const { return b[4]; }
00100          __device__ __host__
00101          Real getB22() const { return b[5]; }
00102      };
00103 
00104     template<class Real>
00105       __device__ __host__
00106       Real DerivativeCoefficients<Real>::computeC00(const Real & u, const Real & v, const Real & w){
00107         Real result =   -pow(w,3)*pow(u,6)
00108           + 3*v*pow(w,3)*pow(u,4)
00109           + 3*pow(v,4)*w*pow(u,4)
00110           -   pow(v,6)*pow(u,3)
00111           - 4*pow(w,4)*pow(u,3)
00112           - 12*pow(v,3)*pow(w,2)*pow(u,3)
00113           + 16*pow(v,2)*pow(w,3)*pow(u,2)
00114           + 3*pow(v,5)*w*pow(u,2)
00115           - 8*v*pow(w,4)*u
00116           - 3*pow(v,4)*pow(w,2)*u
00117           + pow(w,5)
00118           + pow(v,3)*pow(w,3);
00119 
00120         return result;
00121       }
00122 
00123     template<class Real>
00124       __device__ __host__
00125       Real DerivativeCoefficients<Real>::computeC01(const Real & u, const Real & v, const Real & w){
00126         Real result =  - pow(w,2)*pow(u,7)
00127           - pow(v,2)*w*pow(u,6)
00128           + pow(v,4)*pow(u,5)   // This was corrected!
00129           + 6*v*pow(w,2)*pow(u,5)
00130           - 5*pow(w,3)*pow(u,4)    // This was corrected!
00131           - pow(v,3)*w*pow(u,4)
00132           - 2*pow(v,5)*pow(u,3)
00133           - 6*pow(v,2)*pow(w,2)*pow(u,3)
00134           + 10*v*pow(w,3)*pow(u,2)
00135           + 6*pow(v,4)*w*pow(u,2)
00136           - 3*pow(w,4)*u
00137           - 6*pow(v,3)*pow(w,2)*u
00138           + 2*pow(v,2)*pow(w,3);
00139         return result;
00140       }
00141 
00142     template<class Real>
00143       __device__ __host__
00144       Real DerivativeCoefficients<Real>::computeC02(const Real & u, const Real & v, const Real & w){
00145         Real result =   pow(w,2)*pow(u,5)
00146           + pow(v,2)*w*pow(u,4)
00147           - pow(v,4)*pow(u,3)
00148           - 4*v*pow(w,2)*pow(u,3)
00149           + 4*pow(w,3)*pow(u,2)
00150           + 3*pow(v,3)*w*pow(u,2)
00151           - 3*pow(v,2)*pow(w,2)*u
00152           + v*pow(w,3);
00153         return result;
00154       }
00155 
00156     template<class Real>
00157       __device__ __host__
00158       Real DerivativeCoefficients<Real>::computeC11(const Real & u, const Real & v, const Real & w){
00159         Real result = - w*pow(u,8)
00160           - pow(v,2)*pow(u,7)
00161           + 7*v*w*pow(u,6)
00162           + 4*pow(v,3)*pow(u,5)
00163           - 5*pow(w,2)*pow(u,5)
00164           - 16*pow(v,2)*w*pow(u,4)
00165           - 4*pow(v,4)*pow(u,3)
00166           + 16*v*pow(w,2)*pow(u,3)
00167           - 3*pow(w,3)*pow(u,2)
00168           + 12*pow(v,3)*w*pow(u,2)
00169           - 12*pow(v,2)*pow(w,2)*u
00170           + 3*v*pow(w,3);
00171         return result;
00172       }
00173 
00174     template<class Real>
00175       __device__ __host__
00176       Real DerivativeCoefficients<Real>::computeC12(const Real & u, const Real & v, const Real & w){
00177         Real result =  w*pow(u,6)
00178           + pow(v,2)*pow(u,5) // Fixed this!
00179           - 5*v*w*pow(u,4)  // Fixed this!
00180           - 2*pow(v,3)*pow(u,3)
00181           + 4*pow(w,2)*pow(u,3)
00182           + 6*pow(v,2)*w*pow(u,2)
00183           - 6*v*pow(w,2)*u
00184           + pow(w,3);
00185         return result;
00186       }
00187 
00188     template<class Real>
00189       __device__ __host__
00190       Real DerivativeCoefficients<Real>::computeC22(const Real & u, const Real & v, const Real & w){
00191         Real result = - w*pow(u,4)
00192           - pow(v,2)*pow(u,3)
00193           + 3*v*w*pow(u,2)
00194           - 3*pow(w,2)*u;
00195         return result;
00196       }
00197 
00198     template <class Real>
00199       __device__ __host__
00200       void  DerivativeCoefficients<Real>::set(const Real & u, const Real & v, const Real & w){
00201         const Real & denominator = 2.0*pow(w*(u*v-w),3); 
00202         b[0] = computeC00(u,v,w)/denominator;
00203         b[1] = computeC01(u,v,w)/denominator;
00204         b[2] = computeC02(u,v,w)/denominator;
00205         b[3] = computeC11(u,v,w)/denominator;
00206         b[4] = computeC12(u,v,w)/denominator;
00207         b[5] = computeC22(u,v,w)/denominator;
00208         return;
00209       }
00210 
00211 
00212     template<class Cmplx>
00213       __device__ __host__
00214       void accumBothDerivatives(Matrix<Cmplx,3>* result, const Matrix<Cmplx,3> & left, const Matrix<Cmplx,3> & right, const Matrix<Cmplx,3> & outer_prod)
00215       {
00216         const typename RealTypeId<Cmplx>::Type temp = 2.0*getTrace(left*outer_prod).x;
00217         for(int k=0; k<3; ++k){
00218           for(int l=0; l<3; ++l){
00219             // Need to write it this way to get it to work 
00220             // on the CPU. Not sure why.
00221             result->operator()(k,l).x += temp*right(k,l).x;
00222             result->operator()(k,l).y += temp*right(k,l).y;
00223           }
00224         }
00225         return;
00226       }
00227 
00228 
00229     template<class Cmplx>
00230       __device__ __host__
00231       void accumDerivatives(Matrix<Cmplx,3>* result, const Matrix<Cmplx,3> & left, const Matrix<Cmplx,3> & right, const Matrix<Cmplx,3> & outer_prod)
00232       {
00233         Cmplx temp = getTrace(left*outer_prod);
00234         for(int k=0; k<3; ++k){
00235           for(int l=0; l<3; ++l){
00236             result->operator()(k,l) = temp*right(k,l);
00237           }
00238         }
00239         return;
00240       }
00241 
00242 
00243     template<class T>
00244       __device__ __host__
00245     T getAbsMin(const T* const array, int size){
00246       T min = fabs(array[0]);
00247       for(int i=1; i<size; ++i){
00248         T abs_val = fabs(array[i]);
00249         if((abs_val) < min){ min = abs_val; }   
00250       }
00251       return min;
00252     }
00253 
00254 
00255           template<class Real>
00256     __device__ __host__
00257     inline bool checkAbsoluteError(Real a, Real b, Real epsilon)
00258           {
00259                         if( fabs(a-b) <  epsilon) return true;
00260                         return false;
00261                 }
00262 
00263 
00264     template<class Real>
00265     __device__ __host__ 
00266     inline bool checkRelativeError(Real a, Real b, Real epsilon)
00267     {
00268       if( fabs((a-b)/b)  < epsilon ) return true;
00269                         return false;
00270     }
00271     
00272 
00273 
00274 
00275     // Compute the reciprocal square root of the matrix q
00276     // Also modify q if the eigenvalues are dangerously small.
00277     template<class Cmplx> 
00278       __device__  __host__ 
00279       void reciprocalRoot(Matrix<Cmplx,3>* res, DerivativeCoefficients<typename RealTypeId<Cmplx>::Type>* deriv_coeffs, 
00280                                                                 typename RealTypeId<Cmplx>::Type f[3], Matrix<Cmplx,3> & q, int *unitarization_failed){
00281 
00282         Matrix<Cmplx,3> qsq, tempq;
00283 
00284         typename RealTypeId<Cmplx>::Type c[3];
00285         typename RealTypeId<Cmplx>::Type g[3];
00286 
00287 #ifdef __CUDA_ARCH__
00288 #define REUNIT_SVD_ONLY DEV_REUNIT_SVD_ONLY
00289 #else
00290 #define REUNIT_SVD_ONLY HOST_REUNIT_SVD_ONLY
00291 #endif
00292         if(!REUNIT_SVD_ONLY){
00293                 qsq = q*q;
00294                 tempq = qsq*q;
00295 
00296                 c[0] = getTrace(q).x;
00297                 c[1] = getTrace(qsq).x/2.0;
00298                 c[2] = getTrace(tempq).x/3.0;
00299 
00300                 g[0] = g[1] = g[2] = c[0]/3.;
00301                 typename RealTypeId<Cmplx>::Type r,s,theta;
00302                 s = c[1]/3. - c[0]*c[0]/18;
00303                 r = c[2]/2. - (c[0]/3.)*(c[1] - c[0]*c[0]/9.);
00304 
00305 #ifdef __CUDA_ARCH__
00306 #define HISQ_UNITARIZE_EPS DEV_HISQ_UNITARIZE_EPS
00307 #else
00308 #define HISQ_UNITARIZE_EPS HOST_HISQ_UNITARIZE_EPS
00309 #endif
00310 
00311                 typename RealTypeId<Cmplx>::Type cosTheta = r/sqrt(s*s*s);
00312                 if(fabs(s) < HISQ_UNITARIZE_EPS){
00313                         cosTheta = 1.;
00314                         s = 0.0; 
00315                 }
00316                 if(fabs(cosTheta)>1.0){ r>0 ? theta=0.0 : theta=HISQ_UNITARIZE_PI/3.0; }
00317                 else{ theta = acos(cosTheta)/3.0; }
00318 
00319                 s = 2.0*sqrt(s);
00320                 for(int i=0; i<3; ++i){
00321                         g[i] += s*cos(theta + (i-1)*HISQ_UNITARIZE_PI23);
00322                 }
00323 
00324         } // !REUNIT_SVD_ONLY?
00325 
00326         //
00327         // Compare the product of the eigenvalues computed thus far to the 
00328         // absolute value of the determinant. 
00329         // If the determinant is very small or the relative error is greater than some predefined value 
00330         // then recompute the eigenvalues using a singular-value decomposition.
00331         // Note that this particular calculation contains multiple branches, 
00332         // so it doesn't appear to be particularly well-suited to the GPU 
00333         // programming model. However, the analytic calculation of the 
00334         // unitarization is extremely fast, and if the SVD routine is not called 
00335         // too often, we expect pretty good performance.
00336         //
00337 
00338 #ifdef __CUDA_ARCH__
00339 #define REUNIT_ALLOW_SVD DEV_REUNIT_ALLOW_SVD
00340 #define REUNIT_SVD_REL_ERROR DEV_REUNIT_SVD_REL_ERROR
00341 #define REUNIT_SVD_ABS_ERROR DEV_REUNIT_SVD_ABS_ERROR
00342 #else // cpu
00343 #define REUNIT_ALLOW_SVD HOST_REUNIT_ALLOW_SVD
00344 #define REUNIT_SVD_REL_ERROR HOST_REUNIT_SVD_REL_ERROR
00345 #define REUNIT_SVD_ABS_ERROR HOST_REUNIT_SVD_ABS_ERROR
00346 #endif
00347 
00348         if(REUNIT_ALLOW_SVD){
00349                 bool perform_svd = true;
00350                 if(!REUNIT_SVD_ONLY){
00351                   const typename RealTypeId<Cmplx>::Type det = getDeterminant(q).x;
00352                   if( fabs(det) >= REUNIT_SVD_ABS_ERROR){  
00353                     if( checkRelativeError(g[0]*g[1]*g[2],det,REUNIT_SVD_REL_ERROR) ) perform_svd = false;
00354                   }
00355                 }       
00356 
00357                 if(perform_svd){        
00358                         printf("Performing SVD in force\n");
00359                         Matrix<Cmplx,3> tmp2;
00360                         // compute the eigenvalues using the singular value decomposition
00361                         computeSVD<Cmplx>(q,tempq,tmp2,g);
00362                         // The array g contains the eigenvalues of the matrix q
00363                         // The determinant is the product of the eigenvalues, and I can use this
00364                         // to check the SVD
00365                         const typename RealTypeId<Cmplx>::Type determinant = getDeterminant(q).x;
00366                         const typename RealTypeId<Cmplx>::Type gprod = g[0]*g[1]*g[2];
00367                         // Check the svd result for errors
00368 #ifdef __CUDA_ARCH__
00369 #define MAX_DET_ERROR DEV_MAX_DET_ERROR
00370 #else
00371 #define MAX_DET_ERROR HOST_MAX_DET_ERROR
00372 #endif
00373                         if(fabs(gprod - determinant) > MAX_DET_ERROR){
00374 #if  (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__ >= 200))
00375                                 printf("Warning: Error in determinant computed by SVD : %g > %g\n", fabs(gprod-determinant), MAX_DET_ERROR);
00376                                 printLink(q);
00377 #endif
00378 
00379 #ifdef __CUDA_ARCH__
00380                                 atomicAdd(unitarization_failed,1);
00381 #else
00382                                 (*unitarization_failed)++;
00383 #endif
00384                         } 
00385                 } // perform_svd?
00386 
00387         } // REUNIT_ALLOW_SVD?
00388 
00389 #ifdef __CUDA_ARCH__
00390 #define HISQ_FORCE_FILTER DEV_HISQ_FORCE_FILTER
00391 #else
00392 #define HISQ_FORCE_FILTER HOST_HISQ_FORCE_FILTER
00393 #endif  
00394         typename RealTypeId<Cmplx>::Type delta = getAbsMin(g,3);
00395         if(delta < HISQ_FORCE_FILTER){
00396                 for(int i=0; i<3; ++i){ 
00397                         g[i]     += HISQ_FORCE_FILTER; 
00398                         q(i,i).x += HISQ_FORCE_FILTER;
00399                 }
00400                 qsq = q*q; // recalculate Q^2
00401         }
00402 
00403 
00404         // At this point we have finished with the c's 
00405         // use these to store sqrt(g)
00406         for(int i=0; i<3; ++i) c[i] = sqrt(g[i]);
00407 
00408         // done with the g's, use these to store u, v, w
00409         g[0] = c[0]+c[1]+c[2];
00410         g[1] = c[0]*c[1] + c[0]*c[2] + c[1]*c[2];
00411         g[2] = c[0]*c[1]*c[2];
00412         
00413         // set the derivative coefficients!
00414         deriv_coeffs->set(g[0], g[1], g[2]);
00415 
00416         const typename RealTypeId<Cmplx>::Type & denominator  = g[2]*(g[0]*g[1]-g[2]); 
00417         c[0] = (g[0]*g[1]*g[1] - g[2]*(g[0]*g[0]+g[1]))/denominator;
00418         c[1] = (-g[0]*g[0]*g[0] - g[2] + 2.*g[0]*g[1])/denominator;
00419         c[2] =  g[0]/denominator;
00420 
00421         tempq = c[1]*q + c[2]*qsq;
00422         // Add a real scalar
00423         tempq(0,0).x += c[0];
00424         tempq(1,1).x += c[0];
00425         tempq(2,2).x += c[0];
00426 
00427         f[0] = c[0];
00428         f[1] = c[1];
00429         f[2] = c[2];
00430 
00431         *res = tempq;
00432         return;
00433       }
00434 
00435 
00436 
00437       // "v" denotes a "fattened" link variable
00438       template<class Cmplx>
00439       __device__ __host__
00440         void getUnitarizeForceSite(const Matrix<Cmplx,3> & v, const Matrix<Cmplx,3> & outer_prod, Matrix<Cmplx,3>* result, int *unitarization_failed)
00441         {
00442           typename RealTypeId<Cmplx>::Type f[3]; 
00443           typename RealTypeId<Cmplx>::Type b[6];
00444 
00445           Matrix<Cmplx,3> v_dagger = conj(v);  // okay!
00446           Matrix<Cmplx,3> q   = v_dagger*v;    // okay!
00447 
00448           Matrix<Cmplx,3> rsqrt_q;
00449 
00450           DerivativeCoefficients<typename RealTypeId<Cmplx>::Type> deriv_coeffs;
00451 
00452           reciprocalRoot<Cmplx>(&rsqrt_q, &deriv_coeffs, f, q, unitarization_failed);
00453 
00454           // Pure hack here
00455           b[0] = deriv_coeffs.getB00();
00456           b[1] = deriv_coeffs.getB01();
00457           b[2] = deriv_coeffs.getB02();
00458           b[3] = deriv_coeffs.getB11();
00459           b[4] = deriv_coeffs.getB12();
00460           b[5] = deriv_coeffs.getB22();
00461 
00462 
00463           Matrix<Cmplx,3> & local_result = *result;
00464 
00465           local_result = rsqrt_q*outer_prod;
00466 
00467           // We are now finished with rsqrt_q
00468           Matrix<Cmplx,3> qv_dagger  = q*v_dagger;
00469           Matrix<Cmplx,3> vv_dagger  = v*v_dagger; 
00470           Matrix<Cmplx,3> vqv_dagger = v*qv_dagger;
00471           Matrix<Cmplx,3> temp = f[1]*vv_dagger + f[2]*vqv_dagger;
00472 
00473 
00474           temp = f[1]*v_dagger + f[2]*qv_dagger;
00475           Matrix<Cmplx,3> conj_outer_prod = conj(outer_prod);
00476 
00477 
00478           temp = f[1]*v + f[2]*v*q;
00479           local_result = local_result + outer_prod*temp*v_dagger + f[2]*q*outer_prod*vv_dagger;
00480 
00481           local_result = local_result + v_dagger*conj_outer_prod*conj(temp) + f[2]*qv_dagger*conj_outer_prod*v_dagger;
00482 
00483 
00484           // now done with vv_dagger, I think
00485           Matrix<Cmplx,3> qsqv_dagger = q*qv_dagger;
00486           Matrix<Cmplx,3> pv_dagger   = b[0]*v_dagger + b[1]*qv_dagger + b[2]*qsqv_dagger;
00487           accumBothDerivatives(&local_result, v, pv_dagger, outer_prod);
00488 
00489           Matrix<Cmplx,3> rv_dagger = b[1]*v_dagger + b[3]*qv_dagger + b[4]*qsqv_dagger;
00490           Matrix<Cmplx,3> vq = v*q;
00491           accumBothDerivatives(&local_result, vq, rv_dagger, outer_prod);
00492 
00493           Matrix<Cmplx,3> sv_dagger = b[2]*v_dagger + b[4]*qv_dagger + b[5]*qsqv_dagger;
00494           Matrix<Cmplx,3> vqsq = vq*q;
00495           accumBothDerivatives(&local_result, vqsq, sv_dagger, outer_prod);
00496           return;
00497         } // get unit force term
00498 
00499 
00500 
00501         template<class Cmplx>
00502           __global__ void getUnitarizeForceField(Cmplx* link_even, Cmplx* link_odd,
00503                                                  Cmplx* old_force_even, Cmplx* old_force_odd,
00504                                                                                                                                                                                                  Cmplx* force_even, Cmplx* force_odd,
00505                                                                                                                                                                                                  int* unitarization_failed)
00506           {
00507             int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00508         
00509             Cmplx* force;
00510             Cmplx* link;
00511             Cmplx* old_force;
00512 
00513             force = force_even;
00514             link = link_even;
00515             old_force = old_force_even;
00516             if(mem_idx >= Vh){
00517               mem_idx = mem_idx - Vh;
00518               force = force_odd;
00519               link = link_odd;
00520               old_force = old_force_odd;
00521             }
00522 
00523 
00524             // This part of the calculation is always done in double precision
00525             Matrix<double2,3> v, result, oprod;
00526            
00527             for(int dir=0; dir<4; ++dir){
00528               loadLinkVariableFromArray(old_force, dir, mem_idx, Vh, &oprod);
00529               loadLinkVariableFromArray(link, dir, mem_idx, Vh, &v);
00530 
00531               getUnitarizeForceSite<double2>(v, oprod, &result, unitarization_failed); 
00532 
00533               writeLinkVariableToArray(result, dir, mem_idx, Vh, force); 
00534             }
00535             return;
00536           } // getUnitarizeForceField
00537 
00538 
00539         void unitarizeForceCPU(const QudaGaugeParam& param, cpuGaugeField& cpuOldForce, cpuGaugeField& cpuGauge, cpuGaugeField* cpuNewForce)
00540         {
00541 
00542                 int num_failures = 0;   
00543                 Matrix<double2,3> old_force, new_force, v;
00544                 for(int i=0; i<cpuGauge.Volume(); ++i){
00545                         for(int dir=0; dir<4; ++dir){
00546                                 if(param.cpu_prec == QUDA_SINGLE_PRECISION){
00547                                         copyArrayToLink(&old_force, ((float*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18)); 
00548                                         copyArrayToLink(&v, ((float*)(cpuGauge.Gauge_p()) + (i*4 + dir)*18)); 
00549                                         getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
00550                                         copyLinkToArray(((float*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force); 
00551                                 }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){
00552                                         copyArrayToLink(&old_force, ((double*)(cpuOldForce.Gauge_p()) + (i*4 + dir)*18)); 
00553                                         copyArrayToLink(&v, ((double*)(cpuGauge.Gauge_p()) + (i*4 + dir)*18)); 
00554                                         getUnitarizeForceSite<double2>(v, old_force, &new_force, &num_failures);
00555                                         copyLinkToArray(((double*)(cpuNewForce->Gauge_p()) + (i*4 + dir)*18), new_force); 
00556                                 } // precision?
00557                         } // dir
00558                 } // i
00559                 return;
00560         } // unitarize_force_cpu
00561 
00562 
00563         void unitarizeForceCuda(const QudaGaugeParam& param, cudaGaugeField& cudaOldForce, cudaGaugeField& cudaGauge,  cudaGaugeField* cudaNewForce, int* unitarization_failed)
00564         {
00565 
00566           dim3 gridDim(cudaGauge.Volume()/BLOCK_DIM,1,1);
00567           dim3 blockDim(BLOCK_DIM,1,1);
00568 
00569           if(param.cuda_prec == QUDA_SINGLE_PRECISION){
00570                   getUnitarizeForceField<<<gridDim,blockDim>>>((float2*)cudaGauge.Even_p(), (float2*)cudaGauge.Odd_p(),
00571                                   (float2*)cudaOldForce.Even_p(), (float2*)cudaOldForce.Odd_p(),
00572                                   (float2*)cudaNewForce->Even_p(), (float2*)cudaNewForce->Odd_p(),
00573                                   unitarization_failed);
00574           }else if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
00575                   getUnitarizeForceField<<<gridDim,blockDim>>>((double2*)cudaGauge.Even_p(), (double2*)cudaGauge.Odd_p(),
00576                                   (double2*)cudaOldForce.Even_p(), (double2*)cudaOldForce.Odd_p(),
00577                                   (double2*)cudaNewForce->Even_p(), (double2*)cudaNewForce->Odd_p(),
00578                                   unitarization_failed);
00579           } // precision?
00580           return;
00581         } // unitarize_force_cuda
00582 
00583 
00584   } // namespace fermion_force
00585 } // namespace hisq
00586 
00587