QUDA  0.9.0
pgauge_heatbath.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <pgauge_monte.h>
9 #include <gauge_tools.h>
10 #include <random_quda.h>
11 #include <index_helper.cuh>
12 #include <atomic.cuh>
13 #include <cub/cub.cuh>
14 
15 
16 
17 #ifndef PI
18 #define PI 3.1415926535897932384626433832795 // pi
19 #endif
20 #ifndef PII
21 #define PII 6.2831853071795864769252867665590 // 2 * pi
22 #endif
23 
24 namespace quda {
25 
26 #ifdef GPU_GAUGE_ALG
27 
28 
34  template<int NCOLORS>
35  __host__ __device__ static inline int2 IndexBlock(int block){
36  int2 id;
37  int i1;
38  int found = 0;
39  int del_i = 0;
40  int index = -1;
41  while ( del_i < (NCOLORS - 1) && found == 0 ) {
42  del_i++;
43  for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
44  index++;
45  if ( index == block ) {
46  found = 1;
47  break;
48  }
49  }
50  }
51  id.y = i1 + del_i;
52  id.x = i1;
53  return id;
54  }
61  template<int NCOLORS>
62  __host__ __device__ static inline void IndexBlock(int block, int &p, int &q){
63  if ( NCOLORS == 3 ) {
64  if ( block == 0 ) { p = 0; q = 1; }
65  else if ( block == 1 ) { p = 1; q = 2; }
66  else{ p = 0; q = 2; }
67  }
68  else if ( NCOLORS > 3 ) {
69  int i1;
70  int found = 0;
71  int del_i = 0;
72  int index = -1;
73  while ( del_i < (NCOLORS - 1) && found == 0 ) {
74  del_i++;
75  for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
76  index++;
77  if ( index == block ) {
78  found = 1;
79  break;
80  }
81  }
82  }
83  q = i1 + del_i;
84  p = i1;
85  }
86  }
87 
  /**
     @brief Generate an SU(2) matrix for the heatbath update, following the
     MILC implementation: a Kennedy-Pendleton style accept/reject for large
     al (al > 2) and a Creutz style accept/reject otherwise, each capped at
     20 attempts.
     @param al heatbath parameter (here BetaOverNc times the staple norm)
     @param localState CURAND state for this thread, advanced in place
     @return SU(2) matrix in the real 4-parameter representation
     (a(0,0), a(0,1), a(1,0), a(1,1)); presumably (a0, a1, a2, a3) of
     a0 + i*(a1 sigma_x + a2 sigma_y + a3 sigma_z) — TODO confirm against
     block_su2_to_sun's embedding.
   */
  template <class T>
  __device__ static inline Matrix<T,2> generate_su2_matrix_milc(T al, cuRNGState& localState){
    T xr1, xr2, xr3, xr4, d, r;
    int k;
    // first trial: draw four uniforms; small increment prevents log(0.)
    xr1 = Random<T>(localState);
    xr1 = (log((xr1 + 1.e-10)));
    xr2 = Random<T>(localState);
    xr2 = (log((xr2 + 1.e-10)));
    xr3 = Random<T>(localState);
    xr4 = Random<T>(localState);
    xr3 = cos(PII * xr3);
    d = -(xr2 + xr1 * xr3 * xr3 ) / al;
    //now beat each site into submission
    int nacd = 0;
    if ((1.00 - 0.5 * d) > xr4 * xr4 ) nacd = 1;  // accepted on first try
    if ( nacd == 0 && al > 2.0 ) { //k-p algorithm
      for ( k = 0; k < 20; k++ ) {
        //get four random numbers (add a small increment to prevent taking log(0.)
        xr1 = Random<T>(localState);
        xr1 = (log((xr1 + 1.e-10)));
        xr2 = Random<T>(localState);
        xr2 = (log((xr2 + 1.e-10)));
        xr3 = Random<T>(localState);
        xr4 = Random<T>(localState);
        xr3 = cos(PII * xr3);
        d = -(xr2 + xr1 * xr3 * xr3) / al;
        if ((1.00 - 0.5 * d) > xr4 * xr4 ) break;
      }
    } //endif nacd
    Matrix<T,2> a;
    if ( nacd == 0 && al <= 2.0 ) { //creutz algorithm
      xr3 = exp(-2.0 * al);
      xr4 = 1.0 - xr3;
      for ( k = 0; k < 20; k++ ) {
        //get two random numbers
        xr1 = Random<T>(localState);
        xr2 = Random<T>(localState);
        r = xr3 + xr4 * xr1;
        a(0,0) = 1.00 + log(r) / al;
        if ((1.0 - a(0,0) * a(0,0)) > xr2 * xr2 ) break;
      }
      d = 1.0 - a(0,0);
    } //endif nacd
    //generate the four su(2) elements
    //find a0 = 1 - d
    a(0,0) = 1.0 - d;
    //compute r, the radius of the remaining 3-sphere components
    xr3 = 1.0 - a(0,0) * a(0,0);
    xr3 = abs(xr3);  // guard against tiny negative values from rounding
    r = sqrt(xr3);
    //compute a3
    a(1,1) = (2.0 * Random<T>(localState) - 1.0) * r;
    //compute a1 and a2
    xr1 = xr3 - a(1,1) * a(1,1);
    xr1 = abs(xr1);
    xr1 = sqrt(xr1);
    //xr2 is a random number between 0 and 2*pi
    xr2 = PII * Random<T>(localState);
    a(0,1) = xr1 * cos(xr2);
    a(1,0) = xr1 * sin(xr2);
    return a;
  }
156 
157 
164  template < class T>
165  __host__ __device__ static inline Matrix<T,2> get_block_su2( Matrix<complex<T>,3> tmp1, int block ){
166  Matrix<T,2> r;
167  switch ( block ) {
168  case 0:
169  r(0,0) = tmp1(0,0).x + tmp1(1,1).x;
170  r(0,1) = tmp1(0,1).y + tmp1(1,0).y;
171  r(1,0) = tmp1(0,1).x - tmp1(1,0).x;
172  r(1,1) = tmp1(0,0).y - tmp1(1,1).y;
173  break;
174  case 1:
175  r(0,0) = tmp1(1,1).x + tmp1(2,2).x;
176  r(0,1) = tmp1(1,2).y + tmp1(2,1).y;
177  r(1,0) = tmp1(1,2).x - tmp1(2,1).x;
178  r(1,1) = tmp1(1,1).y - tmp1(2,2).y;
179  break;
180  case 2:
181  r(0,0) = tmp1(0,0).x + tmp1(2,2).x;
182  r(0,1) = tmp1(0,2).y + tmp1(2,0).y;
183  r(1,0) = tmp1(0,2).x - tmp1(2,0).x;
184  r(1,1) = tmp1(0,0).y - tmp1(2,2).y;
185  break;
186  }
187  return r;
188  }
189 
196  template <class T, int NCOLORS>
197  __host__ __device__ static inline Matrix<T,2> get_block_su2( Matrix<complex<T>,NCOLORS> tmp1, int2 id ){
198  Matrix<T,2> r;
199  r(0,0) = tmp1(id.x,id.x).x + tmp1(id.y,id.y).x;
200  r(0,1) = tmp1(id.x,id.y).y + tmp1(id.y,id.x).y;
201  r(1,0) = tmp1(id.x,id.y).x - tmp1(id.y,id.x).x;
202  r(1,1) = tmp1(id.x,id.x).y - tmp1(id.y,id.y).y;
203  return r;
204  }
205 
212  template <class T, int NCOLORS>
213  __host__ __device__ static inline Matrix<complex<T>,NCOLORS> block_su2_to_sun( Matrix<T,2> rr, int2 id ){
214  Matrix<complex<T>,NCOLORS> tmp1;
215  setIdentity(&tmp1);
216  tmp1(id.x,id.x) = complex<T>( rr(0,0), rr(1,1) );
217  tmp1(id.x,id.y) = complex<T>( rr(1,0), rr(0,1) );
218  tmp1(id.y,id.x) = complex<T>(-rr(1,0), rr(0,1) );
219  tmp1(id.y,id.y) = complex<T>( rr(0,0),-rr(1,1) );
220  return tmp1;
221  }
228  template <class T, int NCOLORS>
229  __host__ __device__ static inline void mul_block_sun( Matrix<T,2> u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
230  for ( int j = 0; j < NCOLORS; j++ ) {
231  complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
232  link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
233  link(id.x, j) = tmp;
234  }
235  }
236 
246  template <class Cmplx>
247  __host__ __device__ static inline void block_su2_to_su3( Matrix<Cmplx,3> &U, Cmplx a00, Cmplx a01, Cmplx a10, Cmplx a11, int block ){
248  Cmplx tmp;
249  switch ( block ) {
250  case 0:
251  tmp = a00 * U(0,0) + a01 * U(1,0);
252  U(1,0) = a10 * U(0,0) + a11 * U(1,0);
253  U(0,0) = tmp;
254  tmp = a00 * U(0,1) + a01 * U(1,1);
255  U(1,1) = a10 * U(0,1) + a11 * U(1,1);
256  U(0,1) = tmp;
257  tmp = a00 * U(0,2) + a01 * U(1,2);
258  U(1,2) = a10 * U(0,2) + a11 * U(1,2);
259  U(0,2) = tmp;
260  break;
261  case 1:
262  tmp = a00 * U(1,0) + a01 * U(2,0);
263  U(2,0) = a10 * U(1,0) + a11 * U(2,0);
264  U(1,0) = tmp;
265  tmp = a00 * U(1,1) + a01 * U(2,1);
266  U(2,1) = a10 * U(1,1) + a11 * U(2,1);
267  U(1,1) = tmp;
268  tmp = a00 * U(1,2) + a01 * U(2,2);
269  U(2,2) = a10 * U(1,2) + a11 * U(2,2);
270  U(1,2) = tmp;
271  break;
272  case 2:
273  tmp = a00 * U(0,0) + a01 * U(2,0);
274  U(2,0) = a10 * U(0,0) + a11 * U(2,0);
275  U(0,0) = tmp;
276  tmp = a00 * U(0,1) + a01 * U(2,1);
277  U(2,1) = a10 * U(0,1) + a11 * U(2,1);
278  U(0,1) = tmp;
279  tmp = a00 * U(0,2) + a01 * U(2,2);
280  U(2,2) = a10 * U(0,2) + a11 * U(2,2);
281  U(0,2) = tmp;
282  break;
283  }
284  }
285 
286 
287 
288 // v * u^dagger
289  template <class Float>
290  __host__ __device__ static inline Matrix<Float,2> mulsu2UVDagger(Matrix<Float,2> v, Matrix<Float,2> u){
292  b(0,0) = v(0,0) * u(0,0) + v(0,1) * u(0,1) + v(1,0) * u(1,0) + v(1,1) * u(1,1);
293  b(0,1) = v(0,1) * u(0,0) - v(0,0) * u(0,1) + v(1,0) * u(1,1) - v(1,1) * u(1,0);
294  b(1,0) = v(1,0) * u(0,0) - v(0,0) * u(1,0) + v(1,1) * u(0,1) - v(0,1) * u(1,1);
295  b(1,1) = v(1,1) * u(0,0) - v(0,0) * u(1,1) + v(0,1) * u(1,0) - v(1,0) * u(0,1);
296  return b;
297  }
298 
  /**
     @brief Pseudo-heatbath update of one link over its SU(2) subgroups
     (Cabibbo-Marinari style): for each subgroup, project U*F onto SU(2),
     draw a heatbath SU(2) element, and apply it to U from the left.
     @param[in,out] U link matrix, updated in place
     @param F staple for this link (passed by value)
     @param[in,out] localState CURAND state for this thread, advanced in place
     @param BetaOverNc beta / NCOLORS
   */
  template <class Float, int NCOLORS>
  __device__ inline void heatBathSUN( Matrix<complex<Float>,NCOLORS>& U, Matrix<complex<Float>,NCOLORS> F,
                                      cuRNGState& localState, Float BetaOverNc ){

    if ( NCOLORS == 3 ) {
      /* alternative kept for reference: recompute tmp1 = U * F per block
      for( int block = 0; block < NCOLORS; block++ ) {
      Matrix<complex<T>,3> tmp1 = U * F;
      Matrix<T,2> r = get_block_su2<T>(tmp1, block);
      T k = sqrt(r(0,0)*r(0,0)+r(0,1)*r(0,1)+r(1,0)*r(1,0)+r(1,1)*r(1,1));
      T ap = BetaOverNc * k;
      k = (T)1.0 / k;
      r *= k;
      //Matrix<T,2> a = generate_su2_matrix<T4, T>(ap, localState);
      Matrix<T,2> a = generate_su2_matrix_milc<T>(ap, localState);
      r = mulsu2UVDagger_4<T>( a, r);
      block_su2_to_su3<T>( U, complex( r(0,0), r(1,1) ), complex( r(1,0), r(0,1) ), complex(-r(1,0), r(0,1) ), complex( r(0,0),-r(1,1) ), block );
      //FLOP_min = (198 + 4 + 15 + 28 + 28 + 84) * 3 = 1071
      }*/

      for ( int block = 0; block < NCOLORS; block++ ) {
        int p,q;
        IndexBlock<NCOLORS>(block, p, q);
        complex<Float> a0((Float)0.0, (Float)0.0);
        complex<Float> a1 = a0;
        complex<Float> a2 = a0;
        complex<Float> a3 = a0;

        // only the (p,q) 2x2 sub-block of U*F is needed: accumulate it directly
        // instead of forming the full product
        for ( int j = 0; j < NCOLORS; j++ ) {
          a0 += U(p,j) * F(j,p);
          a1 += U(p,j) * F(j,q);
          a2 += U(q,j) * F(j,p);
          a3 += U(q,j) * F(j,q);
        }
        // project the sub-block onto SU(2) (real 4-parameter form)
        Matrix<Float,2> r;
        r(0,0) = a0.x + a3.x;
        r(0,1) = a1.y + a2.y;
        r(1,0) = a1.x - a2.x;
        r(1,1) = a0.y - a3.y;
        Float k = sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
        Float ap = BetaOverNc * k;
        k = 1.0 / k;
        r *= k;
        // draw the heatbath SU(2) element and rotate it by r^dagger
        Matrix<Float,2> a = generate_su2_matrix_milc<Float>(ap, localState);
        r = mulsu2UVDagger<Float>( a, r);
        // embed the resulting SU(2) element and left-multiply rows p and q of U
        a0 = complex<Float>( r(0,0), r(1,1) );
        a1 = complex<Float>( r(1,0), r(0,1) );
        a2 = complex<Float>(-r(1,0), r(0,1) );
        a3 = complex<Float>( r(0,0),-r(1,1) );
        complex<Float> tmp0;

        for ( int j = 0; j < NCOLORS; j++ ) {
          tmp0 = a0 * U(p,j) + a1 * U(q,j);
          U(q,j) = a2 * U(p,j) + a3 * U(q,j);
          U(p,j) = tmp0;
        }
        //FLOP_min = (NCOLORS * 64 + 19 + 28 + 28) * 3 = NCOLORS * 192 + 225
      }
    }
    else if ( NCOLORS > 3 ) {
      //TESTED IN SU(4) SP THIS IS WORST
      // form the full product once and keep it updated alongside U
      Matrix<complex<Float>,NCOLORS> M = U * F;
      for ( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
        int2 id = IndexBlock<NCOLORS>( block );
        Matrix<Float,2> r = get_block_su2<Float>(M, id);
        Float k = sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));
        Float ap = BetaOverNc * k;
        k = 1.0 / k;
        r *= k;
        Matrix<Float,2> a = generate_su2_matrix_milc<Float>(ap, localState);
        Matrix<Float,2> rr = mulsu2UVDagger<Float>( a, r);
        // apply the update to both U and the cached product M
        mul_block_sun<Float, NCOLORS>( rr, U, id);
        mul_block_sun<Float, NCOLORS>( rr, M, id);
      }
      /* / TESTED IN SU(4) SP THIS IS FASTER
      for ( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
      int2 id = IndexBlock<NCOLORS>( block );
      complex a0 = complex::zero();
      complex a1 = complex::zero();
      complex a2 = complex::zero();
      complex a3 = complex::zero();

      for ( int j = 0; j < NCOLORS; j++ ) {
      a0 += U(id.x, j) * F.e[j][id.x];
      a1 += U(id.x, j) * F.e[j][id.y];
      a2 += U(id.y, j) * F.e[j][id.x];
      a3 += U(id.y, j) * F.e[j][id.y];
      }
      Matrix<T,2> r;
      r(0,0) = a0.x + a3.x;
      r(0,1) = a1.y + a2.y;
      r(1,0) = a1.x - a2.x;
      r(1,1) = a0.y - a3.y;
      T k = sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));
      T ap = BetaOverNc * k;
      k = (T)1.0 / k;
      r *= k;
      //Matrix<T,2> a = generate_su2_matrix<T4, T>(ap, localState);
      Matrix<T,2> a = generate_su2_matrix_milc<T>(ap, localState);
      r = mulsu2UVDagger<T>( a, r);
      mul_block_sun<T>( r, U, id); */
      /*
      a0 = complex( r(0,0), r(1,1) );
      a1 = complex( r(1,0), r(0,1) );
      a2 = complex(-r(1,0), r(0,1) );
      a3 = complex( r(0,0),-r(1,1) );
      complex tmp0;

      for ( int j = 0; j < NCOLORS; j++ ) {
      tmp0 = a0 * U(id.x, j) + a1 * U(id.y, j);
      U(id.y, j) = a2 * U(id.x, j) + a3 * U(id.y, j);
      U(id.x, j) = tmp0;
      } */
      // }

    }
  }
431 
433 
  /**
     @brief Overrelaxation update of one link over its SU(2) subgroups:
     for each subgroup, the normalized-conjugated SU(2) projection of U*F
     is applied twice (a reflection), which preserves the action.
     @param[in,out] U link matrix, updated in place
     @param F staple for this link (passed by value)
   */
  template <class Float, int NCOLORS>
  __device__ inline void overrelaxationSUN( Matrix<complex<Float>,NCOLORS>& U, Matrix<complex<Float>,NCOLORS> F ){

    if ( NCOLORS == 3 ) {
      /* alternative kept for reference: recompute tmp1 = U * F per block
      for( int block = 0; block < 3; block++ ) {
      Matrix<complex<T>,3> tmp1 = U * F;
      Matrix<T,2> r = get_block_su2<T>(tmp1, block);
      //normalize and conjugate
      Float norm = 1.0 / sqrt(r(0,0)*r(0,0)+r(0,1)*r(0,1)+r(1,0)*r(1,0)+r(1,1)*r(1,1));;
      r(0,0) *= norm;
      r(0,1) *= -norm;
      r(1,0) *= -norm;
      r(1,1) *= -norm;
      complex a00 = complex( r(0,0), r(1,1) );
      complex a01 = complex( r(1,0), r(0,1) );
      complex a10 = complex(-r(1,0), r(0,1) );
      complex a11 = complex( r(0,0),-r(1,1) );
      block_su2_to_su3<T>( U, a00, a01, a10, a11, block );
      block_su2_to_su3<T>( U, a00, a01, a10, a11, block );

      //FLOP = (198 + 17 + 84 * 2) * 3 = 1149
      }*/
      //This version does not need to multiply all matrix at each block: tmp1 = U * F;

      for ( int block = 0; block < 3; block++ ) {
        int p,q;
        IndexBlock<NCOLORS>(block, p, q);
        complex<Float> a0((Float)0., (Float)0.);
        complex<Float> a1 = a0;
        complex<Float> a2 = a0;
        complex<Float> a3 = a0;

        // only the (p,q) 2x2 sub-block of U*F is needed: accumulate it directly
        for ( int j = 0; j < NCOLORS; j++ ) {
          a0 += U(p,j) * F(j,p);
          a1 += U(p,j) * F(j,q);
          a2 += U(q,j) * F(j,p);
          a3 += U(q,j) * F(j,q);
        }
        Matrix<Float,2> r;
        r(0,0) = a0.x + a3.x;
        r(0,1) = a1.y + a2.y;
        r(1,0) = a1.x - a2.x;
        r(1,1) = a0.y - a3.y;
        //normalize and conjugate
        //r = r.conj_normalize();
        Float norm = 1.0 / sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
        r(0,0) *= norm;
        r(0,1) *= -norm;
        r(1,0) *= -norm;
        r(1,1) *= -norm;


        // embed r and apply it twice in one pass: U <- r * (r * U) on rows p, q
        a0 = complex<Float>( r(0,0), r(1,1) );
        a1 = complex<Float>( r(1,0), r(0,1) );
        a2 = complex<Float>(-r(1,0), r(0,1) );
        a3 = complex<Float>( r(0,0),-r(1,1) );
        complex<Float> tmp0, tmp1;

        for ( int j = 0; j < NCOLORS; j++ ) {
          tmp0 = a0 * U(p,j) + a1 * U(q,j);
          tmp1 = a2 * U(p,j) + a3 * U(q,j);
          U(p,j) = a0 * tmp0 + a1 * tmp1;
          U(q,j) = a2 * tmp0 + a3 * tmp1;
        }
        //FLOP = (NCOLORS * 88 + 17) * 3
      }
    }
    else if ( NCOLORS > 3 ) {
      // form the full product once and keep it updated alongside U
      Matrix<complex<Float>,NCOLORS> M = U * F;
      for ( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
        int2 id = IndexBlock<NCOLORS>( block );
        Matrix<Float,2> r = get_block_su2<Float, NCOLORS>(M, id);
        //normalize and conjugate
        Float norm = 1.0 / sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
        r(0,0) *= norm;
        r(0,1) *= -norm;
        r(1,0) *= -norm;
        r(1,1) *= -norm;
        // apply the reflection twice to both U and the cached product M
        mul_block_sun<Float, NCOLORS>( r, U, id);
        mul_block_sun<Float, NCOLORS>( r, U, id);
        mul_block_sun<Float, NCOLORS>( r, M, id);
        mul_block_sun<Float, NCOLORS>( r, M, id);
      }
      /* //TESTED IN SU(4) SP THIS IS WORST
      for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
      int2 id = IndexBlock<NCOLORS>( block );
      complex a0 = complex::zero();
      complex a1 = complex::zero();
      complex a2 = complex::zero();
      complex a3 = complex::zero();

      for(int j = 0; j < NCOLORS; j++){
      a0 += U(id.x, j) * F.e[j][id.x];
      a1 += U(id.x, j) * F.e[j][id.y];
      a2 += U(id.y, j) * F.e[j][id.x];
      a3 += U(id.y, j) * F.e[j][id.y];
      }
      Matrix<T,2> r;
      r(0,0) = a0.x + a3.x;
      r(0,1) = a1.y + a2.y;
      r(1,0) = a1.x - a2.x;
      r(1,1) = a0.y - a3.y;
      //normalize and conjugate
      Float norm = 1.0 / sqrt(r(0,0)*r(0,0)+r(0,1)*r(0,1)+r(1,0)*r(1,0)+r(1,1)*r(1,1));;
      r(0,0) *= norm;
      r(0,1) *= -norm;
      r(1,0) *= -norm;
      r(1,1) *= -norm;
      //mul_block_sun<T>( r, U, id);
      //mul_block_sun<T>( r, U, id);
      a0 = complex( r(0,0), r(1,1) );
      a1 = complex( r(1,0), r(0,1) );
      a2 = complex(-r(1,0), r(0,1) );
      a3 = complex( r(0,0),-r(1,1) );
      complex tmp0, tmp1;

      for(int j = 0; j < NCOLORS; j++){
      tmp0 = a0 * U(id.x, j) + a1 * U(id.y, j);
      tmp1 = a2 * U(id.x, j) + a3 * U(id.y, j);
      U(id.x, j) = a0 * tmp0 + a1 * tmp1;
      U(id.y, j) = a2 * tmp0 + a3 * tmp1;
      }
      }
      */
    }
  }
574 
575 
  /**
     @brief Kernel argument container for the heatbath/overrelaxation kernels.
   */
  template <typename Gauge, typename Float, int NCOLORS>
  struct MonteArg {
    int threads; // number of active threads required
    int X[4]; // grid dimensions (local interior volume when MULTI_GPU)
#ifdef MULTI_GPU
    int border[4]; // halo radius in each direction
#endif
    Gauge dataOr;         // gauge field accessor used by the kernel
    cudaGaugeField &data; // gauge field handle (needed for autotuner backup/restore)
    Float BetaOverNc;     // beta / NCOLORS
    RNG rngstate;         // CURAND states, one per thread (heatbath only)
    MonteArg(const Gauge &dataOr, cudaGaugeField & data, Float Beta, RNG &rngstate)
      : dataOr(dataOr), data(data), rngstate(rngstate) {
      BetaOverNc = Beta / (Float)NCOLORS;
#ifdef MULTI_GPU
      // strip the communication halo: X[] is the interior volume only
      for ( int dir = 0; dir < 4; ++dir ) {
        border[dir] = data.R()[dir];
        X[dir] = data.X()[dir] - border[dir] * 2;
      }
#else
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
#endif
      threads = X[0] * X[1] * X[2] * X[3] >> 1; // checkerboard (half-lattice) volume
    }
  };
601 
602 
  /**
     @brief Kernel performing a heatbath (HeatbathOrRelax = true) or
     overrelaxation (HeatbathOrRelax = false) update of all links in
     direction mu with the given checkerboard parity. One thread per
     even/odd lattice site; each thread builds the staple for its link,
     then updates and stores the link.
   */
  template<typename Float, typename Gauge, int NCOLORS, bool HeatbathOrRelax>
  __global__ void compute_heatBath(MonteArg<Gauge, Float, NCOLORS> arg, int mu, int parity){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx >= arg.threads ) return;
    int id = idx; // keep the original thread id: it indexes the RNG state array
    int X[4];
    #pragma unroll
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];

    int x[4];
    getCoords(x, idx, X, parity);
#ifdef MULTI_GPU
    // shift coordinates into the interior of the extended (halo) lattice
    #pragma unroll
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];
      X[dr] += 2 * arg.border[dr];
    }
    idx = linkIndex(x,X);
#endif

    Matrix<complex<Float>,NCOLORS> staple;
    setZero(&staple);

    Matrix<complex<Float>,NCOLORS> U;
    // accumulate the staple: two plaquette contributions per direction nu != mu
    for ( int nu = 0; nu < 4; nu++ ) if ( mu != nu ) {
      int dx[4] = { 0, 0, 0, 0 };
      Matrix<complex<Float>,NCOLORS> link;
      // "upper" contribution: link(nu,x) * link(mu,x+nu) * conj(link(nu,x+mu))
      arg.dataOr.load((Float*)(link.data), idx, nu, parity);
      dx[nu]++;
      arg.dataOr.load((Float*)(U.data), linkIndexShift(x,dx,X), mu, 1 - parity);
      link *= U;
      dx[nu]--;
      dx[mu]++;
      arg.dataOr.load((Float*)(U.data), linkIndexShift(x,dx,X), nu, 1 - parity);
      link *= conj(U);
      staple += link;
      // "lower" contribution: conj(link(nu,x-nu)) * link(mu,x-nu) * link(nu,x-nu+mu)
      dx[mu]--;
      dx[nu]--;
      arg.dataOr.load((Float*)(link.data), linkIndexShift(x,dx,X), nu, 1 - parity);
      arg.dataOr.load((Float*)(U.data), linkIndexShift(x,dx,X), mu, 1 - parity);
      link = conj(link) * U;
      dx[mu]++;
      arg.dataOr.load((Float*)(U.data), linkIndexShift(x,dx,X), nu, parity);
      link *= U;
      staple += link;
    }
    arg.dataOr.load((Float*)(U.data), idx, mu, parity);
    if ( HeatbathOrRelax ) {
      cuRNGState localState = arg.rngstate.State()[ id ];
      heatBathSUN<Float, NCOLORS>( U, conj(staple), localState, arg.BetaOverNc );
      arg.rngstate.State()[ id ] = localState; // persist the advanced RNG state
    }
    else{
      overrelaxationSUN<Float, NCOLORS>( U, conj(staple) );
    }
    arg.dataOr.save((Float*)(U.data), idx, mu, parity);
  }
660 
661 
  /**
     @brief Tunable wrapper around the compute_heatBath kernel launch.
     HeatbathOrRelax selects heatbath (true) or overrelaxation (false);
     NElems is the number of real elements per stored link (reconstruction).
   */
  template<typename Float, typename Gauge, int NCOLORS, int NElems, bool HeatbathOrRelax>
  class GaugeHB : Tunable {
    MonteArg<Gauge, Float, NCOLORS> arg;
    int mu;     // direction updated by the next apply()
    int parity; // checkerboard parity updated by the next apply()
    mutable char aux_string[128]; // used as a label in the autotuner
    private:
    unsigned int sharedBytesPerThread() const {
      return 0;
    }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const {
      return 0;
    }
    //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
    bool tuneGridDim() const {
      return false;
    } // Don't tune the grid dimensions.
    unsigned int minThreads() const {
      return arg.threads;
    }

    public:
    GaugeHB(MonteArg<Gauge, Float, NCOLORS> &arg)
      : arg(arg), mu(0), parity(0) {
    }
    ~GaugeHB () {
    }
    // select which direction/parity the next apply() updates
    void SetParam(int _mu, int _parity){
      mu = _mu;
      parity = _parity;
    }
    void apply(const cudaStream_t &stream){
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      compute_heatBath<Float, Gauge, NCOLORS, HeatbathOrRelax ><<< tp.grid,tp.block, tp.shared_bytes, stream >>> (arg, mu, parity);
    }

    TuneKey tuneKey() const {
      std::stringstream vol;
      vol << arg.X[0] << "x";
      vol << arg.X[1] << "x";
      vol << arg.X[2] << "x";
      vol << arg.X[3];
      sprintf(aux_string,"threads=%d,prec=%lu",arg.threads, sizeof(Float));
      return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
    }

    // back up the field (and the RNG state for heatbath) so autotuning
    // trial launches can be rolled back without perturbing the simulation
    void preTune() {
      arg.data.backup();
      if(HeatbathOrRelax) arg.rngstate.backup();
    }
    void postTune() {
      arg.data.restore();
      if(HeatbathOrRelax) arg.rngstate.restore();
    }
    long long flops() const {

      //NEED TO CHECK THIS!!!!!!
      if ( NCOLORS == 3 ) {
        long long flop = 2268LL; // staple construction cost per site
        if ( HeatbathOrRelax ) {
          flop += 801LL;
        }
        else{
          flop += 843LL;
        }
        flop *= arg.threads;
        return flop;
      }
      else{
        long long flop = NCOLORS * NCOLORS * NCOLORS * 84LL;
        if ( HeatbathOrRelax ) {
          flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (46LL + 48LL + 56LL * NCOLORS);
        }
        else{
          flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (17LL + 112LL * NCOLORS);
        }
        flop *= arg.threads;
        return flop;
      }
    }
    long long bytes() const {
      //NEED TO CHECK THIS!!!!!!
      // 20 link loads/stores per site (staple construction + update)
      if ( NCOLORS == 3 ) {
        long long byte = 20LL * NElems * sizeof(Float);
        if ( HeatbathOrRelax ) byte += 2LL * sizeof(cuRNGState); // RNG state load + store
        byte *= arg.threads;
        return byte;
      }
      else{
        long long byte = 20LL * NCOLORS * NCOLORS * 2 * sizeof(Float);
        if ( HeatbathOrRelax ) byte += 2LL * sizeof(cuRNGState);
        byte *= arg.threads;
        return byte;
      }
    }
  };
758 
759 
760 
761 
762 
763 
764 
765 
766 
767  template<typename Float, int NElems, int NCOLORS, typename Gauge>
768  void Monte( Gauge dataOr, cudaGaugeField& data, RNG &rngstate, Float Beta, int nhb, int nover) {
769 
770  TimeProfile profileHBOVR("HeatBath_OR_Relax", false);
771  MonteArg<Gauge, Float, NCOLORS> montearg(dataOr, data, Beta, rngstate);
772  if ( getVerbosity() >= QUDA_SUMMARIZE ) profileHBOVR.TPSTART(QUDA_PROFILE_COMPUTE);
773  GaugeHB<Float, Gauge, NCOLORS, NElems, true> hb(montearg);
774  for ( int step = 0; step < nhb; ++step ) {
775  for ( int parity = 0; parity < 2; ++parity ) {
776  for ( int mu = 0; mu < 4; ++mu ) {
777  hb.SetParam(mu, parity);
778  hb.apply(0);
779  #ifdef MULTI_GPU
780  PGaugeExchange( data, mu, parity);
781  #endif
782  }
783  }
784  }
785  if ( getVerbosity() >= QUDA_SUMMARIZE ) {
787  profileHBOVR.TPSTOP(QUDA_PROFILE_COMPUTE);
788  double secs = profileHBOVR.Last(QUDA_PROFILE_COMPUTE);
789  double gflops = (hb.flops() * 8 * nhb * 1e-9) / (secs);
790  double gbytes = hb.bytes() * 8 * nhb / (secs * 1e9);
791  #ifdef MULTI_GPU
792  printfQuda("HB: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops * comm_size(), gbytes * comm_size());
793  #else
794  printfQuda("HB: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
795  #endif
796  }
797 
798  if ( getVerbosity() >= QUDA_SUMMARIZE ) profileHBOVR.TPSTART(QUDA_PROFILE_COMPUTE);
799  GaugeHB<Float, Gauge, NCOLORS, NElems, false> relax(montearg);
800  for ( int step = 0; step < nover; ++step ) {
801  for ( int parity = 0; parity < 2; ++parity ) {
802  for ( int mu = 0; mu < 4; ++mu ) {
803  relax.SetParam(mu, parity);
804  relax.apply(0);
805  #ifdef MULTI_GPU
806  PGaugeExchange( data, mu, parity);
807  #endif
808  }
809  }
810  }
811  if ( getVerbosity() >= QUDA_SUMMARIZE ) {
813  profileHBOVR.TPSTOP(QUDA_PROFILE_COMPUTE);
814  double secs = profileHBOVR.Last(QUDA_PROFILE_COMPUTE);
815  double gflops = (relax.flops() * 8 * nover * 1e-9) / (secs);
816  double gbytes = relax.bytes() * 8 * nover / (secs * 1e9);
817  #ifdef MULTI_GPU
818  printfQuda("OVR: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops * comm_size(), gbytes * comm_size());
819  #else
820  printfQuda("OVR: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
821  #endif
822  }
823  }
824 
825 
826 
827  template<typename Float>
828  void Monte( cudaGaugeField& data, RNG &rngstate, Float Beta, int nhb, int nover) {
829 
830  if ( data.isNative() ) {
831  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
832  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
833  Monte<Float, 18, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
834  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
835  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
836  Monte<Float, 12, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
837  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
838  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
839  Monte<Float, 8, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
840  } else {
841  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
842  }
843  } else {
844  errorQuda("Invalid Gauge Order\n");
845  }
846  }
847 #endif // GPU_GAUGE_ALG
848 
857  void Monte( cudaGaugeField& data, RNG &rngstate, double Beta, int nhb, int nover) {
858 #ifdef GPU_GAUGE_ALG
859  if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
860  Monte<float> (data, rngstate, (float)Beta, nhb, nover);
861  } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
862  Monte<double>(data, rngstate, Beta, nhb, nover);
863  } else {
864  errorQuda("Precision %d not supported", data.Precision());
865  }
866 #else
867  errorQuda("Pure gauge code has not been built");
868 #endif // GPU_GAUGE_ALG
869  }
870 
871 
872 }
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:592
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define PII
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:85
#define errorQuda(...)
Definition: util_quda.h:90
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
void PGaugeExchange(cudaGaugeField &data, const int dir, const int parity)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
int comm_size(void)
Definition: comm_mpi.cpp:126
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40
def id
projector matrices ######################################################################## ...
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
#define tmp1
Definition: tmc_core.h:15
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543
__host__ __device__ ValueType log(ValueType x)
Definition: complex_quda.h:90
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
#define printfQuda(...)
Definition: util_quda.h:84
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:110
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
void Monte(cudaGaugeField &data, RNG &rngstate, double Beta, int nhb, int nover)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
#define tmp0
Definition: tmc_core.h:14
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)