quda-ref/v1.0.0/pgauge__init_8cu_source.html

 #include <quda_internal.h>
 #include <quda_matrix.h>
 #include <tune_quda.h>
 #include <gauge_field.h>
 #include <gauge_field_order.h>
 #include <launch_kernel.cuh>
 #include <comm_quda.h>
 #include <unitarization_links.h>
 #include <pgauge_monte.h>
 #include <random_quda.h>
 #include <cub_helper.cuh>
 #include <index_helper.cuh>


 // Mathematical constants, each guarded so that a definition made earlier (e.g. by an
 // included header) is kept. PII is used by randomSU2() below to draw an angle;
 // PI is not referenced in the part of the file shown here.
 #ifndef PI
 #define PI    3.1415926535897932384626433832795    // pi
 #endif
 #ifndef PII
 #define PII   6.2831853071795864769252867665590    // 2 * pi
 #endif

 namespace quda {

 #ifdef GPU_GAUGE_ALG

   /**
      @brief Kernel argument bundle for the cold-start (identity) initialization.

      Stores a copy of the gauge accessor together with the lattice extents
      reported by the field, and the total number of sites, which is the
      number of threads the kernel needs.
   */
   template <typename Gauge>
   struct InitGaugeColdArg {
     int threads;  // number of active threads required (product of the four extents)
     int X[4];     // lattice extents copied from data.X()
     Gauge dataOr; // accessor the kernel writes the links through
     InitGaugeColdArg(const Gauge &dataOr, const cudaGaugeField &data)
       : dataOr(dataOr) {
       threads = 1;
       for ( int mu = 0; mu < 4; ++mu ) {
         X[mu] = data.X()[mu];
         threads *= X[mu];
       }
     }
   };

   /**
      @brief Cold-start kernel: writes the identity matrix to all four links of a site.

      Launch layout: 1-D grid, one thread per lattice site (arg.threads in total);
      surplus threads exit immediately. The lower half of the thread range is
      assigned parity 0 and the upper half parity 1, each with an index in
      [0, arg.threads/2).

      @param arg Accessor and site count, see InitGaugeColdArg
   */
   template<typename Float, typename Gauge, int NCOLORS>
   __global__ void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge> arg){
     const int tid = threadIdx.x + blockIdx.x * blockDim.x;
     if ( tid >= arg.threads ) return;

     // split the flat thread index into (parity, index within that parity)
     const int half = arg.threads / 2;
     const int parity = ( tid >= half ) ? 1 : 0;
     const int site = tid - parity * half;

     Matrix<complex<Float>,NCOLORS> unit;
     setIdentity(&unit);
     for ( int mu = 0; mu < 4; mu++ ) arg.dataOr(mu, site, parity) = unit;
   }


   /**
      @brief Autotuned launcher for compute_InitGauge_ColdStart.

      Only the block size is tuned (grid-dimension tuning is disabled); the
      kernel uses no shared memory and needs at least arg.threads threads.
   */
   template<typename Float, typename Gauge, int NCOLORS>
   class InitGaugeCold : Tunable {
     InitGaugeColdArg<Gauge> arg;
     mutable char aux_string[128]; // used as a label in the autotuner
     private:
     unsigned int sharedBytesPerThread() const {
       return 0;
     }
     unsigned int sharedBytesPerBlock(const TuneParam &param) const {
       return 0;
     }
     //bool tuneSharedBytes() const { return false; }  // Don't tune shared memory
     bool tuneGridDim() const {
       return false;
     }                                        // Don't tune the grid dimensions.
     unsigned int minThreads() const {
       return arg.threads;
     }

     public:
     InitGaugeCold(InitGaugeColdArg<Gauge> &arg)
       : arg(arg) {
     }
     ~InitGaugeCold () {
     }

     /**
        @brief Launch the cold-start kernel with the tuned launch geometry.
        @param stream CUDA stream the kernel is enqueued on
     */
     void apply(const cudaStream_t &stream){
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       // Enqueue on the caller's stream (previously the argument was ignored and the
       // default stream was always used). No dynamic shared memory is requested.
       compute_InitGauge_ColdStart<Float, Gauge, NCOLORS> <<< tp.grid, tp.block, 0, stream >>> (arg);
       //cudaDeviceSynchronize();
     }

     /**
        @brief Key identifying this kernel instance in the tuning cache:
        lattice volume string, type name, and "threads=..,prec=.." label.
     */
     TuneKey tuneKey() const {
       std::stringstream vol;
       vol << arg.X[0] << "x";
       vol << arg.X[1] << "x";
       vol << arg.X[2] << "x";
       vol << arg.X[3];
       // bounded write; size_t is cast so that it matches the %lu conversion everywhere
       snprintf(aux_string, sizeof(aux_string), "threads=%d,prec=%lu", arg.threads, (unsigned long)sizeof(Float));
       return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

     }

     long long flops() const {
       return 0;
     }                                  // flop count is not tracked for this kernel
     long long bytes() const {
       return 0;
     }                                  // memory traffic is not tracked for this kernel

   };

   /**
      @brief Cold start for a concrete accessor type: build the kernel argument,
      launch the tuned kernel on stream 0 and check for CUDA errors.
      @param dataOr Accessor wrapping the gauge field storage
      @param data   Gauge field being initialized
   */
   template<typename Float, int NCOLORS, typename Gauge>
   void InitGaugeField( Gauge dataOr,  cudaGaugeField& data) {
     InitGaugeColdArg<Gauge> coldArg(dataOr, data);
     InitGaugeCold<Float, Gauge, NCOLORS> coldStart(coldArg);
     coldStart.apply(0);
     checkCudaError();
   }


   /**
      @brief Cold start for a given precision: select the accessor matching the
      field's reconstruction type (none, 12 or 8) and forward to the
      accessor-level overload with three colors.

      Reports an error for non-native field orders and for any other
      reconstruction type.
      @param data Gauge field being initialized
   */
   template<typename Float>
   void InitGaugeField( cudaGaugeField& data) {

     if ( !data.isNative() ) {
       errorQuda("Invalid Gauge Order\n");
       return;
     }

     switch ( data.Reconstruct() ) {
     case QUDA_RECONSTRUCT_NO: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data);
       break;
     }
     case QUDA_RECONSTRUCT_12: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data);
       break;
     }
     case QUDA_RECONSTRUCT_8: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data);
       break;
     }
     default:
       errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
     }

   }

   // The cold-start entry point is compiled unconditionally, like the hot-start
   // entry point at the end of this file: the GPU_GAUGE_ALG guard is closed here
   // and re-opened right after the function.
 #endif // GPU_GAUGE_ALG

   /**
      @brief Cold start: set every link of the gauge field to the identity matrix.

      Dispatches on the field precision (single or double). When the library
      is built without GPU_GAUGE_ALG the call reports an error instead of
      leaving the symbol undefined.
      @param data Gauge field being initialized
   */
   void InitGaugeField( cudaGaugeField& data) {
 #ifdef GPU_GAUGE_ALG
     if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
       InitGaugeField<float> (data);
     } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
       InitGaugeField<double>(data);
     } else {
       errorQuda("Precision %d not supported", data.Precision());
     }
 #else
     errorQuda("Pure gauge code has not been built");
 #endif
   }

 #ifdef GPU_GAUGE_ALG // re-open the guard for the implementation code that follows


   /**
      @brief Kernel argument bundle for the hot-start (random) initialization.

      Holds the gauge accessor, a copy of the RNG handle and the lattice
      extents. In MULTI_GPU builds the extents exclude the border region
      data.R() on both sides, and the border widths are stored as well.
   */
   template <typename Gauge>
   struct InitGaugeHotArg {
     int threads; // number of active threads required
     int X[4]; // grid dimensions
     RNG rngstate;
 #ifdef MULTI_GPU
     int border[4];
 #endif
     Gauge dataOr;
     InitGaugeHotArg(const Gauge &dataOr, const cudaGaugeField &data, RNG &rngstate)
       : rngstate(rngstate), dataOr(dataOr) {
       for ( int mu = 0; mu < 4; ++mu ) {
 #ifdef MULTI_GPU
         border[mu] = data.R()[mu];
         X[mu] = data.X()[mu] - 2 * border[mu];
 #else
         X[mu] = data.X()[mu];
 #endif
       }
       // One thread (and one RNG state) per half-volume site: the number of RNG
       // states in the rngstate array is expected to equal half the lattice
       // volume, the same number the heatbath uses.
       threads = ( X[0] * X[1] * X[2] * X[3] ) >> 1;
     }
   };


   /**
      @brief Project a 3x3 complex matrix onto a special-unitary one, in place.

      Row 0 is normalized, row 1 is made orthogonal to row 0 and normalized,
      and row 2 is rebuilt as the complex conjugate of the cross product of
      rows 0 and 1. No guard is applied against a row of zero norm.

      Operation counts noted by the original author:
      14 + 6 + 24 + 24 + 14 + 6 + 42 = 130.

      @param U Matrix to reunitarize (overwritten)
   */
   template <typename Float>
   __host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U ){

     // row 0: accumulate the squared norm, then scale by its inverse square root
     Float scale = 0.0;
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) scale += norm(U(0,col));
     scale = (Float)1.0 / sqrt(scale);
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) U(0,col) *= scale;

     // row 1: subtract its projection onto the (now unit-length) row 0
     complex<Float> overlap((Float)0.0, (Float)0.0);
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) overlap += conj(U(0,col)) * U(1,col);
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) U(1,col) -= overlap * U(0,col);

     // row 1: normalize
     scale = 0.0;
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) scale += norm(U(1,col));
     scale = (Float)1.0 / sqrt(scale);
 #pragma unroll
     for ( int col = 0; col < 3; col++ ) U(1, col) *= scale;

     // row 2: reconstruct the last row from the first two
     U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
     U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
     U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
   }


   /**
      @brief Draw four real numbers with unit sum of squares, returned in a 2x2
      real matrix (the layout mul_block_sun() consumes as an SU(2) element).

      The generator is advanced four times, in this order: a(0,0), cos(theta),
      the angle phi, and one raw curand() draw whose lowest bit picks the sign
      of sin(theta).

      @param localState Per-thread RNG state (advanced)
      @return Matrix holding the four components
   */
   template <class T>
   __device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
     Matrix<T,2> a;

     const T a0 = Random<T>(localState, (T)-1.0, (T)1.0);
     a(0,0) = a0;
     // length shared by the three remaining components
     const T radius = sqrt( 1.0 - a0 * a0 );

     const T cos_theta = Random<T>(localState, (T)-1.0, (T)1.0);
     const T phi = PII * Random<T>(localState);
     const int sign = ( curand(&localState) & 1 ) ? 1 : -1;
     const T sin_theta = sign * sqrt( (T)1.0 - cos_theta * cos_theta );

     a(0,1) = radius * sin_theta * cos( phi );
     a(1,0) = radius * sin_theta * sin( phi );
     a(1,1) = radius * cos_theta;
     return a;
   }


   /**
      @brief Left-multiply rows id.x and id.y of link by the 2x2 complex block
      built from the four real numbers in u.

      The block is
        [  (u00 + i u11)   (u10 + i u01) ]
        [ (-u10 + i u01)   (u00 - i u11) ]
      and it is applied column by column.

      @param u    Four real parameters of the 2x2 block
      @param link Matrix updated in place
      @param id   Pair of row indices (x, y) the block acts on
   */
   template <class T, int NCOLORS>
   __host__ __device__ static inline void mul_block_sun( Matrix<T,2> u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
     // the four complex entries of the block, built once
     const complex<T> m_xx(  u(0,0),  u(1,1) );
     const complex<T> m_xy(  u(1,0),  u(0,1) );
     const complex<T> m_yx( -u(1,0),  u(0,1) );
     const complex<T> m_yy(  u(0,0), -u(1,1) );

     for ( int col = 0; col < NCOLORS; col++ ) {
       const complex<T> new_x = m_xx * link(id.x, col) + m_xy * link(id.y, col);
       const complex<T> new_y = m_yx * link(id.x, col) + m_yy * link(id.y, col);
       // row y is stored before row x, matching the original write order
       link(id.y, col) = new_y;
       link(id.x, col) = new_x;
     }
   }


   /**
      @brief Map a block number to a pair of row indices (x < y).

      Pairs are enumerated by increasing separation y - x, and by increasing x
      within a separation; for NCOLORS = 3 this gives 0 -> (0,1), 1 -> (1,2),
      2 -> (0,2).

      @param block Block number, expected in [0, NCOLORS*(NCOLORS-1)/2)
      @return The pair as int2 {x, y}. A block number outside that range yields
      {1, NCOLORS}, which is not a valid row pair.
   */
   template<int NCOLORS>
   __host__ __device__ static inline int2 IndexBlock(int block){
     int2 id;
     int counter = -1;
     for ( int gap = 1; gap < NCOLORS; gap++ ) {
       for ( int row = 0; row < NCOLORS - gap; row++ ) {
         if ( ++counter == block ) {
           id.x = row;
           id.y = row + gap;
           return id;
         }
       }
     }
     // no pair matched: same result the exhausted search produced before
     id.x = 1;
     id.y = NCOLORS;
     return id;
   }

   /**
      @brief Generate a random link matrix.

      Every entry is filled with a complex number whose real and imaginary
      parts are each Random<Float>(state) - 0.5; the result is then projected
      with reunit_link().

      The real part is always drawn before the imaginary part. The two draws
      used to be arguments of a single constructor call, whose evaluation
      order is unspecified in C++, so the generated field could differ
      between compilers for the same seed.

      @param localState Per-thread RNG state (advanced by 2*NCOLORS*NCOLORS draws)
      @return The reunitarized random matrix
   */
   template <class Float, int NCOLORS>
   __device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState ){
     Matrix<complex<Float>,NCOLORS> U;

     for ( int i = 0; i < NCOLORS; i++ ) {
       for ( int j = 0; j < NCOLORS; j++ ) {
         // separate statements fix the order in which the generator is advanced
         const Float re = (Float)(Random<Float>(localState) - 0.5);
         const Float im = (Float)(Random<Float>(localState) - 0.5);
         U(i,j) = complex<Float>( re, im );
       }
     }
     reunit_link<Float>(U);
     return U;

     /* Disabled alternative construction, kept for reference:
        setIdentity(&U);
        for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
        Matrix<Float,2> rr = randomSU2<Float>(localState);
        int2 id = IndexBlock<NCOLORS>( block );
        mul_block_sun<Float, NCOLORS>(rr, U, id);
        //U = block_su2_to_su3<Float>( U, a00, a01, a10, a11, block );
        }
        return U;*/
   }

   /**
      @brief Hot-start kernel: fills the links with random matrices.

      Launch layout: 1-D grid with one thread per half-volume index
      (arg.threads in total); surplus threads exit immediately. Each thread
      loads the RNG state stored at its thread index, generates the four links
      of its site for parity 0 and then parity 1, and stores the advanced RNG
      state back to the same slot.

      In MULTI_GPU builds the thread index refers to the lattice without the
      border; the coordinates are shifted by the border width and converted to
      an index in the full (extended) lattice before writing.

      @param arg Accessor, RNG handle and geometry, see InitGaugeHotArg
   */
   template<typename Float, typename Gauge, int NCOLORS>
   __global__ void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge> arg){
     int idx = threadIdx.x + blockIdx.x * blockDim.x;
     if ( idx >= arg.threads ) return;

     // slot of this thread's RNG state; unlike idx it is never remapped
     int rng_id = idx;
     cuRNGState rng = arg.rngstate.State()[ rng_id ];
   #ifdef MULTI_GPU
     // extents of the full lattice including the border on both sides
     int full_dims[4];
     for ( int mu = 0; mu < 4; ++mu ) full_dims[mu] = arg.X[mu] + 2 * arg.border[mu];
   #endif
     for ( int parity = 0; parity < 2; parity++ ) {
     #ifdef MULTI_GPU
       int coord[4];
       getCoords(coord, rng_id, arg.X, parity);
       for ( int mu = 0; mu < 4; ++mu ) coord[mu] += arg.border[mu];
       idx = linkIndex(coord, full_dims);
     #endif
       for ( int mu = 0; mu < 4; mu++ ) {
         Matrix<complex<Float>,NCOLORS> link = randomize<Float, NCOLORS>(rng);
         arg.dataOr(mu, idx, parity) = link;
       }
     }
     arg.rngstate.State()[ rng_id ] = rng;
   }


   /**
      @brief Autotuned launcher for compute_InitGauge_HotStart.

      Only the block size is tuned (shared-memory and grid-dimension tuning
      are disabled). Because the kernel advances the RNG, the RNG state is
      backed up before tuning and restored afterwards.
   */
   template<typename Float, typename Gauge, int NCOLORS>
   class InitGaugeHot : Tunable {
     InitGaugeHotArg<Gauge> arg;
     mutable char aux_string[128]; // used as a label in the autotuner
     private:
     unsigned int sharedBytesPerThread() const {
       return 0;
     }
     unsigned int sharedBytesPerBlock(const TuneParam &param) const {
       return 0;
     }
     bool tuneSharedBytes() const {
       return false;
     }                                            // Don't tune shared memory
     bool tuneGridDim() const {
       return false;
     }                                        // Don't tune the grid dimensions.
     unsigned int minThreads() const {
       return arg.threads;
     }

     public:
     InitGaugeHot(InitGaugeHotArg<Gauge> &arg)
       : arg(arg) {
     }
     ~InitGaugeHot () {
     }

     /**
        @brief Launch the hot-start kernel with the tuned launch geometry.
        @param stream CUDA stream the kernel is enqueued on
     */
     void apply(const cudaStream_t &stream){
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       // Enqueue on the caller's stream (previously the argument was ignored and the
       // default stream was always used). No dynamic shared memory is requested.
       compute_InitGauge_HotStart<Float, Gauge, NCOLORS> <<< tp.grid, tp.block, 0, stream >>> (arg);
       //cudaDeviceSynchronize();
     }

     /**
        @brief Key identifying this kernel instance in the tuning cache:
        lattice volume string, type name, and "threads=..,prec=.." label.
     */
     TuneKey tuneKey() const {
       std::stringstream vol;
       vol << arg.X[0] << "x";
       vol << arg.X[1] << "x";
       vol << arg.X[2] << "x";
       vol << arg.X[3];
       // The format previously ended in "%lud", which appended a literal 'd' to the
       // precision. The write is now bounded and size_t is cast to match %lu.
       snprintf(aux_string, sizeof(aux_string), "threads=%d,prec=%lu", arg.threads, (unsigned long)sizeof(Float));
       return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

     }

     // tuning runs the kernel repeatedly; keep the RNG stream unaffected by it
     void preTune(){ arg.rngstate.backup(); }
     void postTune(){ arg.rngstate.restore(); }
     long long flops() const {
       return 0;
     }                                  // flop count is not tracked for this kernel
     long long bytes() const {
       return 0;
     }                                  // memory traffic is not tracked for this kernel

   };


   /**
      @brief Hot start for a concrete accessor type.

      Builds the kernel argument, launches the tuned kernel on stream 0, checks
      for CUDA errors, waits for the device and finally calls
      exchangeExtendedGhost() with the field's own border widths and
      no_comms_fill = false.

      @param dataOr   Accessor wrapping the gauge field storage
      @param data     Gauge field being initialized
      @param rngstate RNG handle providing the per-thread states
   */
   template<typename Float, int NCOLORS, typename Gauge>
   void InitGaugeField( Gauge dataOr,  cudaGaugeField& data, RNG &rngstate) {
     InitGaugeHotArg<Gauge> hotArg(dataOr, data, rngstate);
     InitGaugeHot<Float, Gauge, NCOLORS> hotStart(hotArg);

     hotStart.apply(0);
     checkCudaError();
     qudaDeviceSynchronize();

     data.exchangeExtendedGhost(data.R(),false);
   }

   /**
      @brief Hot start for a given precision: select the accessor matching the
      field's reconstruction type (none, 12 or 8) and forward to the
      accessor-level overload with three colors.

      Reports an error for non-native field orders and for any other
      reconstruction type.
      @param data     Gauge field being initialized
      @param rngstate RNG handle providing the per-thread states
   */
   template<typename Float>
   void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {

     if ( !data.isNative() ) {
       errorQuda("Invalid Gauge Order\n");
       return;
     }

     switch ( data.Reconstruct() ) {
     case QUDA_RECONSTRUCT_NO: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
       break;
     }
     case QUDA_RECONSTRUCT_12: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
       break;
     }
     case QUDA_RECONSTRUCT_8: {
       typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
       InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
       break;
     }
     default:
       errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
     }
   }
 #endif // GPU_GAUGE_ALG

   /**
      @brief Hot start: fill the gauge field with random link matrices.

      Dispatches on the field precision (single or double). When the library
      is built without GPU_GAUGE_ALG the call only reports an error.
      @param data     Gauge field being initialized
      @param rngstate RNG handle providing the per-thread states
   */
   void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
 #ifdef GPU_GAUGE_ALG
     switch ( data.Precision() ) {
     case QUDA_SINGLE_PRECISION:
       InitGaugeField<float> (data, rngstate);
       break;
     case QUDA_DOUBLE_PRECISION:
       InitGaugeField<double>(data, rngstate);
       break;
     default:
       errorQuda("Precision %d not supported", data.Precision());
     }
 #else
     errorQuda("Pure gauge code has not been built");
 #endif
   }
 }
QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:67

quda::TuneParam
Definition: tune_quda.h:17

quda::cuRNGState
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17

quda::linkIndex
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Definition: index_helper.cuh:46

quda::norm
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:1092

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21

pgauge_monte.h

errorQuda
#define errorQuda(...)
Definition: util_quda.h:121

quda::IndexBlock
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
Definition: gauge_fix_ovr_hit_devf.cuh:36

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120

cub_helper.cuh

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cpp:897

tmp
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:21

quda
Definition: blas_cublas.h:5

param
QudaGaugeParam param
Definition: pack_test.cpp:17

comm_quda.h

quda::LatticeField::R
const int * R() const
Definition: lattice_field.h:536

quda::Tunable
Definition: tune_quda.h:59

qudaDeviceSynchronize
#define qudaDeviceSynchronize()
Definition: quda_cuda_api.h:145

launch_kernel.cuh

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:68

PII
#define PII
Definition: pgauge_init.cu:19

quda::sin
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51

quda::cudaGaugeField::exchangeExtendedGhost
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
Definition: cuda_gauge_field.cpp:510

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

quda::RNG
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643

quda::cudaGaugeField
Definition: gauge_field.h:404

quda::InitGaugeField
void InitGaugeField(cudaGaugeField &data)
Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-G...

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

X
int X[4]
Definition: covdev_test.cpp:70

quda::cublas::init
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31

tune_quda.h

quda::setIdentity
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:653

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:69

index
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:62

unitarization_links.h

index_helper.cuh

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:61

quda::gauge_mapper
Definition: gauge_field_order.h:3012

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:22

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:1076

quda::cos
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250

checkCudaError
#define checkCudaError()
Definition: util_quda.h:161

random_quda.h

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:546

quda::GaugeField::isNative
bool isNative() const
Definition: gauge_field.cpp:167

parity
QudaParity parity
Definition: covdev_test.cpp:54

gauge_field.h

quda::Matrix
Definition: quda_matrix.h:64

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:23

quda::getCoords
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
Definition: dslash_helper.cuh:88

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:499

quda_internal.h