QUDA  0.9.0
pgauge_init.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <unitarization_links.h>
9 #include <pgauge_monte.h>
10 #include <random_quda.h>
11 #include <cub/cub.cuh>
12 #include <index_helper.cuh>
13 
14 
15 #ifndef PI
16 #define PI 3.1415926535897932384626433832795 // pi
17 #endif
18 #ifndef PII
19 #define PII 6.2831853071795864769252867665590 // 2 * pi
20 #endif
21 
22 namespace quda {
23 
24 #ifdef GPU_GAUGE_ALG
25 
template <typename Gauge>
struct InitGaugeColdArg {
  int threads; // total number of lattice sites to initialize
  int X[4];    // local lattice dimensions
  Gauge dataOr;
  // Capture the lattice dimensions of data and accumulate the full volume.
  InitGaugeColdArg(const Gauge &dataOr, const cudaGaugeField &data)
    : dataOr(dataOr) {
    threads = 1;
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir];
      threads *= X[dir];
    }
  }
};
37 
38 
39 
40 
/**
 * Kernel for a cold start: set every link to the NCOLORS x NCOLORS identity
 * matrix. One thread per lattice site; the first half of the flat index range
 * covers even-parity sites and the second half odd-parity sites.
 */
template<typename Float, typename Gauge, int NCOLORS>
__global__ void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge> arg){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if ( id >= arg.threads ) return;
  const int half_volume = arg.threads / 2;
  const int parity = (id < half_volume) ? 0 : 1;
  const int x_cb = id - parity * half_volume; // checkerboard site index
  Matrix<complex<Float>,NCOLORS> U;
  setIdentity(&U);
  for ( int d = 0; d < 4; d++ )
    arg.dataOr.save((Float*)(U.data), x_cb, d, parity);
}
55 
56 
/**
 * Tunable wrapper that launches the cold-start kernel.
 */
template<typename Float, typename Gauge, int NCOLORS>
class InitGaugeCold : Tunable {
  InitGaugeColdArg<Gauge> arg;
  mutable char aux_string[128]; // label used by the autotuner
  private:
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
  bool tuneGridDim() const { return false; } // grid dims derive from minThreads()
  unsigned int minThreads() const { return arg.threads; }

  public:
  InitGaugeCold(InitGaugeColdArg<Gauge> &arg) : arg(arg) { }
  ~InitGaugeCold () { }

  /** Launch the kernel with autotuned launch parameters. */
  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    compute_InitGauge_ColdStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (arg);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
    sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  // Flop/byte counts are not tracked for this initialization kernel.
  long long flops() const { return 0; }
  long long bytes() const { return 0; }
};
108 
/** Build the cold-start argument struct and run the identity-fill kernel. */
template<typename Float, int NCOLORS, typename Gauge>
void InitGaugeField( Gauge dataOr, cudaGaugeField& data) {
  InitGaugeColdArg<Gauge> coldarg(dataOr, data);
  InitGaugeCold<Float, Gauge, NCOLORS> cold(coldarg);
  cold.apply(0);
  checkCudaError();
}
116 
117 
118 
/** Dispatch the cold start on the field's reconstruction type (SU(3) only). */
template<typename Float>
void InitGaugeField( cudaGaugeField& data) {
  if ( !data.isNative() ) errorQuda("Invalid Gauge Order\n");

  switch ( data.Reconstruct() ) {
  case QUDA_RECONSTRUCT_NO: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  case QUDA_RECONSTRUCT_12: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  case QUDA_RECONSTRUCT_8: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  default:
    errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
  }
}
140 
/** Cold-start entry point: dispatch on the field precision. */
void InitGaugeField( cudaGaugeField& data) {
  switch ( data.Precision() ) {
  case QUDA_SINGLE_PRECISION: InitGaugeField<float> (data); break;
  case QUDA_DOUBLE_PRECISION: InitGaugeField<double>(data); break;
  default: errorQuda("Precision %d not supported", data.Precision());
  }
}
156 
157 
158 
159 
160 
161 
162 
163 
template <typename Gauge>
struct InitGaugeHotArg {
  int threads; // number of active threads required (half the local volume)
  int X[4];    // grid dimensions (interior only when MULTI_GPU)
  RNG rngstate; // CURAND state array, one state per thread
#ifdef MULTI_GPU
  int border[4]; // halo depth in each direction, taken from data.R()
#endif
  Gauge dataOr;
  // Derive the interior lattice dimensions from the (possibly extended) field.
  InitGaugeHotArg(const Gauge &dataOr, const cudaGaugeField &data, RNG &rngstate)
    : dataOr(dataOr), rngstate(rngstate) {
#ifdef MULTI_GPU
    // strip the halo from each dimension: interior = full - 2 * border
    for ( int dir = 0; dir < 4; ++dir ) {
      border[dir] = data.R()[dir];
      X[dir] = data.X()[dir] - border[dir] * 2;
    }
#else
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
#endif
    //the optimal number of RNG states in rngstate array must be equal to half the lattice volume
    //this number is the same used in heatbath...
    threads = X[0] * X[1] * X[2] * X[3] >> 1;
  }
};
188 
189 
/**
 * Project a 3x3 complex matrix onto SU(3) by Gram-Schmidt:
 * normalize row 0, orthogonalize row 1 against row 0 and normalize it,
 * then rebuild the last row as the conjugate cross product of rows 0 and 1.
 * Approximate cost: 130 flops (14+6+24+24+14+6+42).
 */
template <typename Float>
__host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U ){

  // normalize the first row: divide by its 2-norm
  Float row_norm = 0.0;
#pragma unroll
  for ( int c = 0; c < 3; c++ ) row_norm += norm(U(0,c));
  row_norm = (Float)1.0 / sqrt(row_norm);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(0,c) *= row_norm;

  // subtract from row 1 its projection onto row 0
  complex<Float> proj((Float)0.0, (Float)0.0);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) proj += conj(U(0,c)) * U(1,c);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(1,c) -= proj * U(0,c);

  // normalize the second row
  row_norm = 0.0;
#pragma unroll
  for ( int c = 0; c < 3; c++ ) row_norm += norm(U(1,c));
  row_norm = (Float)1.0 / sqrt(row_norm);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(1, c) *= row_norm;

  // reconstruct the last row: row2 = conj(row0 x row1)
  U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
  U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
  U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
}
229 
230 
231 
232 
233 
234 
/**
 * Generate a random SU(2) element uniformly over the group (a point on S^3).
 * The four real parameters of U = a0*I + i*(a1,a2,a3).sigma are packed into a
 * 2x2 real matrix: a(0,0)=a0, a(0,1)=a1, a(1,0)=a2, a(1,1)=a3
 * (this packing matches what mul_block_sun consumes — TODO confirm against callers).
 * @param localState per-thread CURAND state, advanced in place.
 */
template <class T>
__device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
  Matrix<T,2> a;
  T aabs, ctheta, stheta, phi;
  a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
  // remaining components lie on a sphere of radius sqrt(1 - a0^2);
  // literals cast to T so single-precision instantiations avoid double promotion
  aabs = sqrt( (T)1.0 - a(0,0) * a(0,0));
  ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
  phi = PII * Random<T>(localState);
  // one RNG bit picks the sign of sin(theta)
  stheta = ( curand(&localState) & 1 ? 1 : -1 ) * sqrt( (T)1.0 - ctheta * ctheta );
  a(0,1) = aabs * stheta * cos( phi );
  a(1,0) = aabs * stheta * sin( phi );
  a(1,1) = aabs * ctheta;
  return a;
}
254 
255 
/**
 * Left-multiply rows id.x and id.y of an SU(N) link by an SU(2) element.
 * The SU(2) element is stored as four reals in u, interpreted here as the
 * 2x2 complex matrix [[ u00+i*u11,  u10+i*u01 ],
 *                     [ -u10+i*u01, u00-i*u11 ]].
 * @param u    SU(2) parameters (now passed by const reference — the original
 *             copied the 2x2 matrix by value on every call).
 * @param link SU(N) matrix updated in place.
 * @param id   the pair of row indices forming the SU(2) subgroup block.
 */
template <class T, int NCOLORS>
__host__ __device__ static inline void mul_block_sun( const Matrix<T,2> &u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
  for ( int j = 0; j < NCOLORS; j++ ) {
    // save the new row id.x before overwriting the inputs it depends on
    complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
    link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
    link(id.x, j) = tmp;
  }
}
270 
271 
/**
 * Map a flat SU(2)-subgroup index to its row/column pair in an SU(N) matrix.
 * Blocks are enumerated by increasing row distance d = 1..NCOLORS-1, and for
 * each d by row i = 0..NCOLORS-d-1, yielding the pair (i, i+d).
 * @param block flat block index in [0, NCOLORS*(NCOLORS-1)/2)
 * @return id with id.x = row, id.y = column (id.x < id.y)
 */
template<int NCOLORS>
__host__ __device__ static inline int2 IndexBlock(int block){
  int2 id;
  int row = 0;
  int dist = 0;     // distance between the two selected rows
  bool located = false;
  int flat = -1;    // running flat index over enumerated pairs
  while ( dist < (NCOLORS - 1) && !located ) {
    dist++;
    for ( row = 0; row < (NCOLORS - dist); row++ ) {
      flat++;
      if ( flat == block ) {
        located = true;
        break;
      }
    }
  }
  id.x = row;
  id.y = row + dist;
  return id;
}
298 
/**
 * Produce a random SU(N) matrix: fill every entry with uniform random real and
 * imaginary parts in [-0.5, 0.5), then project onto SU(N) via reunit_link.
 * @param localState per-thread CURAND state, advanced in place.
 */
template <class Float, int NCOLORS>
__device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState ){
  Matrix<complex<Float>,NCOLORS> U;

  for ( int i = 0; i < NCOLORS; i++ ) {
    for ( int j = 0; j < NCOLORS; j++ ) {
      U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
    }
  }
  reunit_link<Float>(U);
  return U;

  // Alternative construction by multiplying random SU(2) subgroup blocks:
  /*setIdentity(&U);
  for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
    Matrix<Float,2> rr = randomSU2<Float>(localState);
    int2 id = IndexBlock<NCOLORS>( block );
    mul_block_sun<Float, NCOLORS>(rr, U, id);
  }
  return U;*/
}
323 
/**
 * Kernel for a hot start: fill all four links of every site with independent
 * random SU(N) matrices. One thread handles one checkerboard site of BOTH
 * parities, so arg.threads equals half the local volume (one RNG state each).
 * In the MULTI_GPU build the thread indexes the interior lattice and writes
 * into the extended (halo-padded) field via linkIndex.
 */
template<typename Float, typename Gauge, int NCOLORS>
__global__ void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge> arg){
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if ( idx >= arg.threads ) return;
  #ifdef MULTI_GPU
  // X = extended dimensions (interior + 2*border); x = per-site coordinates
  int X[4], x[4];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
  // keep the original interior index in id; idx is recomputed per parity below
  int id = idx;
  cuRNGState localState = arg.rngstate.State()[ id ];
  #else
  cuRNGState localState = arg.rngstate.State()[ idx ];
  #endif
  for ( int parity = 0; parity < 2; parity++ ) {
  #ifdef MULTI_GPU
    // interior coords for this parity, shifted into the extended lattice
    getCoords(x, id, arg.X, parity);
    for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
    idx = linkIndex(x,X);
  #endif
    for ( int d = 0; d < 4; d++ ) {
      Matrix<complex<Float>,NCOLORS> U;
      U = randomize<Float, NCOLORS>(localState);
      arg.dataOr.save((Float*)(U.data),idx, d, parity);
    }
  }
  // write the advanced RNG state back so successive calls continue the stream
  #ifdef MULTI_GPU
  arg.rngstate.State()[ id ] = localState;
  #else
  arg.rngstate.State()[ idx ] = localState;
  #endif
}
355 
356 
357 
358 
/**
 * Tunable wrapper that launches the hot-start kernel. The RNG state is backed
 * up before tuning and restored afterwards so autotuning sweeps do not advance
 * the random-number stream.
 */
template<typename Float, typename Gauge, int NCOLORS>
class InitGaugeHot : Tunable {
  InitGaugeHotArg<Gauge> arg;
  mutable char aux_string[128]; // used as a label in the autotuner
  private:
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
  bool tuneSharedBytes() const { return false; } // Don't tune shared memory
  bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
  unsigned int minThreads() const { return arg.threads; }

  public:
  InitGaugeHot(InitGaugeHotArg<Gauge> &arg) : arg(arg) { }
  ~InitGaugeHot () { }

  /** Launch the kernel with autotuned launch parameters. */
  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    compute_InitGauge_HotStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (arg);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x";
    vol << arg.X[1] << "x";
    vol << arg.X[2] << "x";
    vol << arg.X[3];
    // format fixed: was "prec=%lud", which printed a spurious 'd' after the
    // size and diverged from the cold-start class's label
    sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  // preserve the RNG stream across autotuning trial launches
  void preTune(){ arg.rngstate.backup(); }
  void postTune(){ arg.rngstate.restore(); }

  // Flop/byte counts are not tracked for this initialization kernel.
  long long flops() const { return 0; }
  long long bytes() const { return 0; }
};
414 
415 
416 
417 
418 
/**
 * Run the hot-start kernel and then fill the halo region by exchanging the
 * extended ghost links with neighboring processes.
 */
template<typename Float, int NCOLORS, typename Gauge>
void InitGaugeField( Gauge dataOr, cudaGaugeField& data, RNG &rngstate) {
  InitGaugeHotArg<Gauge> initarg(dataOr, data, rngstate);
  InitGaugeHot<Float, Gauge, NCOLORS> init(initarg);
  init.apply(0);
  checkCudaError();
  // ensure the kernel has finished writing the interior links before the
  // border exchange reads them (this sync was dropped by the doc extractor —
  // the file's symbol index still references qudaDeviceSynchronize)
  qudaDeviceSynchronize();

  data.exchangeExtendedGhost(data.R(),false);
}
429 
430 
431 
/** Dispatch the hot start on the field's reconstruction type (SU(3) only). */
template<typename Float>
void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
  if ( !data.isNative() ) errorQuda("Invalid Gauge Order\n");

  switch ( data.Reconstruct() ) {
  case QUDA_RECONSTRUCT_NO: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  case QUDA_RECONSTRUCT_12: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  case QUDA_RECONSTRUCT_8: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  default:
    errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
  }
}
452 #endif // GPU_GAUGE_ALG
453 
/** Hot-start entry point: dispatch on the field precision. */
void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
#ifdef GPU_GAUGE_ALG
  switch ( data.Precision() ) {
  case QUDA_SINGLE_PRECISION: InitGaugeField<float> (data, rngstate); break;
  case QUDA_DOUBLE_PRECISION: InitGaugeField<double>(data, rngstate); break;
  default: errorQuda("Precision %d not supported", data.Precision());
  }
#else
  errorQuda("Pure gauge code has not been built");
#endif
}
472 }
dim3 dim3 blockDim
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void init()
Definition: blas_quda.cu:64
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define PII
Definition: pgauge_init.cu:19
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40
def id
projector matrices ######################################################################## ...
for(int s=0;s< param.dc.Ls;s++)
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void InitGaugeField(cudaGaugeField &data)
Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-G...
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)