QUDA  0.9.0
pgauge_init.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <unitarization_links.h>
9 #include <pgauge_monte.h>
10 #include <random_quda.h>
11 #include <cub/cub.cuh>
12 #include <index_helper.cuh>
13 
14 
15 #ifndef PI
16 #define PI 3.1415926535897932384626433832795 // pi
17 #endif
18 #ifndef PII
19 #define PII 6.2831853071795864769252867665590 // 2 * pi
20 #endif
21 
22 namespace quda {
23 
24 #ifdef GPU_GAUGE_ALG
25 
template <typename Gauge>
struct InitGaugeColdArg {
  int threads; // total number of lattice sites to initialize
  int X[4];    // local lattice dimensions
  Gauge dataOr;
  // Capture the lattice dimensions of data and accumulate the full volume.
  InitGaugeColdArg(const Gauge &dataOr, const cudaGaugeField &data)
    : dataOr(dataOr) {
    threads = 1;
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir];
      threads *= X[dir];
    }
  }
};
37 
38 
39 
40 
/**
 * Kernel for a cold start: set every link to the NCOLORS x NCOLORS identity
 * matrix. One thread per lattice site; the first half of the flat index range
 * covers even-parity sites and the second half odd-parity sites.
 */
template<typename Float, typename Gauge, int NCOLORS>
__global__ void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge> arg){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if ( id >= arg.threads ) return;
  const int half_volume = arg.threads / 2;
  const int parity = (id < half_volume) ? 0 : 1;
  const int x_cb = id - parity * half_volume; // checkerboard site index
  Matrix<complex<Float>,NCOLORS> U;
  setIdentity(&U);
  for ( int d = 0; d < 4; d++ )
    arg.dataOr.save((Float*)(U.data), x_cb, d, parity);
}
55 
56 
/**
 * Tunable wrapper that launches the cold-start kernel.
 */
template<typename Float, typename Gauge, int NCOLORS>
class InitGaugeCold : Tunable {
  InitGaugeColdArg<Gauge> arg;
  mutable char aux_string[128]; // label used by the autotuner
  private:
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
  bool tuneGridDim() const { return false; } // grid dims derive from minThreads()
  unsigned int minThreads() const { return arg.threads; }

  public:
  InitGaugeCold(InitGaugeColdArg<Gauge> &arg) : arg(arg) { }
  ~InitGaugeCold () { }

  /** Launch the kernel with autotuned launch parameters. */
  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    compute_InitGauge_ColdStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (arg);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
    sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  // Flop/byte counts are not tracked for this initialization kernel.
  long long flops() const { return 0; }
  long long bytes() const { return 0; }
};
108 
/** Build the cold-start argument struct and run the identity-fill kernel. */
template<typename Float, int NCOLORS, typename Gauge>
void InitGaugeField( Gauge dataOr, cudaGaugeField& data) {
  InitGaugeColdArg<Gauge> coldarg(dataOr, data);
  InitGaugeCold<Float, Gauge, NCOLORS> cold(coldarg);
  cold.apply(0);
  checkCudaError();
}
116 
117 
118 
/** Dispatch the cold start on the field's reconstruction type (SU(3) only). */
template<typename Float>
void InitGaugeField( cudaGaugeField& data) {
  if ( !data.isNative() ) errorQuda("Invalid Gauge Order\n");

  switch ( data.Reconstruct() ) {
  case QUDA_RECONSTRUCT_NO: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  case QUDA_RECONSTRUCT_12: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  case QUDA_RECONSTRUCT_8: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data);
    break;
  }
  default:
    errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
  }
}
140 
/** Cold-start entry point: dispatch on the field precision. */
void InitGaugeField( cudaGaugeField& data) {
  switch ( data.Precision() ) {
  case QUDA_SINGLE_PRECISION: InitGaugeField<float> (data); break;
  case QUDA_DOUBLE_PRECISION: InitGaugeField<double>(data); break;
  default: errorQuda("Precision %d not supported", data.Precision());
  }
}
156 
157 
158 
159 
160 
161 
162 
163 
template <typename Gauge>
struct InitGaugeHotArg {
  int threads; // number of active threads required (half the local volume)
  int X[4];    // grid dimensions (interior only when MULTI_GPU)
  RNG rngstate; // CURAND state array, one state per thread
#ifdef MULTI_GPU
  int border[4]; // halo depth in each direction, taken from data.R()
#endif
  Gauge dataOr;
  // Derive the interior lattice dimensions from the (possibly extended) field.
  InitGaugeHotArg(const Gauge &dataOr, const cudaGaugeField &data, RNG &rngstate)
    : dataOr(dataOr), rngstate(rngstate) {
#ifdef MULTI_GPU
    // strip the halo from each dimension: interior = full - 2 * border
    for ( int dir = 0; dir < 4; ++dir ) {
      border[dir] = data.R()[dir];
      X[dir] = data.X()[dir] - border[dir] * 2;
    }
#else
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
#endif
    //the optimal number of RNG states in rngstate array must be equal to half the lattice volume
    //this number is the same used in heatbath...
    threads = X[0] * X[1] * X[2] * X[3] >> 1;
  }
};
188 
189 
/**
 * Project a 3x3 complex matrix onto SU(3) by Gram-Schmidt:
 * normalize row 0, orthogonalize row 1 against row 0 and normalize it,
 * then rebuild the last row as the conjugate cross product of rows 0 and 1.
 * Approximate cost: 130 flops (14+6+24+24+14+6+42).
 */
template <typename Float>
__host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U ){

  // normalize the first row: divide by its 2-norm
  Float row_norm = 0.0;
#pragma unroll
  for ( int c = 0; c < 3; c++ ) row_norm += norm(U(0,c));
  row_norm = (Float)1.0 / sqrt(row_norm);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(0,c) *= row_norm;

  // subtract from row 1 its projection onto row 0
  complex<Float> proj((Float)0.0, (Float)0.0);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) proj += conj(U(0,c)) * U(1,c);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(1,c) -= proj * U(0,c);

  // normalize the second row
  row_norm = 0.0;
#pragma unroll
  for ( int c = 0; c < 3; c++ ) row_norm += norm(U(1,c));
  row_norm = (Float)1.0 / sqrt(row_norm);
#pragma unroll
  for ( int c = 0; c < 3; c++ ) U(1, c) *= row_norm;

  // reconstruct the last row: row2 = conj(row0 x row1)
  U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
  U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
  U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
}
229 
230 
231 
232 
233 
234 
/**
 * Generate a random SU(2) element uniformly over the group (a point on S^3).
 * The four real parameters of U = a0*I + i*(a1,a2,a3).sigma are packed into a
 * 2x2 real matrix: a(0,0)=a0, a(0,1)=a1, a(1,0)=a2, a(1,1)=a3
 * (this packing matches what mul_block_sun consumes — TODO confirm against callers).
 * @param localState per-thread CURAND state, advanced in place.
 */
template <class T>
__device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
  Matrix<T,2> a;
  T aabs, ctheta, stheta, phi;
  a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
  // remaining components lie on a sphere of radius sqrt(1 - a0^2);
  // literals cast to T so single-precision instantiations avoid double promotion
  aabs = sqrt( (T)1.0 - a(0,0) * a(0,0));
  ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
  phi = PII * Random<T>(localState);
  // one RNG bit picks the sign of sin(theta)
  stheta = ( curand(&localState) & 1 ? 1 : -1 ) * sqrt( (T)1.0 - ctheta * ctheta );
  a(0,1) = aabs * stheta * cos( phi );
  a(1,0) = aabs * stheta * sin( phi );
  a(1,1) = aabs * ctheta;
  return a;
}
254 
255 
/**
 * Left-multiply rows id.x and id.y of an SU(N) link by an SU(2) element.
 * The SU(2) element is stored as four reals in u, interpreted here as the
 * 2x2 complex matrix [[ u00+i*u11,  u10+i*u01 ],
 *                     [ -u10+i*u01, u00-i*u11 ]].
 * @param u    SU(2) parameters (now passed by const reference — the original
 *             copied the 2x2 matrix by value on every call).
 * @param link SU(N) matrix updated in place.
 * @param id   the pair of row indices forming the SU(2) subgroup block.
 */
template <class T, int NCOLORS>
__host__ __device__ static inline void mul_block_sun( const Matrix<T,2> &u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
  for ( int j = 0; j < NCOLORS; j++ ) {
    // save the new row id.x before overwriting the inputs it depends on
    complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
    link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
    link(id.x, j) = tmp;
  }
}
270 
271 
/**
 * Map a flat SU(2)-subgroup index to its row/column pair in an SU(N) matrix.
 * Blocks are enumerated by increasing row distance d = 1..NCOLORS-1, and for
 * each d by row i = 0..NCOLORS-d-1, yielding the pair (i, i+d).
 * @param block flat block index in [0, NCOLORS*(NCOLORS-1)/2)
 * @return id with id.x = row, id.y = column (id.x < id.y)
 */
template<int NCOLORS>
__host__ __device__ static inline int2 IndexBlock(int block){
  int2 id;
  int row = 0;
  int dist = 0;     // distance between the two selected rows
  bool located = false;
  int flat = -1;    // running flat index over enumerated pairs
  while ( dist < (NCOLORS - 1) && !located ) {
    dist++;
    for ( row = 0; row < (NCOLORS - dist); row++ ) {
      flat++;
      if ( flat == block ) {
        located = true;
        break;
      }
    }
  }
  id.x = row;
  id.y = row + dist;
  return id;
}
298 
/**
 * Produce a random SU(N) matrix: fill every entry with uniform random real and
 * imaginary parts in [-0.5, 0.5), then project onto SU(N) via reunit_link.
 * @param localState per-thread CURAND state, advanced in place.
 */
template <class Float, int NCOLORS>
__device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState ){
  Matrix<complex<Float>,NCOLORS> U;

  for ( int i = 0; i < NCOLORS; i++ ) {
    for ( int j = 0; j < NCOLORS; j++ ) {
      U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
    }
  }
  reunit_link<Float>(U);
  return U;

  // Alternative construction by multiplying random SU(2) subgroup blocks:
  /*setIdentity(&U);
  for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
    Matrix<Float,2> rr = randomSU2<Float>(localState);
    int2 id = IndexBlock<NCOLORS>( block );
    mul_block_sun<Float, NCOLORS>(rr, U, id);
  }
  return U;*/
}
323 
/**
 * Kernel for a hot start: fill all four links of every site with independent
 * random SU(N) matrices. One thread handles one checkerboard site of BOTH
 * parities, so arg.threads equals half the local volume (one RNG state each).
 * In the MULTI_GPU build the thread indexes the interior lattice and writes
 * into the extended (halo-padded) field via linkIndex.
 */
template<typename Float, typename Gauge, int NCOLORS>
__global__ void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge> arg){
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if ( idx >= arg.threads ) return;
  #ifdef MULTI_GPU
  // X = extended dimensions (interior + 2*border); x = per-site coordinates
  int X[4], x[4];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
  // keep the original interior index in id; idx is recomputed per parity below
  int id = idx;
  cuRNGState localState = arg.rngstate.State()[ id ];
  #else
  cuRNGState localState = arg.rngstate.State()[ idx ];
  #endif
  for ( int parity = 0; parity < 2; parity++ ) {
  #ifdef MULTI_GPU
    // interior coords for this parity, shifted into the extended lattice
    getCoords(x, id, arg.X, parity);
    for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
    idx = linkIndex(x,X);
  #endif
    for ( int d = 0; d < 4; d++ ) {
      Matrix<complex<Float>,NCOLORS> U;
      U = randomize<Float, NCOLORS>(localState);
      arg.dataOr.save((Float*)(U.data),idx, d, parity);
    }
  }
  // write the advanced RNG state back so successive calls continue the stream
  #ifdef MULTI_GPU
  arg.rngstate.State()[ id ] = localState;
  #else
  arg.rngstate.State()[ idx ] = localState;
  #endif
}
355 
356 
357 
358 
/**
 * Tunable wrapper that launches the hot-start kernel. The RNG state is backed
 * up before tuning and restored afterwards so autotuning sweeps do not advance
 * the random-number stream.
 */
template<typename Float, typename Gauge, int NCOLORS>
class InitGaugeHot : Tunable {
  InitGaugeHotArg<Gauge> arg;
  mutable char aux_string[128]; // used as a label in the autotuner
  private:
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
  bool tuneSharedBytes() const { return false; } // Don't tune shared memory
  bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
  unsigned int minThreads() const { return arg.threads; }

  public:
  InitGaugeHot(InitGaugeHotArg<Gauge> &arg) : arg(arg) { }
  ~InitGaugeHot () { }

  /** Launch the kernel with autotuned launch parameters. */
  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    compute_InitGauge_HotStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (arg);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x";
    vol << arg.X[1] << "x";
    vol << arg.X[2] << "x";
    vol << arg.X[3];
    // format fixed: was "prec=%lud", which printed a spurious 'd' after the
    // size and diverged from the cold-start class's label
    sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  // preserve the RNG stream across autotuning trial launches
  void preTune(){ arg.rngstate.backup(); }
  void postTune(){ arg.rngstate.restore(); }

  // Flop/byte counts are not tracked for this initialization kernel.
  long long flops() const { return 0; }
  long long bytes() const { return 0; }
};
414 
415 
416 
417 
418 
/**
 * Run the hot-start kernel and then fill the halo region by exchanging the
 * extended ghost links with neighboring processes.
 */
template<typename Float, int NCOLORS, typename Gauge>
void InitGaugeField( Gauge dataOr, cudaGaugeField& data, RNG &rngstate) {
  InitGaugeHotArg<Gauge> initarg(dataOr, data, rngstate);
  InitGaugeHot<Float, Gauge, NCOLORS> init(initarg);
  init.apply(0);
  checkCudaError();
  // ensure the kernel has finished writing the interior links before the
  // border exchange reads them (this sync was dropped by the doc extractor —
  // the file's symbol index still references qudaDeviceSynchronize)
  qudaDeviceSynchronize();

  data.exchangeExtendedGhost(data.R(),false);
}
429 
430 
431 
/** Dispatch the hot start on the field's reconstruction type (SU(3) only). */
template<typename Float>
void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
  if ( !data.isNative() ) errorQuda("Invalid Gauge Order\n");

  switch ( data.Reconstruct() ) {
  case QUDA_RECONSTRUCT_NO: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  case QUDA_RECONSTRUCT_12: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  case QUDA_RECONSTRUCT_8: {
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
    InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
    break;
  }
  default:
    errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
  }
}
452 #endif // GPU_GAUGE_ALG
453 
/** Hot-start entry point: dispatch on the field precision. */
void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
#ifdef GPU_GAUGE_ALG
  switch ( data.Precision() ) {
  case QUDA_SINGLE_PRECISION: InitGaugeField<float> (data, rngstate); break;
  case QUDA_DOUBLE_PRECISION: InitGaugeField<double>(data, rngstate); break;
  default: errorQuda("Precision %d not supported", data.Precision());
  }
#else
  errorQuda("Pure gauge code has not been built");
#endif
}
472 }
dim3 dim3 blockDim
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void init()
Definition: blas_quda.cu:64
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define PII
Definition: pgauge_init.cu:19
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40
def id
projector matrices ######################################################################## ...
for(int s=0;s< param.dc.Ls;s++)
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void InitGaugeField(cudaGaugeField &data)
Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-G...
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)