pgauge_init.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <launch_kernel.cuh>
#include <comm_quda.h>
#include <unitarization_links.h>
#include <pgauge_monte.h>
#include <random_quda.h>
#include <cub_helper.cuh>
#include <index_helper.cuh>


#ifndef PI
#define PI 3.1415926535897932384626433832795 // pi
#endif
#ifndef PII
#define PII 6.2831853071795864769252867665590 // 2 * pi
#endif

namespace quda {

#ifdef GPU_GAUGE_ALG

  template <typename Gauge>
  struct InitGaugeColdArg {
    int threads; // number of active threads required
    int X[4]; // grid dimensions
    Gauge dataOr;
    InitGaugeColdArg(const Gauge &dataOr, const cudaGaugeField &data)
      : dataOr(dataOr) {
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
      threads = X[0] * X[1] * X[2] * X[3];
    }
  };

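  /**
   * @brief Kernel to set every link of the gauge field to the SU(3) identity.
   * The thread index runs over the full lattice volume: the first half of the
   * threads handles even-parity sites, the second half odd-parity sites.
   */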
  template<typename Float, typename Gauge, int NCOLORS>
  __global__ void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge> arg){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx >= arg.threads ) return;
    int parity = 0;
    if ( idx >= arg.threads / 2 ) {
      parity = 1;
      idx -= arg.threads / 2;
    }
    Matrix<complex<Float>,NCOLORS> U;
    setIdentity(&U);
    for ( int d = 0; d < 4; d++ ) arg.dataOr(d, idx, parity) = U;
  }


  template<typename Float, typename Gauge, int NCOLORS>
  class InitGaugeCold : Tunable {
    InitGaugeColdArg<Gauge> arg;
    mutable char aux_string[128]; // used as a label in the autotuner
    private:
    unsigned int sharedBytesPerThread() const {
      return 0;
    }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const {
      return 0;
    }
    //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
    bool tuneGridDim() const {
      return false;
    } // Don't tune the grid dimensions.
    unsigned int minThreads() const {
      return arg.threads;
    }

    public:
    InitGaugeCold(InitGaugeColdArg<Gauge> &arg)
      : arg(arg) {
    }
    ~InitGaugeCold () {
    }

    void apply(const cudaStream_t &stream){
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      compute_InitGauge_ColdStart<Float, Gauge, NCOLORS> <<< tp.grid,tp.block >>> (arg);
      //cudaDeviceSynchronize();
    }

    TuneKey tuneKey() const {
      std::stringstream vol;
      vol << arg.X[0] << "x";
      vol << arg.X[1] << "x";
      vol << arg.X[2] << "x";
      vol << arg.X[3];
      sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
      return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

    }

    long long flops() const {
      return 0;
    } // Only correct if there is no link reconstruction; the cub reduction is not accounted for either
    long long bytes() const {
      return 0;
    } // the reduction is not accounted for

  };

  template<typename Float, int NCOLORS, typename Gauge>
  void InitGaugeField( Gauge dataOr, cudaGaugeField& data) {
    InitGaugeColdArg<Gauge> initarg(dataOr, data);
    InitGaugeCold<Float, Gauge, NCOLORS> init(initarg);
    init.apply(0);
    checkCudaError();
  }



  template<typename Float>
  void InitGaugeField( cudaGaugeField& data) {

    if ( data.isNative() ) {
      if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data);
      } else {
        errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
      }
    } else {
      errorQuda("Invalid Gauge Order\n");
    }

  }

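  /**
   * @brief Perform a cold start to the gauge field: every link is set to the
   * SU(3) identity matrix. Since the kernel runs over the full allocated
   * volume, the ghost/border links of an extended field are filled as well in
   * the multi-GPU case.
   */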
  void InitGaugeField( cudaGaugeField& data) {

    if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
      InitGaugeField<float> (data);
    } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
      InitGaugeField<double>(data);
    } else {
      errorQuda("Precision %d not supported", data.Precision());
    }

  }


  template <typename Gauge>
  struct InitGaugeHotArg {
    int threads; // number of active threads required
    int X[4]; // grid dimensions
    RNG rngstate;
#ifdef MULTI_GPU
    int border[4];
#endif
    Gauge dataOr;
    InitGaugeHotArg(const Gauge &dataOr, const cudaGaugeField &data, RNG &rngstate)
      : dataOr(dataOr), rngstate(rngstate) {
#ifdef MULTI_GPU
      for ( int dir = 0; dir < 4; ++dir ) {
        border[dir] = data.R()[dir];
        X[dir] = data.X()[dir] - border[dir] * 2;
      }
#else
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
#endif
      // the optimal number of RNG states in the rngstate array must be equal to
      // half the lattice volume; this is the same number used in the heatbath
      threads = X[0] * X[1] * X[2] * X[3] >> 1;
    }
  };

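  /**
   * @brief Project a 3x3 complex matrix back onto SU(3) via Gram-Schmidt:
   * normalize the first row, orthogonalize and normalize the second row
   * against it, and rebuild the third row as the conjugated cross product of
   * the first two. The trailing numeric comments count the floating-point
   * operations of each step (130 in total).
   */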
  template <typename Float>
  __host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U ){

    complex<Float> t2((Float)0.0, (Float)0.0);
    Float t1 = 0.0;
    //first normalize first row
    //sum of squares of row
#pragma unroll
    for ( int c = 0; c < 3; c++ ) t1 += norm(U(0,c));
    t1 = (Float)1.0 / sqrt(t1);
    //14
    //used to normalize row
#pragma unroll
    for ( int c = 0; c < 3; c++ ) U(0,c) *= t1;
    //6
#pragma unroll
    for ( int c = 0; c < 3; c++ ) t2 += conj(U(0,c)) * U(1,c);
    //24
#pragma unroll
    for ( int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
    //24
    //normalize second row
    //sum of squares of row
    t1 = 0.0;
#pragma unroll
    for ( int c = 0; c < 3; c++ ) t1 += norm(U(1,c));
    t1 = (Float)1.0 / sqrt(t1);
    //14
    //used to normalize row
#pragma unroll
    for ( int c = 0; c < 3; c++ ) U(1, c) *= t1;
    //6
    //Reconstruct last row
    U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
    U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
    U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
    //42
    //T=130
  }
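  /**
   * @brief Generate a random SU(2) matrix parametrized by four real numbers
   * (a0, a1, a2, a3) with a0^2 + a1^2 + a2^2 + a3^2 = 1, stored in a 2x2 real
   * matrix as a(0,0) = a0, a(0,1) = a1, a(1,0) = a2, a(1,1) = a3.
   * @param localState CURAND RNG state
   * @return the four real numbers parametrizing the SU(2) matrix
   */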
  template <class T>
  __device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
    Matrix<T,2> a;
    T aabs, ctheta, stheta, phi;
    a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
    aabs = sqrt( 1.0 - a(0,0) * a(0,0));
    ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
    phi = PII * Random<T>(localState);
    stheta = ( curand(&localState) & 1 ? 1 : -1 ) * sqrt( (T)1.0 - ctheta * ctheta );
    a(0,1) = aabs * stheta * cos( phi );
    a(1,0) = aabs * stheta * sin( phi );
    a(1,1) = aabs * ctheta;
    return a;
  }

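  /**
   * @brief Update the SU(Nc) link with an SU(2) rotation acting on rows id.x
   * and id.y: link <- u * link, where u holds the four real SU(2) parameters
   * (a0, a1, a2, a3) and acts as the complex 2x2 matrix
   * [[a0 + i*a3, a2 + i*a1], [-a2 + i*a1, a0 - i*a3]].
   * @param u SU(2) matrix represented by four real numbers
   * @param link SU(Nc) matrix to be updated
   * @param id the two row indices acted on
   */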
  template <class T, int NCOLORS>
  __host__ __device__ static inline void mul_block_sun( Matrix<T,2> u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
    for ( int j = 0; j < NCOLORS; j++ ) {
      complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
      link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
      link(id.x, j) = tmp;
    }
  }

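  /**
   * @brief Calculate the two row/column indices of an SU(2) subgroup block
   * inside the SU(Nc) matrix.
   * @param block block number, 0 <= block < NCOLORS*(NCOLORS-1)/2
   * @return the two indices in an int2, accessed as .x and .y
   * For SU(3): block 0 -> (0,1), block 1 -> (1,2), block 2 -> (0,2).
   */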
  template<int NCOLORS>
  __host__ __device__ static inline int2 IndexBlock(int block){
    int2 id;
    int i1;
    int found = 0;
    int del_i = 0;
    int index = -1;
    while ( del_i < (NCOLORS - 1) && found == 0 ) {
      del_i++;
      for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
        index++;
        if ( index == block ) {
          found = 1;
          break;
        }
      }
    }
    id.y = i1 + del_i;
    id.x = i1;
    return id;
  }

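  /**
   * @brief Generate a random SU(Nc) matrix: fill the matrix with uniformly
   * distributed complex entries and project it back onto the group with
   * reunit_link().
   * @param localState CURAND RNG state
   * @return random SU(Nc) matrix
   */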
  template <class Float, int NCOLORS>
  __device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState ){
    Matrix<complex<Float>,NCOLORS> U;

    for ( int i = 0; i < NCOLORS; i++ )
      for ( int j = 0; j < NCOLORS; j++ )
        U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
    reunit_link<Float>(U);
    return U;

    /*setIdentity(&U);
    for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
      Matrix<Float,2> rr = randomSU2<Float>(localState);
      int2 id = IndexBlock<NCOLORS>( block );
      mul_block_sun<Float, NCOLORS>(rr, U, id);
      //U = block_su2_to_su3<Float>( U, a00, a01, a10, a11, block );
    }
    return U;*/
  }

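  /**
   * @brief Kernel for the hot start: one thread per pair of even/odd sites.
   * Each thread loads its CURAND state, loops over both parities and all four
   * directions, and writes a fresh random SU(Nc) link. In multi-GPU builds the
   * interior coordinates are shifted by the border so the links are stored at
   * the correct offsets of the extended field.
   */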
  template<typename Float, typename Gauge, int NCOLORS>
  __global__ void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge> arg){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx >= arg.threads ) return;
  #ifdef MULTI_GPU
    int X[4], x[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
    int id = idx;
    cuRNGState localState = arg.rngstate.State()[ id ];
  #else
    cuRNGState localState = arg.rngstate.State()[ idx ];
  #endif
    for ( int parity = 0; parity < 2; parity++ ) {
  #ifdef MULTI_GPU
      getCoords(x, id, arg.X, parity);
      for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
      idx = linkIndex(x,X);
  #endif
      for ( int d = 0; d < 4; d++ ) {
        Matrix<complex<Float>,NCOLORS> U;
        U = randomize<Float, NCOLORS>(localState);
        arg.dataOr(d, idx, parity) = U;
      }
    }
  #ifdef MULTI_GPU
    arg.rngstate.State()[ id ] = localState;
  #else
    arg.rngstate.State()[ idx ] = localState;
  #endif
  }


  template<typename Float, typename Gauge, int NCOLORS>
  class InitGaugeHot : Tunable {
    InitGaugeHotArg<Gauge> arg;
    mutable char aux_string[128]; // used as a label in the autotuner
    private:
    unsigned int sharedBytesPerThread() const {
      return 0;
    }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const {
      return 0;
    }
    bool tuneSharedBytes() const {
      return false;
    } // Don't tune shared memory
    bool tuneGridDim() const {
      return false;
    } // Don't tune the grid dimensions.
    unsigned int minThreads() const {
      return arg.threads;
    }

    public:
    InitGaugeHot(InitGaugeHotArg<Gauge> &arg)
      : arg(arg) {
    }
    ~InitGaugeHot () {
    }

    void apply(const cudaStream_t &stream){
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      compute_InitGauge_HotStart<Float, Gauge, NCOLORS> <<< tp.grid,tp.block >>> (arg);
      //cudaDeviceSynchronize();
    }

    TuneKey tuneKey() const {
      std::stringstream vol;
      vol << arg.X[0] << "x";
      vol << arg.X[1] << "x";
      vol << arg.X[2] << "x";
      vol << arg.X[3];
      sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
      return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

    }

    void preTune(){ arg.rngstate.backup(); }
    void postTune(){ arg.rngstate.restore(); }
    long long flops() const {
      return 0;
    } // Only correct if there is no link reconstruction; the cub reduction is not accounted for either
    long long bytes() const {
      return 0;
    } // the reduction is not accounted for

  };


  template<typename Float, int NCOLORS, typename Gauge>
  void InitGaugeField( Gauge dataOr, cudaGaugeField& data, RNG &rngstate) {
    InitGaugeHotArg<Gauge> initarg(dataOr, data, rngstate);
    InitGaugeHot<Float, Gauge, NCOLORS> init(initarg);
    init.apply(0);
    checkCudaError();
    qudaDeviceSynchronize();

    data.exchangeExtendedGhost(data.R(),false);
  }

  template<typename Float>
  void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {

    if ( data.isNative() ) {
      if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
        typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
        InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
      } else {
        errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
      }
    } else {
      errorQuda("Invalid Gauge Order\n");
    }
  }
#endif // GPU_GAUGE_ALG

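  /**
   * @brief Perform a hot start to the gauge field: every link is set to a
   * random SU(3) matrix, and in the multi-GPU case the extended border links
   * are refreshed via exchangeExtendedGhost().
   */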
  void InitGaugeField( cudaGaugeField& data, RNG &rngstate) {
#ifdef GPU_GAUGE_ALG
    if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
      InitGaugeField<float> (data, rngstate);
    } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
      InitGaugeField<double>(data, rngstate);
    } else {
      errorQuda("Precision %d not supported", data.Precision());
    }
#else
    errorQuda("Pure gauge code has not been built");
#endif
  }
}
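
// Usage sketch (not part of this file): how a caller might invoke these entry
// points. The gauge field `gauge` and the seeded RNG `rng` are assumed to have
// been constructed elsewhere (see gauge_field.h and random_quda.h).
//
//   quda::InitGaugeField(gauge);       // cold start: all links set to the SU(3) identity
//   quda::InitGaugeField(gauge, rng);  // hot start: random SU(3) links, reunitarized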