QUDA  v1.1.0
A library for QCD on GPUs
pgauge_init.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <unitarization_links.h>
9 #include <pgauge_monte.h>
10 #include <random_quda.h>
11 #include <index_helper.cuh>
12 #include <instantiate.h>
13 
14 #ifndef PI
15 #define PI 3.1415926535897932384626433832795 // pi
16 #endif
17 #ifndef PII
18 #define PII 6.2831853071795864769252867665590 // 2 * pi
19 #endif
20 
21 namespace quda {
22 
23  template <typename Float, int nColor_, QudaReconstructType recon_>
24  struct InitGaugeColdArg {
25  static constexpr int nColor = nColor_;
26  static constexpr QudaReconstructType recon = recon_;
27  using real = typename mapper<Float>::type;
28  using Gauge = typename gauge_mapper<real, recon>::type;
29  int threads; // number of active threads required
30  int X[4]; // grid dimensions
31  Gauge dataOr;
32  InitGaugeColdArg(const GaugeField &data)
33  : dataOr(data) {
34  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
35  threads = X[0] * X[1] * X[2] * X[3];
36  }
37  };
38 
39  template <typename Arg> __global__ void compute_InitGauge_ColdStart(Arg arg)
40  {
41  int idx = threadIdx.x + blockIdx.x * blockDim.x;
42  if ( idx >= arg.threads ) return;
43  int parity = 0;
44  if ( idx >= arg.threads / 2 ) {
45  parity = 1;
46  idx -= arg.threads / 2;
47  }
48  Matrix<complex<typename Arg::real>, Arg::nColor> U;
49  setIdentity(&U);
50  for ( int d = 0; d < 4; d++ ) arg.dataOr(d, idx, parity) = U;
51  }
52 
53  template <typename Float, int nColor, QudaReconstructType recon>
54  class InitGaugeCold : Tunable {
55  InitGaugeColdArg<Float, nColor, recon> arg;
56  const GaugeField &meta;
57  unsigned int sharedBytesPerThread() const { return 0; }
58  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
59  bool tuneGridDim() const { return false;}
60  unsigned int minThreads() const { return arg.threads; }
61 
62  public:
63  InitGaugeCold(GaugeField &data) :
64  arg(data),
65  meta(data)
66  {
67  apply(0);
68  }
69 
70  void apply(const qudaStream_t &stream)
71  {
72  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
73  qudaLaunchKernel(compute_InitGauge_ColdStart<decltype(arg)>, tp, stream, arg);
74  }
75 
76  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
77  long long flops() const { return 0; }
78  long long bytes() const { return meta.Bytes(); }
79  };
80 
81  template <typename Float, int nColor_, QudaReconstructType recon_>
82  struct InitGaugeHotArg {
83  static constexpr int nColor = nColor_;
84  static constexpr QudaReconstructType recon = recon_;
85  using real = typename mapper<Float>::type;
86  using Gauge = typename gauge_mapper<real, recon>::type;
87  int threads; // number of active threads required
88  int X[4]; // grid dimensions
89  RNG rngstate;
90  int border[4];
91  Gauge dataOr;
92  InitGaugeHotArg(const GaugeField &data, RNG &rngstate) :
93  dataOr(data),
94  rngstate(rngstate)
95  {
96  for ( int dir = 0; dir < 4; ++dir ) {
97  border[dir] = data.R()[dir];
98  X[dir] = data.X()[dir] - border[dir] * 2;
99  }
100 
101  //the optimal number of RNG states in rngstate array must be equal to half the lattice volume
102  //this number is the same used in heatbath...
103  threads = X[0] * X[1] * X[2] * X[3] >> 1;
104  }
105  };
106 
107  template <typename Float>
108  __host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U )
109  {
110  complex<Float> t2((Float)0.0, (Float)0.0);
111  Float t1 = 0.0;
112  //first normalize first row
113  //sum of squares of row
114 #pragma unroll
115  for ( int c = 0; c < 3; c++ ) t1 += norm(U(0,c));
116  t1 = (Float)1.0 / sqrt(t1);
117  //14
118  //used to normalize row
119 #pragma unroll
120  for ( int c = 0; c < 3; c++ ) U(0,c) *= t1;
121  //6
122 #pragma unroll
123  for ( int c = 0; c < 3; c++ ) t2 += conj(U(0,c)) * U(1,c);
124  //24
125 #pragma unroll
126  for ( int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
127  //24
128  //normalize second row
129  //sum of squares of row
130  t1 = 0.0;
131 #pragma unroll
132  for ( int c = 0; c < 3; c++ ) t1 += norm(U(1,c));
133  t1 = (Float)1.0 / sqrt(t1);
134  //14
135  //used to normalize row
136 #pragma unroll
137  for ( int c = 0; c < 3; c++ ) U(1, c) *= t1;
138  //6
139  //Reconstruct lat row
140  U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
141  U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
142  U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
143  //42
144  //T=130
145  }
146 
147  /**
148  @brief Generate the four random real elements of the SU(2) matrix
149  @param localstate CURAND rng state
150  @return four real numbers of the SU(2) matrix
151  */
152  template <class T>
153  __device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
154  Matrix<T,2> a;
155  T aabs, ctheta, stheta, phi;
156  a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
157  aabs = sqrt( 1.0 - a(0,0) * a(0,0));
158  ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
159  phi = PII * Random<T>(localState);
160  stheta = ( curand(&localState) & 1 ? 1 : -1 ) * sqrt( (T)1.0 - ctheta * ctheta );
161  a(0,1) = aabs * stheta * cos( phi );
162  a(1,0) = aabs * stheta * sin( phi );
163  a(1,1) = aabs * ctheta;
164  return a;
165  }
166 
167  /**
168  @brief Update the SU(Nc) link with the new SU(2) matrix, link <- u * link
169  @param u SU(2) matrix represented by four real numbers
170  @param link SU(Nc) matrix
171  @param id indices
172  */
173  template <class T, int NCOLORS>
174  __host__ __device__ static inline void mul_block_sun( Matrix<T,2> u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
175  for ( int j = 0; j < NCOLORS; j++ ) {
176  complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
177  link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
178  link(id.x, j) = tmp;
179  }
180  }
181 
182  /**
183  @brief Calculate the SU(2) index block in the SU(Nc) matrix
184  @param block number to calculate the index's, the total number of blocks is NCOLORS * ( NCOLORS - 1) / 2.
185  @return Returns two index's in int2 type, accessed by .x and .y.
186  */
187  template<int NCOLORS>
188  __host__ __device__ static inline int2 IndexBlock(int block){
189  int2 id;
190  int i1;
191  int found = 0;
192  int del_i = 0;
193  int index = -1;
194  while ( del_i < (NCOLORS - 1) && found == 0 ) {
195  del_i++;
196  for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
197  index++;
198  if ( index == block ) {
199  found = 1;
200  break;
201  }
202  }
203  }
204  id.y = i1 + del_i;
205  id.x = i1;
206  return id;
207  }
208 
209  /**
210  @brief Generate a SU(Nc) random matrix
211  @param localstate CURAND rng state
212  @return SU(Nc) matrix
213  */
214  template <class Float, int NCOLORS>
215  __device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState )
216  {
217  Matrix<complex<Float>,NCOLORS> U;
218 
219  for ( int i = 0; i < NCOLORS; i++ )
220  for ( int j = 0; j < NCOLORS; j++ )
221  U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
222  reunit_link<Float>(U);
223  return U;
224 
225  /*setIdentity(&U);
226  for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
227  Matrix<Float,2> rr = randomSU2<Float>(localState);
228  int2 id = IndexBlock<NCOLORS>( block );
229  mul_block_sun<Float, NCOLORS>(rr, U, id);
230  //U = block_su2_to_su3<Float>( U, a00, a01, a10, a11, block );
231  }
232  return U;*/
233  }
234 
235  template<typename Arg> __global__ void compute_InitGauge_HotStart(Arg arg)
236  {
237  int idx = threadIdx.x + blockIdx.x * blockDim.x;
238  if ( idx >= arg.threads ) return;
239  int X[4], x[4];
240  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
241  for ( int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
242  int id = idx;
243  cuRNGState localState = arg.rngstate.State()[ id ];
244  for (int parity = 0; parity < 2; parity++) {
245  getCoords(x, id, arg.X, parity);
246  for (int dr = 0; dr < 4; dr++) x[dr] += arg.border[dr];
247  idx = linkIndex(x,X);
248  for (int d = 0; d < 4; d++) {
249  Matrix<complex<typename Arg::real>, Arg::nColor> U;
250  U = randomize<typename Arg::real, Arg::nColor>(localState);
251  arg.dataOr(d, idx, parity) = U;
252  }
253  }
254  arg.rngstate.State()[ id ] = localState;
255  }
256 
257  template<typename Float, int nColors, QudaReconstructType recon>
258  class InitGaugeHot : Tunable {
259  InitGaugeHotArg<Float, nColors, recon> arg;
260  const GaugeField &meta;
261  unsigned int sharedBytesPerThread() const { return 0; }
262  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
263  bool tuneSharedBytes() const { return false; }
264  bool tuneGridDim() const { return false; }
265  unsigned int minThreads() const { return arg.threads; }
266 
267  public:
268  InitGaugeHot(GaugeField &data, RNG &rngstate) :
269  arg(data, rngstate),
270  meta(data)
271  {
272  apply(0);
273  qudaDeviceSynchronize();
274  data.exchangeExtendedGhost(data.R(),false);
275  }
276 
277  void apply(const qudaStream_t &stream){
278  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
279  qudaLaunchKernel(compute_InitGauge_HotStart<decltype(arg)>, tp, stream, arg);
280  }
281 
282  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
283 
284  void preTune(){ arg.rngstate.backup(); }
285  void postTune(){ arg.rngstate.restore(); }
286  long long flops() const { return 0; }
287  long long bytes() const { return meta.Bytes(); }
288  };
289 
290  /**
291  * @brief Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-GPU case (no need to exchange data)
292  *
293  * @param[in,out] data Gauge field
294  */
295  void InitGaugeField(GaugeField& data) {
296 #ifdef GPU_GAUGE_ALG
297  instantiate<InitGaugeCold>(data);
298 #else
299  errorQuda("Pure gauge code has not been built");
300 #endif
301  }
302 
303  /** @brief Perform a hot start to the gauge field, random SU(3) matrix, followed by reunitarization, also exchange borders links in multi-GPU case.
304  *
305  * @param[in,out] data Gauge field
306  * @param[in,out] rngstate state of the CURAND random number generator
307  */
308  void InitGaugeField(GaugeField& data, RNG &rngstate) {
309 #ifdef GPU_GAUGE_ALG
310  instantiate<InitGaugeHot>(data, rngstate);
311 #else
312  errorQuda("Pure gauge code has not been built");
313 #endif
314  }
315 
316 }