1 #include <quda_internal.h>
2 #include <quda_matrix.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
8 #include <unitarization_links.h>
9 #include <pgauge_monte.h>
10 #include <random_quda.h>
11 #include <index_helper.cuh>
12 #include <instantiate.h>
15 #define PI 3.1415926535897932384626433832795 // pi
18 #define PII 6.2831853071795864769252867665590 // 2 * pi
23 template <typename Float, int nColor_, QudaReconstructType recon_>
24 struct InitGaugeColdArg {
25 static constexpr int nColor = nColor_;
26 static constexpr QudaReconstructType recon = recon_;
27 using real = typename mapper<Float>::type;
28 using Gauge = typename gauge_mapper<real, recon>::type;
29 int threads; // number of active threads required
30 int X[4]; // grid dimensions
32 InitGaugeColdArg(const GaugeField &data)
34 for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
35 threads = X[0] * X[1] * X[2] * X[3];
39 template <typename Arg> __global__ void compute_InitGauge_ColdStart(Arg arg)
41 int idx = threadIdx.x + blockIdx.x * blockDim.x;
42 if ( idx >= arg.threads ) return;
44 if ( idx >= arg.threads / 2 ) {
46 idx -= arg.threads / 2;
48 Matrix<complex<typename Arg::real>, Arg::nColor> U;
50 for ( int d = 0; d < 4; d++ ) arg.dataOr(d, idx, parity) = U;
53 template <typename Float, int nColor, QudaReconstructType recon>
54 class InitGaugeCold : Tunable {
55 InitGaugeColdArg<Float, nColor, recon> arg;
56 const GaugeField &meta;
57 unsigned int sharedBytesPerThread() const { return 0; }
58 unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0; }
59 bool tuneGridDim() const { return false;}
60 unsigned int minThreads() const { return arg.threads; }
63 InitGaugeCold(GaugeField &data) :
70 void apply(const qudaStream_t &stream)
72 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
73 qudaLaunchKernel(compute_InitGauge_ColdStart<decltype(arg)>, tp, stream, arg);
76 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
77 long long flops() const { return 0; }
78 long long bytes() const { return meta.Bytes(); }
81 template <typename Float, int nColor_, QudaReconstructType recon_>
82 struct InitGaugeHotArg {
83 static constexpr int nColor = nColor_;
84 static constexpr QudaReconstructType recon = recon_;
85 using real = typename mapper<Float>::type;
86 using Gauge = typename gauge_mapper<real, recon>::type;
87 int threads; // number of active threads required
88 int X[4]; // grid dimensions
92 InitGaugeHotArg(const GaugeField &data, RNG &rngstate) :
96 for ( int dir = 0; dir < 4; ++dir ) {
97 border[dir] = data.R()[dir];
98 X[dir] = data.X()[dir] - border[dir] * 2;
101 //the optimal number of RNG states in rngstate array must be equal to half the lattice volume
102 //this number is the same used in heatbath...
103 threads = X[0] * X[1] * X[2] * X[3] >> 1;
107 template <typename Float>
108 __host__ __device__ static inline void reunit_link( Matrix<complex<Float>,3> &U )
110 complex<Float> t2((Float)0.0, (Float)0.0);
112 //first normalize first row
113 //sum of squares of row
115 for ( int c = 0; c < 3; c++ ) t1 += norm(U(0,c));
116 t1 = (Float)1.0 / sqrt(t1);
118 //used to normalize row
120 for ( int c = 0; c < 3; c++ ) U(0,c) *= t1;
123 for ( int c = 0; c < 3; c++ ) t2 += conj(U(0,c)) * U(1,c);
126 for ( int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
128 //normalize second row
129 //sum of squares of row
132 for ( int c = 0; c < 3; c++ ) t1 += norm(U(1,c));
133 t1 = (Float)1.0 / sqrt(t1);
135 //used to normalize row
137 for ( int c = 0; c < 3; c++ ) U(1, c) *= t1;
139 //Reconstruct lat row
140 U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
141 U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
142 U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
148 @brief Generate the four random real elements of the SU(2) matrix
149 @param localstate CURAND rng state
150 @return four real numbers of the SU(2) matrix
153 __device__ static inline Matrix<T,2> randomSU2(cuRNGState& localState){
155 T aabs, ctheta, stheta, phi;
156 a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
157 aabs = sqrt( 1.0 - a(0,0) * a(0,0));
158 ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
159 phi = PII * Random<T>(localState);
160 stheta = ( curand(&localState) & 1 ? 1 : -1 ) * sqrt( (T)1.0 - ctheta * ctheta );
161 a(0,1) = aabs * stheta * cos( phi );
162 a(1,0) = aabs * stheta * sin( phi );
163 a(1,1) = aabs * ctheta;
168 @brief Update the SU(Nc) link with the new SU(2) matrix, link <- u * link
169 @param u SU(2) matrix represented by four real numbers
170 @param link SU(Nc) matrix
173 template <class T, int NCOLORS>
174 __host__ __device__ static inline void mul_block_sun( Matrix<T,2> u, Matrix<complex<T>,NCOLORS> &link, int2 id ){
175 for ( int j = 0; j < NCOLORS; j++ ) {
176 complex<T> tmp = complex<T>( u(0,0), u(1,1) ) * link(id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(id.y, j);
177 link(id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(id.y, j);
183 @brief Calculate the SU(2) index block in the SU(Nc) matrix
184 @param block number to calculate the index's, the total number of blocks is NCOLORS * ( NCOLORS - 1) / 2.
185 @return Returns two index's in int2 type, accessed by .x and .y.
187 template<int NCOLORS>
188 __host__ __device__ static inline int2 IndexBlock(int block){
194 while ( del_i < (NCOLORS - 1) && found == 0 ) {
196 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
198 if ( index == block ) {
210 @brief Generate a SU(Nc) random matrix
211 @param localstate CURAND rng state
212 @return SU(Nc) matrix
214 template <class Float, int NCOLORS>
215 __device__ inline Matrix<complex<Float>,NCOLORS> randomize( cuRNGState& localState )
217 Matrix<complex<Float>,NCOLORS> U;
219 for ( int i = 0; i < NCOLORS; i++ )
220 for ( int j = 0; j < NCOLORS; j++ )
221 U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
222 reunit_link<Float>(U);
226 for( int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
227 Matrix<Float,2> rr = randomSU2<Float>(localState);
228 int2 id = IndexBlock<NCOLORS>( block );
229 mul_block_sun<Float, NCOLORS>(rr, U, id);
230 //U = block_su2_to_su3<Float>( U, a00, a01, a10, a11, block );
235 template<typename Arg> __global__ void compute_InitGauge_HotStart(Arg arg)
237 int idx = threadIdx.x + blockIdx.x * blockDim.x;
238 if ( idx >= arg.threads ) return;
240 for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
241 for ( int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
243 cuRNGState localState = arg.rngstate.State()[ id ];
244 for (int parity = 0; parity < 2; parity++) {
245 getCoords(x, id, arg.X, parity);
246 for (int dr = 0; dr < 4; dr++) x[dr] += arg.border[dr];
247 idx = linkIndex(x,X);
248 for (int d = 0; d < 4; d++) {
249 Matrix<complex<typename Arg::real>, Arg::nColor> U;
250 U = randomize<typename Arg::real, Arg::nColor>(localState);
251 arg.dataOr(d, idx, parity) = U;
254 arg.rngstate.State()[ id ] = localState;
257 template<typename Float, int nColors, QudaReconstructType recon>
258 class InitGaugeHot : Tunable {
259 InitGaugeHotArg<Float, nColors, recon> arg;
260 const GaugeField &meta;
261 unsigned int sharedBytesPerThread() const { return 0; }
262 unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0; }
263 bool tuneSharedBytes() const { return false; }
264 bool tuneGridDim() const { return false; }
265 unsigned int minThreads() const { return arg.threads; }
268 InitGaugeHot(GaugeField &data, RNG &rngstate) :
273 qudaDeviceSynchronize();
274 data.exchangeExtendedGhost(data.R(),false);
277 void apply(const qudaStream_t &stream){
278 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
279 qudaLaunchKernel(compute_InitGauge_HotStart<decltype(arg)>, tp, stream, arg);
282 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
284 void preTune(){ arg.rngstate.backup(); }
285 void postTune(){ arg.rngstate.restore(); }
286 long long flops() const { return 0; }
287 long long bytes() const { return meta.Bytes(); }
291 * @brief Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-GPU case (no need to exchange data)
293 * @param[in,out] data Gauge field
295 void InitGaugeField(GaugeField& data) {
297 instantiate<InitGaugeCold>(data);
299 errorQuda("Pure gauge code has not been built");
303 /** @brief Perform a hot start to the gauge field, random SU(3) matrix, followed by reunitarization, also exchange borders links in multi-GPU case.
305 * @param[in,out] data Gauge field
306 * @param[in,out] rngstate state of the CURAND random number generator
308 void InitGaugeField(GaugeField& data, RNG &rngstate) {
310 instantiate<InitGaugeHot>(data, rngstate);
312 errorQuda("Pure gauge code has not been built");