16 #define PI 3.1415926535897932384626433832795 // pi 19 #define PII 6.2831853071795864769252867665590 // 2 * pi 26 template <
typename Gauge>
27 struct InitGaugeColdArg {
33 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
34 threads = X[0] * X[1] * X[2] * X[3];
38 template<
typename Float,
typename Gauge,
int NCOLORS>
39 __global__
void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge>
arg){
40 int idx = threadIdx.x + blockIdx.x * blockDim.x;
41 if ( idx >= arg.threads )
return;
43 if ( idx >= arg.threads / 2 ) {
45 idx -= arg.threads / 2;
49 for (
int d = 0; d < 4; d++ ) arg.dataOr(d, idx, parity) = U;
53 template<
typename Float,
typename Gauge,
int NCOLORS>
55 InitGaugeColdArg<Gauge>
arg;
56 mutable char aux_string[128];
58 unsigned int sharedBytesPerThread()
const {
65 bool tuneGridDim()
const {
68 unsigned int minThreads()
const {
73 InitGaugeCold(InitGaugeColdArg<Gauge> &arg)
79 void apply(
const cudaStream_t &
stream){
81 compute_InitGauge_ColdStart<Float, Gauge, NCOLORS> <<< tp.
grid,tp.
block >>> (
arg);
86 std::stringstream vol;
87 vol << arg.X[0] <<
"x";
88 vol << arg.X[1] <<
"x";
89 vol << arg.X[2] <<
"x";
91 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
92 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
96 long long flops()
const {
99 long long bytes()
const {
105 template<
typename Float,
int NCOLORS,
typename Gauge>
107 InitGaugeColdArg<Gauge> initarg(dataOr, data);
108 InitGaugeCold<Float, Gauge, NCOLORS>
init(initarg);
115 template<
typename Float>
121 InitGaugeField<Float, 3>(Gauge(data), data);
124 InitGaugeField<Float, 3>(Gauge(data), data);
127 InitGaugeField<Float, 3>(Gauge(data), data);
144 InitGaugeField<float> (data);
146 InitGaugeField<double>(data);
160 template <
typename Gauge>
161 struct InitGaugeHotArg {
170 : dataOr(dataOr), rngstate(rngstate) {
172 for (
int dir = 0; dir < 4; ++dir ) {
173 border[dir] = data.
R()[dir];
174 X[dir] = data.
X()[dir] - border[dir] * 2;
177 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
181 threads = X[0] * X[1] * X[2] * X[3] >> 1;
186 template <
typename Float>
187 __host__ __device__
static inline void reunit_link(
Matrix<complex<Float>,3> &U ){
189 complex<Float> t2((Float)0.0, (Float)0.0);
194 for (
int c = 0; c < 3; c++ ) t1 +=
norm(U(0,c));
195 t1 = (Float)1.0 /
sqrt(t1);
199 for (
int c = 0; c < 3; c++ ) U(0,c) *= t1;
202 for (
int c = 0; c < 3; c++ ) t2 +=
conj(U(0,c)) * U(1,c);
205 for (
int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
211 for (
int c = 0; c < 3; c++ ) t1 +=
norm(U(1,c));
212 t1 = (Float)1.0 /
sqrt(t1);
216 for (
int c = 0; c < 3; c++ ) U(1, c) *= t1;
219 U(2,0) =
conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
220 U(2,1) =
conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
221 U(2,2) =
conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
239 T aabs, ctheta, stheta, phi;
240 a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
241 aabs =
sqrt( 1.0 - a(0,0) * a(0,0));
242 ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
243 phi =
PII * Random<T>(localState);
244 stheta = ( curand(&localState) & 1 ? 1 : -1 ) *
sqrt( (T)1.0 - ctheta * ctheta );
245 a(0,1) = aabs * stheta *
cos( phi );
246 a(1,0) = aabs * stheta *
sin( phi );
247 a(1,1) = aabs * ctheta;
258 template <
class T,
int NCOLORS>
259 __host__ __device__
static inline void mul_block_sun(
Matrix<T,2> u,
Matrix<complex<T>,NCOLORS> &link, int2
id ){
260 for (
int j = 0; j < NCOLORS; j++ ) {
261 complex<T>
tmp = complex<T>( u(0,0), u(1,1) ) * link(
id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(
id.y, j);
262 link(
id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(
id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(
id.y, j);
273 template<
int NCOLORS>
274 __host__ __device__
static inline int2
IndexBlock(
int block){
280 while ( del_i < (NCOLORS - 1) && found == 0 ) {
282 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
284 if ( index == block ) {
300 template <
class Float,
int NCOLORS>
302 Matrix<complex<Float>,NCOLORS> U;
304 for (
int i = 0; i < NCOLORS; i++ )
305 for (
int j = 0; j < NCOLORS; j++ )
306 U(i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
307 reunit_link<Float>(U);
320 template<
typename Float,
typename Gauge,
int NCOLORS>
321 __global__
void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge> arg){
322 int idx = threadIdx.x + blockIdx.x * blockDim.x;
323 if ( idx >= arg.threads )
return;
326 for (
int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
327 for (
int dr = 0; dr < 4; ++dr ) X[dr] += 2 * arg.border[dr];
329 cuRNGState localState = arg.rngstate.State()[ id ];
331 cuRNGState localState = arg.rngstate.State()[ idx ];
336 for (
int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
339 for (
int d = 0; d < 4; d++ ) {
340 Matrix<complex<Float>,NCOLORS> U;
341 U = randomize<Float, NCOLORS>(localState);
342 arg.dataOr(d, idx,
parity) = U;
346 arg.rngstate.State()[ id ] = localState;
348 arg.rngstate.State()[ idx ] = localState;
355 template<
typename Float,
typename Gauge,
int NCOLORS>
357 InitGaugeHotArg<Gauge>
arg;
358 mutable char aux_string[128];
360 unsigned int sharedBytesPerThread()
const {
366 bool tuneSharedBytes()
const {
369 bool tuneGridDim()
const {
372 unsigned int minThreads()
const {
377 InitGaugeHot(InitGaugeHotArg<Gauge> &arg)
383 void apply(
const cudaStream_t &
stream){
385 compute_InitGauge_HotStart<Float, Gauge, NCOLORS> <<< tp.
grid,tp.
block >>> (
arg);
390 std::stringstream vol;
391 vol << arg.X[0] <<
"x";
392 vol << arg.X[1] <<
"x";
393 vol << arg.X[2] <<
"x";
395 sprintf(aux_string,
"threads=%d,prec=%lud", arg.threads,
sizeof(Float));
396 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
400 void preTune(){ arg.rngstate.backup(); }
401 void postTune(){ arg.rngstate.restore(); }
402 long long flops()
const {
405 long long bytes()
const {
412 template<
typename Float,
int NCOLORS,
typename Gauge>
414 InitGaugeHotArg<Gauge> initarg(dataOr, data, rngstate);
415 InitGaugeHot<Float, Gauge, NCOLORS>
init(initarg);
423 template<
typename Float>
429 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
432 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
435 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
443 #endif // GPU_GAUGE_ALG 453 InitGaugeField<float> (data, rngstate);
455 InitGaugeField<double>(data, rngstate);
460 errorQuda(
"Pure gauge code has not been built");
struct curandStateMRG32k3a cuRNGState
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
cudaColorSpinorField * tmp
#define qudaDeviceSynchronize()
__host__ __device__ ValueType sin(ValueType x)
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
Class declaration to initialize and hold CURAND RNG states.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void InitGaugeField(cudaGaugeField &data)
Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-G...
Main header file for host and device accessors to GaugeFields.
void init()
Create the CUBLAS context.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
static int index(int ndim, const int *dims, const int *x)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.