18 #define PI 3.1415926535897932384626433832795 // pi 21 #define PII 6.2831853071795864769252867665590 // 2 * pi 35 __host__ __device__
static inline int2
IndexBlock(
int block){
41 while ( del_i < (NCOLORS - 1) && found == 0 ) {
43 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
45 if ( index == block ) {
62 __host__ __device__
static inline void IndexBlock(
int block,
int &p,
int &q){
64 if ( block == 0 ) { p = 0; q = 1; }
65 else if ( block == 1 ) { p = 1; q = 2; }
68 else if ( NCOLORS > 3 ) {
73 while ( del_i < (NCOLORS - 1) && found == 0 ) {
75 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
77 if ( index == block ) {
96 T xr1, xr2, xr3, xr4, d, r;
98 xr1 = Random<T>(localState);
99 xr1 = (
log((xr1 + 1.e-10)));
100 xr2 = Random<T>(localState);
101 xr2 = (
log((xr2 + 1.e-10)));
102 xr3 = Random<T>(localState);
103 xr4 = Random<T>(localState);
105 d = -(xr2 + xr1 * xr3 * xr3 ) / al;
108 if ((1.00 - 0.5 * d) > xr4 * xr4 ) nacd = 1;
109 if ( nacd == 0 && al > 2.0 ) {
110 for ( k = 0; k < 20; k++ ) {
112 xr1 = Random<T>(localState);
113 xr1 = (
log((xr1 + 1.e-10)));
114 xr2 = Random<T>(localState);
115 xr2 = (
log((xr2 + 1.e-10)));
116 xr3 = Random<T>(localState);
117 xr4 = Random<T>(localState);
119 d = -(xr2 + xr1 * xr3 * xr3) / al;
120 if ((1.00 - 0.5 * d) > xr4 * xr4 )
break;
124 if ( nacd == 0 && al <= 2.0 ) {
125 xr3 =
exp(-2.0 * al);
127 for ( k = 0; k < 20; k++ ) {
129 xr1 = Random<T>(localState);
130 xr2 = Random<T>(localState);
132 a(0,0) = 1.00 +
log(r) / al;
133 if ((1.0 - a(0,0) * a(0,0)) > xr2 * xr2 )
break;
141 xr3 = 1.0 - a(0,0) * a(0,0);
145 a(1,1) = (2.0 * Random<T>(localState) - 1.0) * r;
147 xr1 = xr3 - a(1,1) * a(1,1);
151 xr2 =
PII * Random<T>(localState);
152 a(0,1) = xr1 *
cos(xr2);
153 a(1,0) = xr1 *
sin(xr2);
196 template <
class T,
int NCOLORS>
197 __host__ __device__
static inline Matrix<T,2> get_block_su2(
Matrix<complex<T>,NCOLORS>
tmp1, int2
id ){
199 r(0,0) =
tmp1(
id.x,
id.x).
x +
tmp1(
id.y,
id.y).
x;
200 r(0,1) =
tmp1(
id.x,
id.y).y +
tmp1(
id.y,
id.x).y;
201 r(1,0) =
tmp1(
id.x,
id.y).
x -
tmp1(
id.y,
id.x).
x;
202 r(1,1) =
tmp1(
id.x,
id.x).y -
tmp1(
id.y,
id.y).y;
212 template <
class T,
int NCOLORS>
214 Matrix<complex<T>,NCOLORS>
tmp1;
216 tmp1(
id.x,
id.x) = complex<T>( rr(0,0), rr(1,1) );
217 tmp1(
id.x,
id.y) = complex<T>( rr(1,0), rr(0,1) );
218 tmp1(
id.y,
id.x) = complex<T>(-rr(1,0), rr(0,1) );
219 tmp1(
id.y,
id.y) = complex<T>( rr(0,0),-rr(1,1) );
228 template <
class T,
int NCOLORS>
229 __host__ __device__
static inline void mul_block_sun(
Matrix<T,2> u,
Matrix<complex<T>,NCOLORS> &link, int2
id ){
230 for (
int j = 0; j < NCOLORS; j++ ) {
231 complex<T>
tmp = complex<T>( u(0,0), u(1,1) ) * link(
id.x, j) + complex<T>( u(1,0), u(0,1) ) * link(
id.y, j);
232 link(
id.y, j) = complex<T>(-u(1,0), u(0,1) ) * link(
id.x, j) + complex<T>( u(0,0),-u(1,1) ) * link(
id.y, j);
246 template <
class Cmplx>
247 __host__ __device__
static inline void block_su2_to_su3(
Matrix<Cmplx,3> &U, Cmplx a00, Cmplx a01, Cmplx a10, Cmplx a11,
int block ){
251 tmp = a00 * U(0,0) + a01 * U(1,0);
252 U(1,0) = a10 * U(0,0) + a11 * U(1,0);
254 tmp = a00 * U(0,1) + a01 * U(1,1);
255 U(1,1) = a10 * U(0,1) + a11 * U(1,1);
257 tmp = a00 * U(0,2) + a01 * U(1,2);
258 U(1,2) = a10 * U(0,2) + a11 * U(1,2);
262 tmp = a00 * U(1,0) + a01 * U(2,0);
263 U(2,0) = a10 * U(1,0) + a11 * U(2,0);
265 tmp = a00 * U(1,1) + a01 * U(2,1);
266 U(2,1) = a10 * U(1,1) + a11 * U(2,1);
268 tmp = a00 * U(1,2) + a01 * U(2,2);
269 U(2,2) = a10 * U(1,2) + a11 * U(2,2);
273 tmp = a00 * U(0,0) + a01 * U(2,0);
274 U(2,0) = a10 * U(0,0) + a11 * U(2,0);
276 tmp = a00 * U(0,1) + a01 * U(2,1);
277 U(2,1) = a10 * U(0,1) + a11 * U(2,1);
279 tmp = a00 * U(0,2) + a01 * U(2,2);
280 U(2,2) = a10 * U(0,2) + a11 * U(2,2);
289 template <
class Float>
292 b(0,0) = v(0,0) * u(0,0) + v(0,1) * u(0,1) + v(1,0) * u(1,0) + v(1,1) * u(1,1);
293 b(0,1) = v(0,1) * u(0,0) - v(0,0) * u(0,1) + v(1,0) * u(1,1) - v(1,1) * u(1,0);
294 b(1,0) = v(1,0) * u(0,0) - v(0,0) * u(1,0) + v(1,1) * u(0,1) - v(0,1) * u(1,1);
295 b(1,1) = v(1,1) * u(0,0) - v(0,0) * u(1,1) + v(0,1) * u(1,0) - v(1,0) * u(0,1);
305 template <
class Float,
int NCOLORS>
306 __device__
inline void heatBathSUN(
Matrix<complex<Float>,NCOLORS>& U,
Matrix<complex<Float>,NCOLORS> F,
309 if ( NCOLORS == 3 ) {
328 for (
int block = 0; block < NCOLORS; block++ ) {
330 IndexBlock<NCOLORS>(block, p, q);
331 complex<Float> a0((Float)0.0, (Float)0.0);
332 complex<Float> a1 = a0;
333 complex<Float> a2 = a0;
334 complex<Float> a3 = a0;
336 for (
int j = 0; j < NCOLORS; j++ ) {
337 a0 += U(p,j) * F(j,p);
338 a1 += U(p,j) * F(j,q);
339 a2 += U(q,j) * F(j,p);
340 a3 += U(q,j) * F(j,q);
343 r(0,0) = a0.x + a3.x;
344 r(0,1) = a1.y + a2.y;
345 r(1,0) = a1.x - a2.x;
346 r(1,1) = a0.y - a3.y;
347 Float k =
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
348 Float ap = BetaOverNc * k;
352 r = mulsu2UVDagger<Float>( a, r);
354 a0 = complex<Float>( r(0,0), r(1,1) );
355 a1 = complex<Float>( r(1,0), r(0,1) );
356 a2 = complex<Float>(-r(1,0), r(0,1) );
357 a3 = complex<Float>( r(0,0),-r(1,1) );
360 for (
int j = 0; j < NCOLORS; j++ ) {
361 tmp0 = a0 * U(p,j) + a1 * U(q,j);
362 U(q,j) = a2 * U(p,j) + a3 * U(q,j);
369 else if ( NCOLORS > 3 ) {
373 for (
int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
374 int2
id = IndexBlock<NCOLORS>( block );
376 Float k =
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));
377 Float ap = BetaOverNc * k;
383 mul_block_sun<Float, NCOLORS>( rr, U, id);
384 mul_block_sun<Float, NCOLORS>( rr, M, id);
438 template <
class Float,
int NCOLORS>
439 __device__
inline void overrelaxationSUN(
Matrix<complex<Float>,NCOLORS>& U,
Matrix<complex<Float>,NCOLORS> F ){
441 if ( NCOLORS == 3 ) {
467 for (
int block = 0; block < 3; block++ ) {
469 IndexBlock<NCOLORS>(block, p, q);
470 complex<Float> a0((Float)0., (Float)0.);
471 complex<Float> a1 = a0;
472 complex<Float> a2 = a0;
473 complex<Float> a3 = a0;
475 for (
int j = 0; j < NCOLORS; j++ ) {
476 a0 += U(p,j) * F(j,p);
477 a1 += U(p,j) * F(j,q);
478 a2 += U(q,j) * F(j,p);
479 a3 += U(q,j) * F(j,q);
482 r(0,0) = a0.x + a3.x;
483 r(0,1) = a1.y + a2.y;
484 r(1,0) = a1.x - a2.x;
485 r(1,1) = a0.y - a3.y;
488 Float
norm = 1.0 /
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
496 a0 = complex<Float>( r(0,0), r(1,1) );
497 a1 = complex<Float>( r(1,0), r(0,1) );
498 a2 = complex<Float>(-r(1,0), r(0,1) );
499 a3 = complex<Float>( r(0,0),-r(1,1) );
500 complex<Float> tmp0,
tmp1;
502 for (
int j = 0; j < NCOLORS; j++ ) {
503 tmp0 = a0 * U(p,j) + a1 * U(q,j);
504 tmp1 = a2 * U(p,j) + a3 * U(q,j);
505 U(p,j) = a0 * tmp0 + a1 *
tmp1;
506 U(q,j) = a2 * tmp0 + a3 *
tmp1;
512 else if ( NCOLORS > 3 ) {
515 for (
int block = 0; block < NCOLORS * ( NCOLORS - 1) / 2; block++ ) {
516 int2
id = IndexBlock<NCOLORS>( block );
519 Float
norm = 1.0 /
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
524 mul_block_sun<Float, NCOLORS>( r, U, id);
525 mul_block_sun<Float, NCOLORS>( r, U, id);
526 mul_block_sun<Float, NCOLORS>( r, M, id);
527 mul_block_sun<Float, NCOLORS>( r, M, id);
576 template <
typename Gauge,
typename Float,
int NCOLORS>
588 : dataOr(dataOr), data(data), rngstate(rngstate) {
589 BetaOverNc = Beta / (Float)NCOLORS;
591 for (
int dir = 0; dir < 4; ++dir ) {
592 border[dir] = data.
R()[dir];
593 X[dir] = data.
X()[dir] - border[dir] * 2;
596 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
598 threads = X[0] * X[1] * X[2] * X[3] >> 1;
603 template<
typename Float,
typename Gauge,
int NCOLORS,
bool HeatbathOrRelax>
604 __global__
void compute_heatBath(MonteArg<Gauge, Float, NCOLORS>
arg,
int mu,
int parity){
605 int idx = threadIdx.x + blockIdx.x * blockDim.x;
606 if ( idx >= arg.threads )
return;
610 for (
int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
616 for (
int dr = 0; dr < 4; ++dr ) {
617 x[dr] += arg.border[dr];
618 X[dr] += 2 * arg.border[dr];
627 for (
int nu = 0; nu < 4; nu++ )
if ( mu != nu ) {
628 int dx[4] = { 0, 0, 0, 0 };
642 link =
conj(link) * U;
648 U = arg.dataOr(mu, idx, parity);
649 if ( HeatbathOrRelax ) {
650 cuRNGState localState = arg.rngstate.State()[ id ];
651 heatBathSUN<Float, NCOLORS>( U,
conj(staple), localState, arg.BetaOverNc );
652 arg.rngstate.State()[ id ] = localState;
655 overrelaxationSUN<Float, NCOLORS>( U,
conj(staple) );
657 arg.dataOr(mu, idx, parity) = U;
661 template<
typename Float,
typename Gauge,
int NCOLORS,
int NElems,
bool HeatbathOrRelax>
663 MonteArg<Gauge, Float, NCOLORS>
arg;
666 mutable char aux_string[128];
668 unsigned int sharedBytesPerThread()
const {
675 bool tuneGridDim()
const {
678 unsigned int minThreads()
const {
683 GaugeHB(MonteArg<Gauge, Float, NCOLORS> &arg)
688 void SetParam(
int _mu,
int _parity){
692 void apply(
const cudaStream_t &
stream){
698 std::stringstream vol;
699 vol << arg.X[0] <<
"x";
700 vol << arg.X[1] <<
"x";
701 vol << arg.X[2] <<
"x";
703 sprintf(aux_string,
"threads=%d,prec=%lu",arg.threads,
sizeof(Float));
704 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
709 if(HeatbathOrRelax) arg.rngstate.backup();
713 if(HeatbathOrRelax) arg.rngstate.restore();
715 long long flops()
const {
718 if ( NCOLORS == 3 ) {
719 long long flop = 2268LL;
720 if ( HeatbathOrRelax ) {
730 long long flop = NCOLORS * NCOLORS * NCOLORS * 84LL;
731 if ( HeatbathOrRelax ) {
732 flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (46LL + 48LL + 56LL * NCOLORS);
735 flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (17LL + 112LL * NCOLORS);
741 long long bytes()
const {
743 if ( NCOLORS == 3 ) {
744 long long byte = 20LL * NElems *
sizeof(Float);
745 if ( HeatbathOrRelax ) byte += 2LL *
sizeof(
cuRNGState);
750 long long byte = 20LL * NCOLORS * NCOLORS * 2 *
sizeof(Float);
751 if ( HeatbathOrRelax ) byte += 2LL *
sizeof(
cuRNGState);
766 template<
typename Float,
int NElems,
int NCOLORS,
typename Gauge>
769 TimeProfile profileHBOVR(
"HeatBath_OR_Relax",
false);
770 MonteArg<Gauge, Float, NCOLORS> montearg(dataOr, data, Beta, rngstate);
772 GaugeHB<Float, Gauge, NCOLORS, NElems, true> hb(montearg);
773 for (
int step = 0; step < nhb; ++step ) {
774 for (
int parity = 0; parity < 2; ++
parity ) {
775 for (
int mu = 0; mu < 4; ++
mu ) {
776 hb.SetParam(mu, parity);
788 double gflops = (hb.flops() * 8 * nhb * 1e-9) / (secs);
789 double gbytes = hb.bytes() * 8 * nhb / (secs * 1e9);
793 printfQuda(
"HB: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
798 GaugeHB<Float, Gauge, NCOLORS, NElems, false> relax(montearg);
799 for (
int step = 0; step < nover; ++step ) {
800 for (
int parity = 0; parity < 2; ++
parity ) {
801 for (
int mu = 0; mu < 4; ++
mu ) {
802 relax.SetParam(mu, parity);
814 double gflops = (relax.flops() * 8 * nover * 1e-9) / (secs);
815 double gbytes = relax.bytes() * 8 * nover / (secs * 1e9);
819 printfQuda(
"OVR: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
826 template<
typename Float>
832 Monte<Float, 18, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
835 Monte<Float, 12, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
838 Monte<Float, 8, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
846 #endif // GPU_GAUGE_ALG 859 Monte<float> (data, rngstate, (float)Beta, nhb, nover);
861 Monte<double>(data, rngstate, Beta, nhb, nover);
866 errorQuda(
"Pure gauge code has not been built");
867 #endif // GPU_GAUGE_ALG
cudaColorSpinorField * tmp1
struct curandStateMRG32k3a cuRNGState
__device__ __host__ void setZero(Matrix< T, N > *m)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType exp(ValueType x)
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
cudaColorSpinorField * tmp
void PGaugeExchange(cudaGaugeField &data, const int dir, const int parity)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
double Last(QudaProfileType idx)
#define qudaDeviceSynchronize()
__host__ __device__ ValueType sin(ValueType x)
Class declaration to initialize and hold CURAND RNG states.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
__host__ __device__ ValueType log(ValueType x)
static int index(int ndim, const int *dims, const int *x)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType abs(ValueType x)
__host__ __device__ ValueType conj(ValueType x)
void Monte(cudaGaugeField &data, RNG &rngstate, double Beta, int nhb, int nover)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.