13 #include <cub/cub.cuh> 18 #define PI 3.1415926535897932384626433832795 // pi 21 #define PII 6.2831853071795864769252867665590 // 2 * pi 41 while ( del_i < (NCOLORS - 1) && found == 0 ) {
43 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
62 __host__ __device__
static inline void IndexBlock(
int block,
int &
p,
int &q){
64 if (
block == 0 ) {
p = 0; q = 1; }
65 else if (
block == 1 ) {
p = 1; q = 2; }
68 else if ( NCOLORS > 3 ) {
73 while ( del_i < (NCOLORS - 1) && found == 0 ) {
75 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
96 T xr1, xr2, xr3, xr4,
d, r;
98 xr1 = Random<T>(localState);
99 xr1 = (
log((xr1 + 1.
e-10)));
100 xr2 = Random<T>(localState);
101 xr2 = (
log((xr2 + 1.
e-10)));
102 xr3 = Random<T>(localState);
103 xr4 = Random<T>(localState);
105 d = -(xr2 + xr1 * xr3 * xr3 ) / al;
108 if ((1.00 - 0.5 *
d) > xr4 * xr4 ) nacd = 1;
109 if ( nacd == 0 && al > 2.0 ) {
110 for ( k = 0; k < 20; k++ ) {
112 xr1 = Random<T>(localState);
113 xr1 = (
log((xr1 + 1.
e-10)));
114 xr2 = Random<T>(localState);
115 xr2 = (
log((xr2 + 1.
e-10)));
116 xr3 = Random<T>(localState);
117 xr4 = Random<T>(localState);
119 d = -(xr2 + xr1 * xr3 * xr3) / al;
120 if ((1.00 - 0.5 *
d) > xr4 * xr4 )
break;
124 if ( nacd == 0 && al <= 2.0 ) {
125 xr3 =
exp(-2.0 * al);
127 for ( k = 0; k < 20; k++ ) {
129 xr1 = Random<T>(localState);
130 xr2 = Random<T>(localState);
132 a(0,0) = 1.00 +
log(r) / al;
133 if ((1.0 -
a(0,0) *
a(0,0)) > xr2 * xr2 )
break;
141 xr3 = 1.0 -
a(0,0) *
a(0,0);
145 a(1,1) = (2.0 * Random<T>(localState) - 1.0) * r;
147 xr1 = xr3 -
a(1,1) *
a(1,1);
151 xr2 =
PII * Random<T>(localState);
152 a(0,1) = xr1 *
cos(xr2);
153 a(1,0) = xr1 *
sin(xr2);
196 template <
class T,
int NCOLORS>
197 __host__ __device__
static inline Matrix<T,2> get_block_su2(
Matrix<complex<T>,NCOLORS>
tmp1, int2
id ){
212 template <
class T,
int NCOLORS>
216 tmp1(
id.
x,
id.
x) = complex<T>( rr(0,0), rr(1,1) );
217 tmp1(
id.
x,
id.
y) = complex<T>( rr(1,0), rr(0,1) );
218 tmp1(
id.
y,
id.
x) = complex<T>(-rr(1,0), rr(0,1) );
219 tmp1(
id.
y,
id.
y) = complex<T>( rr(0,0),-rr(1,1) );
228 template <
class T,
int NCOLORS>
229 __host__ __device__
static inline void mul_block_sun(
Matrix<T,2> u,
Matrix<complex<T>,NCOLORS> &link, int2
id ){
230 for (
int j = 0; j < NCOLORS; j++ ) {
231 complex<T>
tmp = complex<T>( u(0,0), u(1,1) ) * link(
id.
x, j) + complex<T>( u(1,0), u(0,1) ) * link(
id.
y, j);
232 link(
id.
y, j) = complex<T>(-u(1,0), u(0,1) ) * link(
id.
x, j) + complex<T>( u(0,0),-u(1,1) ) * link(
id.
y, j);
246 template <
class Cmplx>
247 __host__ __device__
static inline void block_su2_to_su3(
Matrix<Cmplx,3> &U, Cmplx a00, Cmplx a01, Cmplx a10, Cmplx a11,
int block ){
251 tmp = a00 * U(0,0) + a01 * U(1,0);
252 U(1,0) = a10 * U(0,0) + a11 * U(1,0);
254 tmp = a00 * U(0,1) + a01 * U(1,1);
255 U(1,1) = a10 * U(0,1) + a11 * U(1,1);
257 tmp = a00 * U(0,2) + a01 * U(1,2);
258 U(1,2) = a10 * U(0,2) + a11 * U(1,2);
262 tmp = a00 * U(1,0) + a01 * U(2,0);
263 U(2,0) = a10 * U(1,0) + a11 * U(2,0);
265 tmp = a00 * U(1,1) + a01 * U(2,1);
266 U(2,1) = a10 * U(1,1) + a11 * U(2,1);
268 tmp = a00 * U(1,2) + a01 * U(2,2);
269 U(2,2) = a10 * U(1,2) + a11 * U(2,2);
273 tmp = a00 * U(0,0) + a01 * U(2,0);
274 U(2,0) = a10 * U(0,0) + a11 * U(2,0);
276 tmp = a00 * U(0,1) + a01 * U(2,1);
277 U(2,1) = a10 * U(0,1) + a11 * U(2,1);
279 tmp = a00 * U(0,2) + a01 * U(2,2);
280 U(2,2) = a10 * U(0,2) + a11 * U(2,2);
289 template <
class Float>
292 b(0,0) = v(0,0) * u(0,0) + v(0,1) * u(0,1) + v(1,0) * u(1,0) + v(1,1) * u(1,1);
293 b(0,1) = v(0,1) * u(0,0) - v(0,0) * u(0,1) + v(1,0) * u(1,1) - v(1,1) * u(1,0);
294 b(1,0) = v(1,0) * u(0,0) - v(0,0) * u(1,0) + v(1,1) * u(0,1) - v(0,1) * u(1,1);
295 b(1,1) = v(1,1) * u(0,0) - v(0,0) * u(1,1) + v(0,1) * u(1,0) - v(1,0) * u(0,1);
305 template <
class Float,
int NCOLORS>
306 __device__
inline void heatBathSUN(
Matrix<complex<Float>,NCOLORS>& U,
Matrix<complex<Float>,NCOLORS> F,
309 if ( NCOLORS == 3 ) {
330 IndexBlock<NCOLORS>(
block,
p, q);
331 complex<Float> a0((Float)0.0, (Float)0.0);
332 complex<Float>
a1 = a0;
333 complex<Float>
a2 = a0;
334 complex<Float> a3 = a0;
336 for (
int j = 0; j < NCOLORS; j++ ) {
337 a0 += U(
p,j) * F(j,
p);
338 a1 += U(
p,j) * F(j,q);
339 a2 += U(q,j) * F(j,
p);
340 a3 += U(q,j) * F(j,q);
343 r(0,0) = a0.x + a3.x;
344 r(0,1) =
a1.y +
a2.y;
345 r(1,0) =
a1.x -
a2.x;
346 r(1,1) = a0.y - a3.y;
347 Float k =
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
348 Float ap = BetaOverNc * k;
352 r = mulsu2UVDagger<Float>(
a, r);
354 a0 = complex<Float>( r(0,0), r(1,1) );
355 a1 = complex<Float>( r(1,0), r(0,1) );
356 a2 = complex<Float>(-r(1,0), r(0,1) );
357 a3 = complex<Float>( r(0,0),-r(1,1) );
360 for (
int j = 0; j < NCOLORS; j++ ) {
361 tmp0 = a0 * U(
p,j) +
a1 * U(q,j);
362 U(q,j) =
a2 * U(
p,j) + a3 * U(q,j);
369 else if ( NCOLORS > 3 ) {
374 int2
id = IndexBlock<NCOLORS>(
block );
376 Float k =
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));
377 Float ap = BetaOverNc * k;
383 mul_block_sun<Float, NCOLORS>( rr, U,
id);
384 mul_block_sun<Float, NCOLORS>( rr, M,
id);
438 template <
class Float,
int NCOLORS>
439 __device__
inline void overrelaxationSUN(
Matrix<complex<Float>,NCOLORS>& U,
Matrix<complex<Float>,NCOLORS> F ){
441 if ( NCOLORS == 3 ) {
469 IndexBlock<NCOLORS>(
block,
p, q);
470 complex<Float> a0((Float)0., (Float)0.);
471 complex<Float>
a1 = a0;
472 complex<Float>
a2 = a0;
473 complex<Float> a3 = a0;
475 for (
int j = 0; j < NCOLORS; j++ ) {
476 a0 += U(
p,j) * F(j,
p);
477 a1 += U(
p,j) * F(j,q);
478 a2 += U(q,j) * F(j,
p);
479 a3 += U(q,j) * F(j,q);
482 r(0,0) = a0.x + a3.x;
483 r(0,1) =
a1.y +
a2.y;
484 r(1,0) =
a1.x -
a2.x;
485 r(1,1) = a0.y - a3.y;
488 Float
norm = 1.0 /
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
496 a0 = complex<Float>( r(0,0), r(1,1) );
497 a1 = complex<Float>( r(1,0), r(0,1) );
498 a2 = complex<Float>(-r(1,0), r(0,1) );
499 a3 = complex<Float>( r(0,0),-r(1,1) );
502 for (
int j = 0; j < NCOLORS; j++ ) {
503 tmp0 = a0 * U(
p,j) +
a1 * U(q,j);
504 tmp1 =
a2 * U(
p,j) + a3 * U(q,j);
512 else if ( NCOLORS > 3 ) {
516 int2
id = IndexBlock<NCOLORS>(
block );
519 Float
norm = 1.0 /
sqrt(r(0,0) * r(0,0) + r(0,1) * r(0,1) + r(1,0) * r(1,0) + r(1,1) * r(1,1));;
524 mul_block_sun<Float, NCOLORS>( r, U,
id);
525 mul_block_sun<Float, NCOLORS>( r, U,
id);
526 mul_block_sun<Float, NCOLORS>( r, M,
id);
527 mul_block_sun<Float, NCOLORS>( r, M,
id);
576 template <
typename Gauge,
typename Float,
int NCOLORS>
584 cudaGaugeField &data;
587 MonteArg(
const Gauge &dataOr, cudaGaugeField & data, Float Beta, RNG &rngstate)
588 : dataOr(dataOr), data(data), rngstate(rngstate) {
589 BetaOverNc = Beta / (Float)NCOLORS;
591 for (
int dir = 0; dir < 4; ++dir ) {
592 border[dir] = data.R()[dir];
593 X[dir] = data.X()[dir] - border[dir] * 2;
596 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.X()[dir];
598 threads =
X[0] *
X[1] *
X[2] *
X[3] >> 1;
603 template<
typename Float,
typename Gauge,
int NCOLORS,
bool HeatbathOrRelax>
604 __global__
void compute_heatBath(MonteArg<Gauge, Float, NCOLORS>
arg,
int mu,
int parity){
606 if (
idx >=
arg.threads )
return;
610 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
616 for (
int dr = 0; dr < 4; ++dr ) {
617 x[dr] +=
arg.border[dr];
618 X[dr] += 2 *
arg.border[dr];
627 for (
int nu = 0; nu < 4; nu++ )
if (
mu != nu ) {
628 int dx[4] = { 0, 0, 0, 0 };
643 link =
conj(link) * U;
650 if ( HeatbathOrRelax ) {
652 heatBathSUN<Float, NCOLORS>( U,
conj(staple), localState,
arg.BetaOverNc );
653 arg.rngstate.State()[
id ] = localState;
656 overrelaxationSUN<Float, NCOLORS>( U,
conj(staple) );
662 template<
typename Float,
typename Gauge,
int NCOLORS,
int NElems,
bool HeatbathOrRelax>
663 class GaugeHB : Tunable {
664 MonteArg<Gauge, Float, NCOLORS>
arg;
667 mutable char aux_string[128];
669 unsigned int sharedBytesPerThread()
const {
672 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
676 bool tuneGridDim()
const {
679 unsigned int minThreads()
const {
684 GaugeHB(MonteArg<Gauge, Float, NCOLORS> &
arg)
689 void SetParam(
int _mu,
int _parity){
693 void apply(
const cudaStream_t &
stream){
695 compute_heatBath<Float, Gauge, NCOLORS, HeatbathOrRelax ><< < tp.grid,tp.block, tp.shared_bytes,
stream >> > (
arg,
mu,
parity);
698 TuneKey tuneKey()
const {
699 std::stringstream vol;
700 vol <<
arg.X[0] <<
"x";
701 vol <<
arg.X[1] <<
"x";
702 vol <<
arg.X[2] <<
"x";
704 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
705 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
710 if(HeatbathOrRelax)
arg.rngstate.backup();
714 if(HeatbathOrRelax)
arg.rngstate.restore();
716 long long flops()
const {
719 if ( NCOLORS == 3 ) {
720 long long flop = 2268LL;
721 if ( HeatbathOrRelax ) {
731 long long flop = NCOLORS * NCOLORS * NCOLORS * 84LL;
732 if ( HeatbathOrRelax ) {
733 flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (46LL + 48LL + 56LL * NCOLORS);
736 flop += NCOLORS * NCOLORS * NCOLORS + (NCOLORS * ( NCOLORS - 1) / 2) * (17LL + 112LL * NCOLORS);
742 long long bytes()
const {
744 if ( NCOLORS == 3 ) {
745 long long byte = 20LL * NElems *
sizeof(Float);
746 if ( HeatbathOrRelax ) byte += 2LL *
sizeof(
cuRNGState);
751 long long byte = 20LL * NCOLORS * NCOLORS * 2 *
sizeof(Float);
752 if ( HeatbathOrRelax ) byte += 2LL *
sizeof(
cuRNGState);
767 template<
typename Float,
int NElems,
int NCOLORS,
typename Gauge>
768 void Monte( Gauge dataOr, cudaGaugeField& data, RNG &rngstate, Float Beta,
int nhb,
int nover) {
770 TimeProfile profileHBOVR(
"HeatBath_OR_Relax",
false);
771 MonteArg<Gauge, Float, NCOLORS> montearg(dataOr, data, Beta, rngstate);
773 GaugeHB<Float, Gauge, NCOLORS, NElems, true> hb(montearg);
774 for (
int step = 0; step < nhb; ++step ) {
776 for (
int mu = 0;
mu < 4; ++
mu ) {
789 double gflops = (hb.flops() * 8 * nhb * 1
e-9) / (secs);
790 double gbytes = hb.bytes() * 8 * nhb / (secs * 1e9);
794 printfQuda(
"HB: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
799 GaugeHB<Float, Gauge, NCOLORS, NElems, false> relax(montearg);
800 for (
int step = 0; step < nover; ++step ) {
802 for (
int mu = 0;
mu < 4; ++
mu ) {
815 double gflops = (relax.flops() * 8 * nover * 1
e-9) / (secs);
816 double gbytes = relax.bytes() * 8 * nover / (secs * 1e9);
820 printfQuda(
"OVR: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
827 template<
typename Float>
828 void Monte( cudaGaugeField& data, RNG &rngstate, Float Beta,
int nhb,
int nover) {
830 if ( data.isNative() ) {
832 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
833 Monte<Float, 18, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
835 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
836 Monte<Float, 12, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
838 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
839 Monte<Float, 8, 3>(Gauge(data), data, rngstate, Beta, nhb, nover);
841 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
847 #endif // GPU_GAUGE_ALG 860 Monte<float> (data, rngstate, (
float)Beta, nhb, nover);
862 Monte<double>(data, rngstate, Beta, nhb, nover);
867 errorQuda(
"Pure gauge code has not been built");
868 #endif // GPU_GAUGE_ALG
struct curandStateMRG32k3a cuRNGState
__device__ __host__ void setZero(Matrix< T, N > *m)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType exp(ValueType x)
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
cudaColorSpinorField * tmp
void PGaugeExchange(cudaGaugeField &data, const int dir, const int parity)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
char * index(const char *, int)
__host__ __device__ ValueType sin(ValueType x)
def id
projector matrices ######################################################################## ...
Class declaration to initialize and hold CURAND RNG states.
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
__host__ __device__ ValueType log(ValueType x)
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType abs(ValueType x)
__host__ __device__ ValueType conj(ValueType x)
void Monte(cudaGaugeField &data, RNG &rngstate, double Beta, int nhb, int nover)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)