11 #include <cub/cub.cuh> 16 #define PI 3.1415926535897932384626433832795 // pi 19 #define PII 6.2831853071795864769252867665590 // 2 * pi 26 template <
typename Gauge>
27 struct InitGaugeColdArg {
31 InitGaugeColdArg(
const Gauge &dataOr,
const cudaGaugeField &data)
33 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.X()[dir];
34 threads =
X[0] *
X[1] *
X[2] *
X[3];
41 template<
typename Float,
typename Gauge,
int NCOLORS>
42 __global__
void compute_InitGauge_ColdStart(InitGaugeColdArg<Gauge>
arg){
44 if (
idx >=
arg.threads )
return;
46 if (
idx >=
arg.threads / 2 ) {
52 for (
int d = 0;
d < 4;
d++ )
57 template<
typename Float,
typename Gauge,
int NCOLORS>
58 class InitGaugeCold : Tunable {
59 InitGaugeColdArg<Gauge>
arg;
60 mutable char aux_string[128];
62 unsigned int sharedBytesPerThread()
const {
65 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
69 bool tuneGridDim()
const {
72 unsigned int minThreads()
const {
77 InitGaugeCold(InitGaugeColdArg<Gauge> &
arg)
83 void apply(
const cudaStream_t &
stream){
85 compute_InitGauge_ColdStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (
arg);
89 TuneKey tuneKey()
const {
90 std::stringstream vol;
91 vol <<
arg.X[0] <<
"x";
92 vol <<
arg.X[1] <<
"x";
93 vol <<
arg.X[2] <<
"x";
95 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
96 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
100 long long flops()
const {
103 long long bytes()
const {
109 template<
typename Float,
int NCOLORS,
typename Gauge>
111 InitGaugeColdArg<Gauge> initarg(dataOr, data);
112 InitGaugeCold<Float, Gauge, NCOLORS>
init(initarg);
119 template<
typename Float>
122 if ( data.isNative() ) {
124 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
125 InitGaugeField<Float, 3>(Gauge(data), data);
127 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
128 InitGaugeField<Float, 3>(Gauge(data), data);
130 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
131 InitGaugeField<Float, 3>(Gauge(data), data);
133 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
148 InitGaugeField<float> (data);
150 InitGaugeField<double>(data);
152 errorQuda(
"Precision %d not supported", data.Precision());
164 template <
typename Gauge>
165 struct InitGaugeHotArg {
173 InitGaugeHotArg(
const Gauge &dataOr,
const cudaGaugeField &data, RNG &rngstate)
174 : dataOr(dataOr), rngstate(rngstate) {
176 for (
int dir = 0; dir < 4; ++dir ) {
177 border[dir] = data.R()[dir];
178 X[dir] = data.X()[dir] - border[dir] * 2;
181 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.X()[dir];
185 threads =
X[0] *
X[1] *
X[2] *
X[3] >> 1;
190 template <
typename Float>
191 __host__ __device__
static inline void reunit_link(
Matrix<complex<Float>,3> &U ){
193 complex<Float> t2((Float)0.0, (Float)0.0);
198 for (
int c = 0;
c < 3;
c++ ) t1 +=
norm(U(0,
c));
199 t1 = (Float)1.0 /
sqrt(t1);
203 for (
int c = 0;
c < 3;
c++ ) U(0,
c) *= t1;
206 for (
int c = 0;
c < 3;
c++ ) t2 +=
conj(U(0,
c)) * U(1,
c);
209 for (
int c = 0;
c < 3;
c++ ) U(1,
c) -= t2 * U(0,
c);
215 for (
int c = 0;
c < 3;
c++ ) t1 +=
norm(U(1,
c));
216 t1 = (Float)1.0 /
sqrt(t1);
220 for (
int c = 0;
c < 3;
c++ ) U(1,
c) *= t1;
223 U(2,0) =
conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
224 U(2,1) =
conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
225 U(2,2) =
conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
243 T aabs, ctheta, stheta, phi;
244 a(0,0) = Random<T>(localState, (T)-1.0, (T)1.0);
245 aabs =
sqrt( 1.0 -
a(0,0) *
a(0,0));
246 ctheta = Random<T>(localState, (T)-1.0, (T)1.0);
247 phi =
PII * Random<T>(localState);
248 stheta = ( curand(&localState) & 1 ? 1 : -1 ) *
sqrt( (T)1.0 - ctheta * ctheta );
249 a(0,1) = aabs * stheta *
cos( phi );
250 a(1,0) = aabs * stheta *
sin( phi );
251 a(1,1) = aabs * ctheta;
262 template <
class T,
int NCOLORS>
263 __host__ __device__
static inline void mul_block_sun(
Matrix<T,2> u,
Matrix<complex<T>,NCOLORS> &link, int2
id ){
264 for (
int j = 0; j < NCOLORS; j++ ) {
265 complex<T>
tmp = complex<T>( u(0,0), u(1,1) ) * link(
id.
x, j) + complex<T>( u(1,0), u(0,1) ) * link(
id.
y, j);
266 link(
id.
y, j) = complex<T>(-u(1,0), u(0,1) ) * link(
id.
x, j) + complex<T>( u(0,0),-u(1,1) ) * link(
id.
y, j);
277 template<
int NCOLORS>
284 while ( del_i < (NCOLORS - 1) && found == 0 ) {
286 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
304 template <
class Float,
int NCOLORS>
308 for (
int i = 0;
i < NCOLORS;
i++ )
309 for (
int j = 0; j < NCOLORS; j++ )
310 U(
i,j) = complex<Float>( (Float)(Random<Float>(localState) - 0.5), (Float)(Random<Float>(localState) - 0.5) );
311 reunit_link<Float>(U);
324 template<
typename Float,
typename Gauge,
int NCOLORS>
325 __global__
void compute_InitGauge_HotStart(InitGaugeHotArg<Gauge>
arg){
327 if (
idx >=
arg.threads )
return;
330 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
331 for (
int dr = 0; dr < 4; ++dr )
X[dr] += 2 *
arg.border[dr];
340 for (
int dr = 0; dr < 4; ++dr )
x[dr] +=
arg.border[dr];
343 for (
int d = 0;
d < 4;
d++ ) {
345 U = randomize<Float, NCOLORS>(localState);
350 arg.rngstate.State()[
id ] = localState;
352 arg.rngstate.State()[
idx ] = localState;
359 template<
typename Float,
typename Gauge,
int NCOLORS>
360 class InitGaugeHot : Tunable {
361 InitGaugeHotArg<Gauge>
arg;
362 mutable char aux_string[128];
364 unsigned int sharedBytesPerThread()
const {
367 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
370 bool tuneSharedBytes()
const {
373 bool tuneGridDim()
const {
376 unsigned int minThreads()
const {
381 InitGaugeHot(InitGaugeHotArg<Gauge> &
arg)
387 void apply(
const cudaStream_t &
stream){
389 compute_InitGauge_HotStart<Float, Gauge, NCOLORS><< < tp.grid,tp.block >> > (
arg);
393 TuneKey tuneKey()
const {
394 std::stringstream vol;
395 vol <<
arg.X[0] <<
"x";
396 vol <<
arg.X[1] <<
"x";
397 vol <<
arg.X[2] <<
"x";
399 sprintf(aux_string,
"threads=%d,prec=%lud",
arg.threads,
sizeof(Float));
400 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
404 void preTune(){
arg.rngstate.backup(); }
405 void postTune(){
arg.rngstate.restore(); }
406 long long flops()
const {
409 long long bytes()
const {
419 template<
typename Float,
int NCOLORS,
typename Gauge>
420 void InitGaugeField( Gauge dataOr, cudaGaugeField& data, RNG &rngstate) {
421 InitGaugeHotArg<Gauge> initarg(dataOr, data, rngstate);
422 InitGaugeHot<Float, Gauge, NCOLORS>
init(initarg);
427 data.exchangeExtendedGhost(data.R(),
false);
432 template<
typename Float>
435 if ( data.isNative() ) {
437 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
438 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
440 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
441 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
443 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
444 InitGaugeField<Float, 3>(Gauge(data), data, rngstate);
446 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
452 #endif // GPU_GAUGE_ALG 462 InitGaugeField<float> (data, rngstate);
464 InitGaugeField<double>(data, rngstate);
469 errorQuda(
"Pure gauge code has not been built");
struct curandStateMRG32k3a cuRNGState
static __device__ __host__ int linkIndex(const int x[], const I X[4])
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
__host__ __device__ ValueType sqrt(ValueType x)
cudaColorSpinorField * tmp
char * index(const char *, int)
__host__ __device__ ValueType sin(ValueType x)
def id
projector matrices ######################################################################## ...
for(int s=0;s< param.dc.Ls;s++)
Class declaration to initialize and hold CURAND RNG states.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void InitGaugeField(cudaGaugeField &data)
Perform a cold start to the gauge field, identity SU(3) matrix, also fills the ghost links in multi-G...
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)