13 #ifdef GPU_GAUGE_TOOLS 15 template <
typename Gauge>
16 struct GaugePlaqArg :
public ReduceArg<double2> {
23 GaugePlaqArg(
const Gauge &dataOr,
const GaugeField &data)
27 for (
int dir=0; dir<4; ++dir){
28 border[dir] = data.
R()[dir];
29 E[dir] = data.
X()[dir];
30 X[dir] = data.
X()[dir] - border[dir]*2;
33 threads =
X[0]*
X[1]*
X[2]*
X[3]/2;
37 template<
int blockSize,
typename Float,
typename Gauge>
38 __global__
void computePlaq(GaugePlaqArg<Gauge>
arg){
43 double2 plaq = make_double2(0.0,0.0);
48 for (
int dr=0; dr<4; ++dr)
x[dr] +=
arg.border[dr];
50 int dx[4] = {0, 0, 0, 0};
51 for (
int mu = 0;
mu < 3;
mu++) {
52 for (
int nu = (
mu+1); nu < 3; nu++) {
80 reduce2d<blockSize,2>(
arg, plaq);
83 template<
typename Float,
typename Gauge>
85 GaugePlaqArg<Gauge>
arg;
93 :
arg(
arg), location(location) {}
98 arg.result_h[0] = make_double2(0.,0.);
109 std::stringstream vol,
aux;
110 vol <<
arg.X[0] <<
"x" <<
arg.X[1] <<
"x" <<
arg.X[2] <<
"x" <<
arg.X[3];
111 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
112 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
115 long long flops()
const {
return 6ll*2*
arg.threads*(3*198+3); }
116 long long bytes()
const {
return 6ll*4*2*
arg.threads*
arg.dataOr.Bytes(); }
119 template<
typename Float,
typename Gauge>
121 GaugePlaqArg<Gauge>
arg(dataOr, data);
122 GaugePlaq<Float,Gauge> gaugePlaq(
arg, location);
128 plq.x =
arg.result_h[0].x;
129 plq.y =
arg.result_h[0].y;
132 template<
typename Float>
140 #ifdef GPU_GAUGE_TOOLS 143 double3 plaq = make_double3(0.5*(plq.x + plq.y), plq.x, plq.y);
146 double3 plaq = make_double3(0., 0., 0.);
double3 plaquette(const GaugeField &U, QudaFieldLocation location)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
void comm_allreduce_array(double *data, size_t size)
virtual TuneKey tuneKey() const =0
virtual long long bytes() const
#define INSTANTIATE_PRECISION(func, lat,...)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
enum QudaFieldLocation_s QudaFieldLocation
#define INSTANTIATE_RECONSTRUCT(func, g,...)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
virtual unsigned int minThreads() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)