7 #include <launch_kernel.cuh>
11 #ifdef GPU_GAUGE_TOOLS
// NOTE(review): extraction fragment of the GaugePlaqArg<Gauge> argument
// struct — the enclosing "struct GaugePlaqArg {" line, the member
// declarations (threads, X[4], border[4], dataOr, plaq, plaq_h) and the
// #ifdef MULTI_GPU guards appear to have been lost; the stray leading
// numbers are original source line numbers fused in by the extractor.
14 template <
typename Gauge>
// Constructor: wraps the gauge-field accessor and allocates one pinned
// (page-locked) host double for the plaquette sum so the device can write
// the result directly via zero-copy (see cudaHostGetDevicePointer below).
25 GaugePlaqArg(
const Gauge &dataOr,
const GaugeField &data)
26 : dataOr(dataOr), plaq_h(static_cast<double*>(
pinned_malloc(sizeof(double)))) {
// Loop body elided by extraction — presumably initializes border[dir]
// (halo width) for the multi-GPU build; TODO confirm against original.
28 for(
int dir=0; dir<4; ++dir){
// Multi-GPU branch: strip the 2*border halo from each local dimension.
32 for(
int dir=0; dir<4; ++dir)
X[dir] = data.X()[dir] - border[dir]*2;
// Single-GPU branch: use the full local dimensions as-is.
34 for(
int dir=0; dir<4; ++dir)
X[dir] = data.X()[dir];
// Map the pinned host buffer into the device address space (zero-copy),
// so the kernel's atomicAdd on plaq lands directly in plaq_h.
42 cudaHostGetDevicePointer(&plaq, plaq_h, 0);
// Software double-precision atomicAdd implemented with a compare-and-swap
// retry loop.  The extracted text had lost the `do { assumed = old;` loop
// header and the `return old;` — restored here to the canonical form.
//
// NOTE(review): SM60+ provides a native atomicAdd(double*, double); this
// fallback is only required on older architectures (guard it with
// `#if __CUDA_ARCH__ < 600` if the file targets mixed architectures).
//
// The termination test compares bit patterns rather than values so the
// loop also terminates when *addr holds NaN (NaN != NaN numerically).
// Returns the value that was stored at addr before this add took effect.
static __inline__ __device__ double atomicAdd(double *addr, double val)
{
  double old = *addr, assumed;
  do {
    assumed = old;
    old = __longlong_as_double(
        atomicCAS((unsigned long long int *)addr,
                  __double_as_longlong(assumed),
                  __double_as_longlong(val + assumed)));
  } while (__double_as_longlong(assumed) != __double_as_longlong(old));
  return old;
}
// Checkerboard (even/odd half-lattice) index of the site at x + dx on a
// local lattice of dimensions X[0..3], with periodic wrapping in every
// direction.  The extracted text had lost the `int y[4];` declaration and
// the `return idx;` — restored here.
//
// Assumes |dx[i]| < X[i], so adding a single X[i] before the modulo is
// enough to keep the operand non-negative.  The full lexicographic index
// (t slowest, x fastest) is halved because sites are stored split by
// parity; the caller supplies the parity separately (see getCoords3).
__device__ __host__ inline int linkIndex3(int x[], int dx[], const int X[4])
{
  int y[4];
  for (int i = 0; i < 4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
  int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
  return idx;
}
// Inverse of linkIndex3 (for dx = 0): reconstructs the 4d coordinates
// x[0..3] of a site from its checkerboard index cb_index and its parity
// (0 = even sites, 1 = odd sites) on a lattice of dimensions X.
// The extracted text had lost the surrounding braces — restored here;
// the arithmetic is byte-for-byte what the fragment showed.
//
// Assumes X[0] is even so each x-row holds exactly X[0]/2 sites of each
// parity.  x[0] carries the parity bit: it is offset so that
// (x[0]+x[1]+x[2]+x[3]) & 1 == parity.
__device__ __host__ inline void getCoords3(int x[4], int cb_index, const int X[4], int parity)
{
  x[3] = cb_index / (X[2] * X[1] * X[0] / 2);
  x[2] = (cb_index / (X[1] * X[0] / 2)) % X[2];
  x[1] = (cb_index / (X[0] / 2)) % X[1];
  x[0] = 2 * (cb_index % (X[0] / 2)) + ((x[3] + x[2] + x[1] + parity) & 1);
}
// NOTE(review): extraction fragment of the plaquette kernel — the
// declarations of parity, x[4], X[4], the link matrices U1..U4, the
// accumulator tmpM, the per-thread double `plaq`, and the dx updates
// between the four link loads were all lost; the stray leading numbers
// are original source line numbers fused in by the extractor.
79 template<
int blockSize,
typename Float,
typename Gauge>
// One thread per (site, parity): accumulates the real traces of the six
// mu<nu plaquettes at its site, block-reduces them with CUB, and adds one
// partial sum per block into arg.plaq (zero-copy host buffer).
80 __global__
void computePlaq(GaugePlaqArg<Gauge>
arg){
81 int idx = threadIdx.x + blockIdx.x*blockDim.x;
// Guard the grid tail: arg.threads need not be a multiple of blockDim.x.
82 if(idx >= arg.threads)
return;
83 typedef typename ComplexTypeId<Float>::Type Cmplx;
// Upper half of the thread range handles odd-parity sites; the parity
// flip and `idx -= arg.threads/2` inside this branch were elided.
85 if(idx >= arg.threads/2) {
91 for(
int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
// Recover the 4d coordinates from the checkerboard index.
94 getCoords3(x, idx, X, parity);
// Shift the site into the extended (halo-including) local volume.
96 for(
int dr=0; dr<4; ++dr) {
97 x[dr] += arg.border[dr];
98 X[dr] += 2*arg.border[dr];
103 int dx[4] = {0, 0, 0, 0};
// Loop over the six distinct planes mu < nu.
104 for (
int mu = 0;
mu < 3;
mu++) {
105 for (
int nu = (
mu+1); nu < 4; nu++) {
// Load the four links of the plaquette
// U_mu(x) U_nu(x+mu) U_mu(x+nu)^dag U_nu(x)^dag;
// the dx increments between loads were elided by extraction.
108 arg.dataOr.load((
Float*)(U1.data),linkIndex3(x,dx,X),
mu, parity);
110 arg.dataOr.load((
Float*)(U2.data),linkIndex3(x,dx,X), nu, 1-parity);
113 arg.dataOr.load((
Float*)(U3.data),linkIndex3(x,dx,X),
mu, 1-parity);
115 arg.dataOr.load((
Float*)(U4.data),linkIndex3(x,dx,X), nu, parity);
// Close the loop with the daggered links (conj = hermitian
// conjugate for these matrix types — the U1*U2 product and the
// trace accumulation into `plaq` were elided by extraction).
118 tmpM = tmpM *
conj(U3);
119 tmpM = tmpM *
conj(U4);
// Block-wide sum of the per-thread plaquette contributions (CUB).
125 typedef cub::BlockReduce<double, blockSize> BlockReduce;
126 __shared__
typename BlockReduce::TempStorage temp_storage;
127 double aggregate = BlockReduce(temp_storage).Sum(plaq);
// Exactly one atomic per block into the zero-copy result buffer;
// avoids every thread contending on a single global address.
129 if (threadIdx.x == 0) atomicAdd((
double *) arg.plaq, aggregate);
// NOTE(review): extraction fragment of the autotuned launcher class — the
// constructor signature (only its init list at "147" survives), the
// paramString() header, the preTune/postTune methods, the tuneLaunch call
// inside apply(), and the closing braces were lost; stray leading numbers
// are original source line numbers fused in by the extractor.
132 template<
typename Float,
typename Gauge>
// Tunable wrapper that launches computePlaq with autotuned block size and
// normalizes the reduced sum into the average plaquette value.
133 class GaugePlaq : Tunable {
134 GaugePlaqArg<Gauge>
arg;
// CUB BlockReduce scratch scales with block size: one Float per thread.
138 unsigned int sharedBytesPerThread()
const {
return sizeof(
Float); }
139 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
// Shared-memory size and grid shape are fixed by the kernel; only the
// block size participates in autotuning.
141 bool tuneSharedBytes()
const {
return false; }
142 bool tuneGridDim()
const {
return false; }
143 unsigned int minThreads()
const {
return arg.threads; }
// Constructor body (signature elided): just captures the arg struct and
// the execution location.
147 : arg(arg), location(location) {}
150 void apply(
const cudaStream_t &
stream){
// LAUNCH_KERNEL dispatches computePlaq<blockSize,...> on the tuned
// block size tp (the tuneLaunch call producing tp was elided).
154 LAUNCH_KERNEL(computePlaq, tp, stream, arg,
Float, Gauge);
// Required before reading plaq_h on the host: the kernel writes it
// through a zero-copy device mapping.
157 cudaDeviceSynchronize();
// Normalize: 6 plaquettes/site, trace of a 3x3 unitary matrix is at
// most 3, both parities -> divisor 18 per site; multi-GPU variant
// also divides by the node count.
162 ((
double *) arg.plaq_h)[0] /= 18.*(arg.threads*nNodes);
164 ((
double *) arg.plaq_h)[0] /= 18.*arg.threads;
172 TuneKey tuneKey()
const {
173 std::stringstream vol, aux;
174 vol << arg.X[0] <<
"x";
175 vol << arg.X[1] <<
"x";
176 vol << arg.X[2] <<
"x";
// NOTE(review): the line appending arg.X[3] appears to have been
// elided here — confirm the volume string ends with the 4th dim.
178 aux <<
"threads=" << arg.threads <<
",prec=" <<
sizeof(
Float);
179 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
// paramString fragment (method header elided): formats the launch
// configuration for tuning diagnostics.
184 std::stringstream ps;
185 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
")";
186 ps <<
"shared=" << param.shared_bytes;
// Rough cost model used by the tuner; per-thread constants only.
192 long long flops()
const {
return (1)*6*arg.threads; }
193 long long bytes()
const {
return (1)*6*arg.threads*
sizeof(
Float); }
// NOTE(review): extraction fragment — the function signature (presumably
// `void plaquette(Gauge dataOr, const GaugeField &data,
// QudaFieldLocation location, double &plq)`), the gaugePlaq.apply(...)
// call and any multi-GPU comm_allreduce of the result were elided.
197 template<
typename Float,
typename Gauge>
// Builds the kernel argument struct and the tunable launcher, runs the
// measurement, and reads the normalized result from the pinned buffer.
199 GaugePlaqArg<Gauge>
arg(dataOr, data);
200 GaugePlaq<Float,Gauge> gaugePlaq(arg, location);
// Ensure the kernel (and zero-copy write to plaq_h) has completed
// before the host reads the result.
202 cudaDeviceSynchronize();
203 plq = ((
double *) arg.plaq_h)[0];
// NOTE(review): extraction fragment — the function header and the
// if/else chain on data.Order() / data.Reconstruct() that selects among
// these instantiations were elided; only the call sites survive.
206 template<
typename Float>
// FLOAT2 gauge order: dispatch on reconstruction (18 = none, 12, 8).
216 plaquette(FloatNOrder<Float, 18, 2, 18>(data), data, location, res);
218 plaquette(FloatNOrder<Float, 18, 2, 12>(data), data, location, res);
220 plaquette(FloatNOrder<Float, 18, 2, 8>(data), data, location, res);
222 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
// FLOAT4 gauge order: same reconstruction dispatch.
226 plaquette(FloatNOrder<Float, 18, 4, 18>(data), data, location, res);
228 plaquette(FloatNOrder<Float, 18, 4, 12>(data), data, location, res);
230 plaquette(FloatNOrder<Float, 18, 4, 8>(data), data, location, res);
232 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
// NOTE(review): extraction fragment of the public entry point
// (`double plaquette(const GaugeField &data, QudaFieldLocation location)`
// per the trailing declaration list) — the function header, the switch on
// data.Precision(), and the non-GPU_GAUGE_TOOLS error branch were elided.
244 #ifdef GPU_GAUGE_TOOLS
// Half precision has no plaquette path; fail loudly rather than degrade.
246 errorQuda(
"Half precision not supported\n");
// Dispatch to the single/double instantiations of plaquette<Float>.
249 return plaquette<float> (data,
location);
251 return plaquette<double>(data,
location);
#define pinned_malloc(size)
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
__global__ void const FloatN FloatM FloatM Float Float int threads
double plaquette(const GaugeField &data, QudaFieldLocation location)
QudaPrecision Precision() const
const QudaFieldLocation location
FloatingPoint< float > Float
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void comm_allreduce(double *data)
__host__ __device__ ValueType conj(ValueType x)