QUDA  0.9.0
gauge_plaq.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <atomic.cuh>
8 #include <cub_helper.cuh>
9 #include <index_helper.cuh>
10 
11 namespace quda {
12 
13 #ifdef GPU_GAUGE_TOOLS
14 
15  template <typename Gauge>
16  struct GaugePlaqArg : public ReduceArg<double2> {
17  int threads; // number of active threads required
18  int E[4]; // extended grid dimensions
19  int X[4]; // true grid dimensions
20  int border[4];
21  Gauge dataOr;
22 
23  GaugePlaqArg(const Gauge &dataOr, const GaugeField &data)
24  : ReduceArg<double2>(), dataOr(dataOr)
25  {
26  int R = 0;
27  for (int dir=0; dir<4; ++dir){
28  border[dir] = data.R()[dir];
29  E[dir] = data.X()[dir];
30  X[dir] = data.X()[dir] - border[dir]*2;
31  R += border[dir];
32  }
33  threads = X[0]*X[1]*X[2]*X[3]/2;
34  }
35  };
36 
37  template<int blockSize, typename Float, typename Gauge>
38  __global__ void computePlaq(GaugePlaqArg<Gauge> arg){
39  typedef Matrix<complex<Float>,3> Link;
40  int idx = threadIdx.x + blockIdx.x*blockDim.x;
41  int parity = threadIdx.y;
42 
43  double2 plaq = make_double2(0.0,0.0);
44 
45  if(idx < arg.threads) {
46  int x[4];
47  getCoords(x, idx, arg.X, parity);
48  for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
49 
50  int dx[4] = {0, 0, 0, 0};
51  for (int mu = 0; mu < 3; mu++) {
52  for (int nu = (mu+1); nu < 3; nu++) {
53 
54  Link U1 = arg.dataOr(mu, linkIndexShift(x,dx,arg.E), parity);
55  dx[mu]++;
56  Link U2 = arg.dataOr(nu, linkIndexShift(x,dx,arg.E), 1-parity);
57  dx[mu]--;
58  dx[nu]++;
59  Link U3 = arg.dataOr(mu, linkIndexShift(x,dx,arg.E), 1-parity);
60  dx[nu]--;
61  Link U4 = arg.dataOr(nu, linkIndexShift(x,dx,arg.E), parity);
62 
63  plaq.x += getTrace( U1 * U2 * conj(U3) * conj(U4) ).x;
64  }
65 
66  Link U1 = arg.dataOr(mu, linkIndexShift(x,dx,arg.E), parity);
67  dx[mu]++;
68  Link U2 = arg.dataOr(3, linkIndexShift(x,dx,arg.E), 1-parity);
69  dx[mu]--;
70  dx[3]++;
71  Link U3 = arg.dataOr(mu,linkIndexShift(x,dx,arg.E), 1-parity);
72  dx[3]--;
73  Link U4 = arg.dataOr(3, linkIndexShift(x,dx,arg.E), parity);
74 
75  plaq.y += getTrace( U1 * U2 * conj(U3) * conj(U4) ).x;
76  }
77  }
78 
79  // perform final inter-block reduction and write out result
80  reduce2d<blockSize,2>(arg, plaq);
81  }
82 
83  template<typename Float, typename Gauge>
84  class GaugePlaq : TunableLocalParity {
85  GaugePlaqArg<Gauge> arg;
86  const QudaFieldLocation location;
87 
88  private:
89  unsigned int minThreads() const { return arg.threads; }
90 
91  public:
92  GaugePlaq(GaugePlaqArg<Gauge> &arg, QudaFieldLocation location)
93  : arg(arg), location(location) {}
94  ~GaugePlaq () { }
95 
96  void apply(const cudaStream_t &stream){
97  if(location == QUDA_CUDA_FIELD_LOCATION){
98  arg.result_h[0] = make_double2(0.,0.);
99  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
100 
101  LAUNCH_KERNEL_LOCAL_PARITY(computePlaq, tp, stream, arg, Float, Gauge);
103  } else {
104  errorQuda("CPU not supported yet\n");
105  }
106  }
107 
108  TuneKey tuneKey() const {
109  std::stringstream vol, aux;
110  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
111  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
112  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
113  }
114 
115  long long flops() const { return 6ll*2*arg.threads*(3*198+3); }
116  long long bytes() const { return 6ll*4*2*arg.threads*arg.dataOr.Bytes(); }
117  };
118 
119  template<typename Float, typename Gauge>
120  void plaquette(const Gauge dataOr, const GaugeField& data, double2 &plq, QudaFieldLocation location) {
121  GaugePlaqArg<Gauge> arg(dataOr, data);
122  GaugePlaq<Float,Gauge> gaugePlaq(arg, location);
123  gaugePlaq.apply(0);
124 
125  comm_allreduce_array((double*) arg.result_h, 2);
126  arg.result_h[0].x /= 9.*(2*arg.threads*comm_size());
127  arg.result_h[0].y /= 9.*(2*arg.threads*comm_size());
128  plq.x = arg.result_h[0].x;
129  plq.y = arg.result_h[0].y;
130  }
131 
132  template<typename Float>
133  void plaquette(const GaugeField& data, double2 &plq, QudaFieldLocation location) {
134  INSTANTIATE_RECONSTRUCT(plaquette<Float>, data, plq, location);
135  }
136 #endif
137 
138  double3 plaquette(const GaugeField& data, QudaFieldLocation location) {
139 
140 #ifdef GPU_GAUGE_TOOLS
141  double2 plq;
142  INSTANTIATE_PRECISION(plaquette, data, plq, location);
143  double3 plaq = make_double3(0.5*(plq.x + plq.y), plq.x, plq.y);
144 #else
145  errorQuda("Gauge tools are not build");
146  double3 plaq = make_double3(0., 0., 0.);
147 #endif
148  return plaq;
149  }
150 
151 } // namespace quda
dim3 dim3 blockDim
double3 plaquette(const GaugeField &U, QudaFieldLocation location)
Definition: gauge_plaq.cu:138
double mu
Definition: test_util.cpp:1643
cudaStream_t stream
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296
static int R[4]
int E[4]
Definition: test_util.cpp:36
virtual TuneKey tuneKey() const =0
const int * R() const
int comm_size(void)
Definition: comm_mpi.cpp:126
virtual long long bytes() const
Definition: tune_quda.h:64
#define INSTANTIATE_PRECISION(func, lat,...)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305
enum QudaFieldLocation_s QudaFieldLocation
#define INSTANTIATE_RECONSTRUCT(func, g,...)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
virtual unsigned int minThreads() const
Definition: tune_quda.h:73
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
const int * X() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)