QUDA  v0.7.0
A library for QCD on GPUs
gauge_plaq.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <cub/cub.cuh>
#include <launch_kernel.cuh>

namespace quda {

#ifdef GPU_GAUGE_TOOLS

// template <typename Float, typename Gauge>
  template <typename Gauge>
  struct GaugePlaqArg {
    int threads;     // number of active threads required
    int X[4];        // grid dimensions
#ifdef MULTI_GPU
    int border[4];
#endif
    Gauge dataOr;
    double *plaq;    // device-side view of the accumulator
    double *plaq_h;  // pinned host buffer that receives the result

    GaugePlaqArg(const Gauge &dataOr, const GaugeField &data)
      : dataOr(dataOr), plaq_h(static_cast<double*>(pinned_malloc(sizeof(double)))) {
#ifdef MULTI_GPU
      for (int dir=0; dir<4; ++dir) {
        border[dir] = 2;
      }

      for (int dir=0; dir<4; ++dir) X[dir] = data.X()[dir] - border[dir]*2;
#else
      for (int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
#endif
      threads = X[0]*X[1]*X[2]*X[3];
/*    if ((cudaMallocHost(&plaq_h, sizeof(double))) == cudaErrorMemoryAllocation)
        errorQuda ("Error allocating memory for plaquette.\n");
      if ((cudaMalloc(&plaq, sizeof(double))) == cudaErrorMemoryAllocation)
        errorQuda ("Error allocating memory for plaquette.\n");
*/
      cudaHostGetDevicePointer(&plaq, plaq_h, 0);
    }
  };
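  // Note: plaq_h is pinned host memory, and cudaHostGetDevicePointer() exposes
  // the same buffer to the device (zero-copy mapping), so the kernel's atomic
  // sums land directly in host-visible memory and no explicit cudaMemcpy is
  // needed afterwards. This relies on the allocation being mapped into the
  // device address space, which is automatic on UVA-capable systems.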

  static __inline__ __device__ double atomicAdd(double *addr, double val)
  {
    double old = *addr, assumed;

    do {
      assumed = old;
      old = __longlong_as_double( atomicCAS((unsigned long long int*)addr,
                                            __double_as_longlong(assumed),
                                            __double_as_longlong(val+assumed)));
    } while ( __double_as_longlong(assumed) != __double_as_longlong(old) );

    return old;
  }
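  // The compare-and-swap loop above emulates atomicAdd for doubles, which
  // CUDA only provides natively on devices of compute capability 6.0 and
  // higher; the architectures targeted here predate that.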

  // checkerboard index of the site x+dx, with periodic wrapping in each direction
  __device__ __host__ inline int linkIndex3(int x[], int dx[], const int X[4]) {
    int y[4];
    for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
    int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
    return idx;
  }


  // full lattice coordinates x[] of the site with the given checkerboard index and parity
  __device__ __host__ inline void getCoords3(int x[4], int cb_index, const int X[4], int parity)
  {
    x[3] = cb_index/(X[2]*X[1]*X[0]/2);
    x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
    x[1] = (cb_index/(X[0]/2)) % X[1];
    x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);

    return;
  }
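  // Worked example of the checkerboard convention used by the two helpers
  // above: on a 4^4 lattice, the site x = (1,0,0,0) has parity 1 and
  // checkerboard index (((0*4+0)*4+0)*4+1) >> 1 = 0, and getCoords3 with
  // cb_index = 0, parity = 1 recovers x = (1,0,0,0).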

  template<int blockSize, typename Float, typename Gauge>
  __global__ void computePlaq(GaugePlaqArg<Gauge> arg){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    typedef typename ComplexTypeId<Float>::Type Cmplx;

    double plaq = 0.;

    // Guard with a branch rather than an early return: every thread in the
    // block, including those of a partially filled last block, must reach
    // the CUB block reduction below.
    if (idx < arg.threads) {
      int parity = 0;
      if (idx >= arg.threads/2) {
        parity = 1;
        idx -= arg.threads/2;
      }

      int X[4];
      for (int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];

      int x[4];
      getCoords3(x, idx, X, parity);
#ifdef MULTI_GPU
      for (int dr=0; dr<4; ++dr) {
        x[dr] += arg.border[dr];
        X[dr] += 2*arg.border[dr];
      }
#endif

      int dx[4] = {0, 0, 0, 0};
      for (int mu = 0; mu < 3; mu++) {
        for (int nu = (mu+1); nu < 4; nu++) {
          Matrix<Cmplx,3> U1, U2, U3, U4, tmpM;

          // load the four links of the mu-nu plaquette:
          // U_mu(x), U_nu(x+mu), U_mu(x+nu), U_nu(x)
          arg.dataOr.load((Float*)(U1.data), linkIndex3(x,dx,X), mu, parity);
          dx[mu]++;
          arg.dataOr.load((Float*)(U2.data), linkIndex3(x,dx,X), nu, 1-parity);
          dx[mu]--;
          dx[nu]++;
          arg.dataOr.load((Float*)(U3.data), linkIndex3(x,dx,X), mu, 1-parity);
          dx[nu]--;
          arg.dataOr.load((Float*)(U4.data), linkIndex3(x,dx,X), nu, parity);

          // accumulate Re tr( U_mu(x) U_nu(x+mu) U_mu(x+nu)^dag U_nu(x)^dag )
          tmpM = U1 * U2;
          tmpM = tmpM * conj(U3);
          tmpM = tmpM * conj(U4);

          plaq += getTrace(tmpM).x;
        }
      }
    }

    // block-level reduction, then a single atomic per block into the accumulator
    typedef cub::BlockReduce<double, blockSize> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    double aggregate = BlockReduce(temp_storage).Sum(plaq);

    if (threadIdx.x == 0) atomicAdd((double *) arg.plaq, aggregate);
  }
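  // Note (an assumption about launch_kernel.cuh): the LAUNCH_KERNEL macro used
  // in GaugePlaq::apply() below switches on the tuned block size and launches
  // the matching computePlaq<blockSize,...> instantiation, since
  // cub::BlockReduce requires the block size as a compile-time constant.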

  template<typename Float, typename Gauge>
  class GaugePlaq : Tunable {
    GaugePlaqArg<Gauge> arg;
    const QudaFieldLocation location;

    private:
    unsigned int sharedBytesPerThread() const { return sizeof(Float); }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneSharedBytes() const { return false; } // Don't tune shared memory
    bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

    public:
    GaugePlaq(GaugePlaqArg<Gauge> &arg, QudaFieldLocation location)
      : arg(arg), location(location) {}
    ~GaugePlaq () { host_free(arg.plaq_h); }

    void apply(const cudaStream_t &stream){
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

        // reset the zero-copy accumulator before the kernel sums into it
        ((double *) arg.plaq_h)[0] = 0.;

        LAUNCH_KERNEL(computePlaq, tp, stream, arg, Float, Gauge);

//      cudaMemcpy(arg.plaq_h, arg.plaq, sizeof(double), cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();

#ifdef MULTI_GPU
        comm_allreduce((double*) arg.plaq_h);
        const int nNodes = comm_dim(0)*comm_dim(1)*comm_dim(2)*comm_dim(3);
        // normalize: 6 planes per site, and Re tr = 3 on a unit plaquette
        ((double *) arg.plaq_h)[0] /= 18.*(arg.threads*nNodes);
#else
        ((double *) arg.plaq_h)[0] /= 18.*arg.threads;
#endif
      } else {
        errorQuda("CPU not supported yet\n");
        //computePlaqCPU(arg);
      }
    }

    TuneKey tuneKey() const {
      std::stringstream vol, aux;
      vol << arg.X[0] << "x";
      vol << arg.X[1] << "x";
      vol << arg.X[2] << "x";
      vol << arg.X[3];
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
    }

    std::string paramString(const TuneParam &param) const {
      std::stringstream ps;
      ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << ")";
      ps << "shared=" << param.shared_bytes;
      return ps.str();
    }

    void preTune(){}
    void postTune(){}
    long long flops() const { return (1)*6*arg.threads; }
    long long bytes() const { return (1)*6*arg.threads*sizeof(Float); } // Only correct if there is no link reconstruction

  };

  template<typename Float, typename Gauge>
  void plaquette(const Gauge dataOr, const GaugeField& data, QudaFieldLocation location, Float &plq) {
    GaugePlaqArg<Gauge> arg(dataOr, data);
    GaugePlaq<Float,Gauge> gaugePlaq(arg, location);
    gaugePlaq.apply(0);
    cudaDeviceSynchronize();
    plq = ((double *) arg.plaq_h)[0];
  }

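  // The FloatNOrder<Float, length, N, reconstruct> instantiations below pick
  // the internal layout of the gauge field: length = 18 reals per link (a 3x3
  // complex matrix), N = 2 or 4 reals per vectorized load to match the
  // FLOAT2/FLOAT4 orders, and reconstruct = 18, 12 or 8 for the number of
  // reals actually stored per link.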
  template<typename Float>
  Float plaquette(const GaugeField& data, QudaFieldLocation location) {

    // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
    // Need to fix this!!

    Float res;

    if (data.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      if (data.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        plaquette(FloatNOrder<Float, 18, 2, 18>(data), data, location, res);
      } else if (data.Reconstruct() == QUDA_RECONSTRUCT_12) {
        plaquette(FloatNOrder<Float, 18, 2, 12>(data), data, location, res);
      } else if (data.Reconstruct() == QUDA_RECONSTRUCT_8) {
        plaquette(FloatNOrder<Float, 18, 2, 8>(data), data, location, res);
      } else {
        errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
      }
    } else if (data.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
      if (data.Reconstruct() == QUDA_RECONSTRUCT_NO) {
        plaquette(FloatNOrder<Float, 18, 4, 18>(data), data, location, res);
      } else if (data.Reconstruct() == QUDA_RECONSTRUCT_12) {
        plaquette(FloatNOrder<Float, 18, 4, 12>(data), data, location, res);
      } else if (data.Reconstruct() == QUDA_RECONSTRUCT_8) {
        plaquette(FloatNOrder<Float, 18, 4, 8>(data), data, location, res);
      } else {
        errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
      }
    } else {
      errorQuda("Invalid gauge order\n");
    }

    return res;
  }
#endif

  double plaquette(const GaugeField& data, QudaFieldLocation location) {

#ifdef GPU_GAUGE_TOOLS
    if (data.Precision() == QUDA_HALF_PRECISION) {
      errorQuda("Half precision not supported\n");
    }
    if (data.Precision() == QUDA_SINGLE_PRECISION) {
      return plaquette<float> (data, location);
    } else if (data.Precision() == QUDA_DOUBLE_PRECISION) {
      return plaquette<double>(data, location);
    } else {
      errorQuda("Precision %d not supported", data.Precision());
    }
#else
    errorQuda("Gauge tools have not been built");
#endif

    return 0.;  // unreachable: errorQuda() aborts, but this silences missing-return warnings
  }
}
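A minimal host-side usage sketch (illustrative only, not part of this file; the helper name measurePlaquette is hypothetical, and a cudaGaugeField is assumed to have been created and loaded already):

  #include <gauge_field.h>

  double measurePlaquette(const quda::cudaGaugeField &gauge) {
    // the result is averaged over sites, planes and colors,
    // normalized so that a unit gauge field gives exactly 1
    return quda::plaquette(gauge, QUDA_CUDA_FIELD_LOCATION);
  }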