QUDA  0.9.0
gauge_ape.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <su3_project.cuh>
4 #include <tune_quda.h>
5 #include <gauge_field.h>
6 #include <gauge_field_order.h>
7 #include <index_helper.cuh>
8 
9 #define DOUBLE_TOL 1e-15
10 #define SINGLE_TOL 2e-6
11 
12 namespace quda {
13 
14 #ifdef GPU_GAUGE_TOOLS
15 
16  template <typename Float, typename GaugeOr, typename GaugeDs>
17  struct GaugeAPEArg {
18  int threads; // number of active threads required
19  int X[4]; // grid dimensions
20  int border[4];
21  GaugeOr origin;
22  const Float alpha;
23  const Float tolerance;
24 
25  GaugeDs dest;
26 
27  GaugeAPEArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float alpha, const Float tolerance)
28  : threads(1), origin(origin), dest(dest), alpha(alpha), tolerance(tolerance) {
29  for ( int dir = 0; dir < 4; ++dir ) {
30  border[dir] = data.R()[dir];
31  X[dir] = data.X()[dir] - border[dir] * 2;
32  threads *= X[dir];
33  }
34  threads /= 2;
35  }
36  };
37 
38 
39  template <typename Float, typename GaugeOr, typename GaugeDs, typename Float2>
40  __host__ __device__ void computeStaple(GaugeAPEArg<Float,GaugeOr,GaugeDs>& arg, int idx, int parity, int dir, Matrix<Float2,3> &staple) {
41 
42  typedef Matrix<complex<Float>,3> Link;
43  // compute spacetime dimensions and parity
44 
45  int X[4];
46  for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
47 
48  int x[4];
49  getCoords(x, idx, X, parity);
50  for(int dr=0; dr<4; ++dr) {
51  x[dr] += arg.border[dr];
52  X[dr] += 2*arg.border[dr];
53  }
54 
55  setZero(&staple);
56 
57  // I believe most users won't want to include time staples in smearing
58  for(int mu=0; mu<3; mu++) {
59 
60  //identify directions orthogonal to the link.
61  if(mu != dir) {
62 
63  int nu = dir;
64  {
65  int dx[4] = {0, 0, 0, 0};
66  Link U1, U2, U3, tmpS;
67 
68  //Get link U_{\mu}(x)
69  U1 = arg.origin(mu, linkIndexShift(x,dx,X), parity);
70 
71  dx[mu]++;
72  //Get link U_{\nu}(x+\mu)
73  U2 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);
74 
75  dx[mu]--;
76  dx[nu]++;
77  //Get link U_{\mu}(x+\nu)
78  U3 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);
79 
80  // staple += U_{\mu}(x) * U_{\nu}(x+\mu) * U^\dag_{\mu}(x+\nu)
81  staple = staple + U1 * U2 * conj(U3);
82 
83  dx[mu]--;
84  dx[nu]--;
85  //Get link U_{\mu}(x-\mu)
86  U1 = arg.origin(mu, linkIndexShift(x,dx,X), 1-parity);
87  //Get link U_{\nu}(x-\mu)
88  U2 = arg.origin(nu, linkIndexShift(x,dx,X), 1-parity);
89 
90  dx[nu]++;
91  //Get link U_{\mu}(x-\mu+\nu)
92  U3 = arg.origin(mu, linkIndexShift(x,dx,X), parity);
93 
94  // staple += U^\dag_{\mu}(x-\mu) * U_{\nu}(x-\mu) * U_{\mu}(x-\mu+\nu)
95  staple = staple + conj(U1) * U2 * U3;
96  }
97  }
98  }
99  }
100 
101  template<typename Float, typename GaugeOr, typename GaugeDs>
102  __global__ void computeAPEStep(GaugeAPEArg<Float,GaugeOr,GaugeDs> arg){
103 
104  int idx = threadIdx.x + blockIdx.x*blockDim.x;
105  int parity = threadIdx.y + blockIdx.y*blockDim.y;
106  int dir = threadIdx.z + blockIdx.z*blockDim.z;
107  if (idx >= arg.threads) return;
108  if (dir >= 3) return;
109  typedef complex<Float> Complex;
110  typedef Matrix<complex<Float>,3> Link;
111 
112  int X[4];
113  for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
114 
115  int x[4];
116  getCoords(x, idx, X, parity);
117  for(int dr=0; dr<4; ++dr) {
118  x[dr] += arg.border[dr];
119  X[dr] += 2*arg.border[dr];
120  }
121 
122  int dx[4] = {0, 0, 0, 0};
123  //Only spatial dimensions are smeared
124  {
125  Link U, S, TestU, I;
126 
127  computeStaple<Float,GaugeOr,GaugeDs,Complex>(arg,idx,parity,dir,S);
128 
129  // Get link U
130  U = arg.origin(dir, linkIndexShift(x,dx,X), parity);
131 
132  S = S * (arg.alpha/((Float) (2.*(3. - 1.))));
133  setIdentity(&I);
134 
135  TestU = I*(1.-arg.alpha) + S*conj(U);
136  polarSu3<Float>(TestU, arg.tolerance);
137  U = TestU*U;
138 
139  arg.dest(dir, linkIndexShift(x,dx,X), parity) = U;
140  }
141  }
142 
143  template<typename Float, typename GaugeOr, typename GaugeDs>
144  class GaugeAPE : TunableVectorYZ {
145  GaugeAPEArg<Float,GaugeOr,GaugeDs> arg;
146  const GaugeField &meta;
147 
148  private:
149  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
150  unsigned int minThreads() const { return arg.threads; }
151 
152  public:
153  // (2,3) --- 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
154  GaugeAPE(GaugeAPEArg<Float,GaugeOr, GaugeDs> &arg, const GaugeField &meta)
155  : TunableVectorYZ(2,3), arg(arg), meta(meta) {}
156  virtual ~GaugeAPE () {}
157 
158  void apply(const cudaStream_t &stream){
159  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
160  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
161  computeAPEStep<<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
162  } else {
163  errorQuda("CPU not supported yet\n");
164  //computeAPEStepCPU(arg);
165  }
166  }
167 
168  TuneKey tuneKey() const {
169  std::stringstream aux;
170  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
171  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
172  }
173 
174  long long flops() const { return 3*(2+2*4)*198ll*arg.threads; } // just counts matrix multiplication
175  long long bytes() const { return 3*((1+2*6)*arg.origin.Bytes()+arg.dest.Bytes())*arg.threads; }
176  }; // GaugeAPE
177 
178  template<typename Float,typename GaugeOr, typename GaugeDs>
179  void APEStep(GaugeOr origin, GaugeDs dest, const GaugeField& dataOr, Float alpha) {
180  GaugeAPEArg<Float,GaugeOr,GaugeDs> arg(origin, dest, dataOr, alpha, dataOr.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL);
181  GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(arg,dataOr);
182  gaugeAPE.apply(0);
184  }
185 
186  template<typename Float>
187  void APEStep(GaugeField &dataDs, const GaugeField& dataOr, Float alpha) {
188 
189  if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_NO) {
190  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GDs;
191 
192  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO) {
193  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
194  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
195  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
196  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
197  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
198  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
199  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
200  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
201  }else{
202  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
203  }
204  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_12){
205  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GDs;
206  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
207  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
208  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
209  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
210  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
211  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
212  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
213  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
214  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
215  }else{
216  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
217  }
218  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_8){
219  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GDs;
220  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
221  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type GOr;
222  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
223  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
224  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type GOr;
225  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
226  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
227  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type GOr;
228  APEStep(GOr(dataOr), GDs(dataDs), dataOr, alpha);
229  }else{
230  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
231  }
232  } else {
233  errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
234  }
235 
236  }
237 
238 #endif
239 
240  void APEStep(GaugeField &dataDs, const GaugeField& dataOr, double alpha) {
241 
242 #ifdef GPU_GAUGE_TOOLS
243 
244  if(dataOr.Precision() != dataDs.Precision()) {
245  errorQuda("Orign and destination fields must have the same precision\n");
246  }
247 
248  if(dataDs.Precision() == QUDA_HALF_PRECISION){
249  errorQuda("Half precision not supported\n");
250  }
251 
252  if (!dataOr.isNative())
253  errorQuda("Order %d with %d reconstruct not supported", dataOr.Order(), dataOr.Reconstruct());
254 
255  if (!dataDs.isNative())
256  errorQuda("Order %d with %d reconstruct not supported", dataDs.Order(), dataDs.Reconstruct());
257 
258  if (dataDs.Precision() == QUDA_SINGLE_PRECISION){
259  APEStep<float>(dataDs, dataOr, (float) alpha);
260  } else if(dataDs.Precision() == QUDA_DOUBLE_PRECISION) {
261  APEStep<double>(dataDs, dataOr, alpha);
262  } else {
263  errorQuda("Precision %d not supported", dataDs.Precision());
264  }
265  return;
266 #else
267  errorQuda("Gauge tools are not build");
268 #endif
269  }
270 
271 }
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:592
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
Definition: gauge_ape.cu:240
#define DOUBLE_TOL
Definition: gauge_ape.cu:9
#define SINGLE_TOL
Definition: gauge_ape.cu:10
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)