QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
gauge_ape.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 
7 #define DOUBLE_TOL 1e-15
8 #define SINGLE_TOL 2e-6
9 
10 namespace quda {
11 
12 #ifdef GPU_GAUGE_TOOLS
13 
14  template <typename Float, typename GaugeOr, typename GaugeDs>
15  struct GaugeAPEArg {
16  int threads; // number of active threads required
17  int X[4]; // grid dimensions
18 #ifdef MULTI_GPU
19  int border[4];
20 #endif
21  GaugeOr origin;
22  const Float alpha;
23  const Float tolerance;
24 
25  GaugeDs dest;
26 
27  GaugeAPEArg(GaugeOr &origin, GaugeDs &dest, const GaugeField &data, const Float alpha, const Float tolerance)
28  : origin(origin), dest(dest), alpha(alpha), tolerance(tolerance) {
29 #ifdef MULTI_GPU
30  for(int dir=0; dir<4; ++dir){
31  border[dir] = 2;
32  }
33  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir] - border[dir]*2;
34 #else
35  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
36 #endif
37  threads = X[0]*X[1]*X[2]*X[3];
38  }
39  };
40 
41 
42  __device__ __host__ inline int linkIndex2(int x[], int dx[], const int X[4]) {
43  int y[4];
44  for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
45  int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
46  return idx;
47  }
48 
49 
50  __device__ __host__ inline void getCoords2(int x[4], int cb_index, const int X[4], int parity)
51  {
52  x[3] = cb_index/(X[2]*X[1]*X[0]/2);
53  x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
54  x[1] = (cb_index/(X[0]/2)) % X[1];
55  x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
56 
57  return;
58  }
59 
60  template <typename Float2, typename Float>
61  __host__ __device__ int checkUnitary(Matrix<Float2,3> in, Matrix<Float2,3> *inv, const Float tol)
62  {
63  computeMatrixInverse(in, inv);
64 
65  for (int i=0;i<3;i++)
66  for (int j=0;j<3;j++)
67  {
68  if (fabs(in(i,j).x - (*inv)(j,i).x) > tol)
69  return 1;
70  if (fabs(in(i,j).y + (*inv)(j,i).y) > tol)
71  return 1;
72  }
73  return 0;
74  }
75 
  // Debug variant of checkUnitary: prints every element of `in` alongside
  // the corresponding element of its inverse, then tests unitarity
  // against a hard-coded 1e-14 tolerance.  Returns 1 on the first failing
  // element, 0 if the matrix passes.  The computed inverse is returned
  // through `inv` as a side effect.
  template <typename Float2>
  __host__ __device__ int checkUnitaryPrint(Matrix<Float2,3> in, Matrix<Float2,3> *inv)
  {
    computeMatrixInverse(in, inv);
    for (int i=0;i<3;i++)
    for (int j=0;j<3;j++)
    {
      // Real parts should match; imaginary parts should be opposite
      // (inverse of a unitary matrix equals its Hermitian conjugate).
      printf("TESTR: %+.3le %+.3le %+.3le\n", in(i,j).x, (*inv)(j,i).x, fabs(in(i,j).x - (*inv)(j,i).x));
      printf("TESTI: %+.3le %+.3le %+.3le\n", in(i,j).y, (*inv)(j,i).y, fabs(in(i,j).y + (*inv)(j,i).y));
      // NOTE(review): calling cudaDeviceSynchronize() from a
      // __host__ __device__ function is only meaningful host-side (or with
      // dynamic parallelism, where it is deprecated in newer CUDA) --
      // presumably here to flush the printf buffer; confirm intent.
      cudaDeviceSynchronize();
      if (fabs(in(i,j).x - (*inv)(j,i).x) > 1e-14)
        return 1;
      if (fabs(in(i,j).y + (*inv)(j,i).y) > 1e-14)
        return 1;
    }
    return 0;
  }
93 
94  template <typename Float2,typename Float>
95  __host__ __device__ void polarSu3(Matrix<Float2,3> *in, Float tol)
96  {
97  typedef typename ComplexTypeId<Float>::Type Cmplx;
98  Matrix<Cmplx,3> inv, out;
99 
100  out = *in;
101  computeMatrixInverse(out, &inv);
102 
103  do
104  {
105  out = out + conj(inv);
106  out = out*0.5;
107  } while(checkUnitary(out, &inv, tol));
108 /*
109  printf("Convergence after %d iterations\n", N);
110  cudaDeviceSynchronize();
111  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", out(0,0).x, out(0,0).y, out(0,1).x, out(0,1).y, out(0,2).x, out(0,2).y);
112  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", out(1,0).x, out(1,0).y, out(1,1).x, out(1,1).y, out(1,2).x, out(1,2).y);
113  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", out(2,0).x, out(2,0).y, out(2,1).x, out(2,1).y, out(2,2).x, out(2,2).y);
114  printf("\n\n");
115  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", inv(0,0).x, inv(0,0).y, inv(0,1).x, inv(0,1).y, inv(0,2).x, inv(0,2).y);
116  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", inv(1,0).x, inv(1,0).y, inv(1,1).x, inv(1,1).y, inv(1,2).x, inv(1,2).y);
117  printf("%+.3lf %+.3lfi %+.3lf %+.3lfi %+.3lf %+.3lfi\n", inv(2,0).x, inv(2,0).y, inv(2,1).x, inv(2,1).y, inv(2,2).x, inv(2,2).y);
118  printf("\n\n\n\n");
119  cudaDeviceSynchronize();
120 */
121  Cmplx det = getDeterminant(out);
122  double mod = det.x*det.x + det.y*det.y;
123  mod = pow(mod, (1./6.));
124  double angle = atan2(det.y, det.x);
125  angle /= -3.;
126 
127  Cmplx cTemp;
128 
129  cTemp.x = cos(angle)/mod;
130  cTemp.y = sin(angle)/mod;
131 
132 // out = out*cTemp;
133  *in = out*cTemp;
134 /* if (checkUnitary(out, &inv))
135  {
136  cTemp = getDeterminant(out);
137  printf ("DetX: %+.3lf %+.3lfi, %.3lf %.3lf\nDetN: %+.3lf %+.3lfi", det.x, det.y, mod, angle, cTemp.x, cTemp.y);
138  cudaDeviceSynchronize();
139  checkUnitaryPrint(out, &inv);
140  setIdentity(in);
141  *in = *in * 0.5;
142  }
143  else
144  {
145  cTemp = getDeterminant(out);
146 // printf("Det: %+.3lf %+.3lf\n", cTemp.x, cTemp.y);
147  cudaDeviceSynchronize();
148 
149  if (fabs(cTemp.x - 1.0) > 1e-8)
150  setIdentity(in);
151  else if (fabs(cTemp.y) > 1e-8)
152  {
153  setIdentity(in);
154  printf("DadadaUnitary failed\n");
155  *in = *in * 0.1;
156  }
157  else
158  *in = out;
159  }*/
160  }
161 
162 
  // Accumulate into `staple` the sum of the six staples around the link at
  // checkerboard site `idx` (parity `parity`) in direction `dir`: for each
  // mu != dir, the forward staple U_mu(x) U_dir(x+mu) U_mu(x+dir)^dagger
  // and the backward staple U_mu(x-mu)^dagger U_dir(x-mu) U_mu(x-mu+dir).
  template <typename Float, typename GaugeOr, typename GaugeDs, typename Float2>
  __host__ __device__ void computeStaple(GaugeAPEArg<Float,GaugeOr,GaugeDs>& arg, int idx, int parity, int dir, Matrix<Float2,3> &staple) {

    typedef typename ComplexTypeId<Float>::Type Cmplx;
    // compute spacetime dimensions and parity

    int X[4];
    for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];

    int x[4];
    getCoords2(x, idx, X, parity);
#ifdef MULTI_GPU
    // Shift the coordinates into the extended (haloed) volume and use the
    // extended dimensions for all neighbour indexing below.
    for(int dr=0; dr<4; ++dr) {
      x[dr] += arg.border[dr];
      X[dr] += 2*arg.border[dr];
    }
#endif

    setZero(&staple);

    for (int mu=0; mu<4; mu++) {
      if (mu == dir) {
        continue;
      }

      int nu = dir;

      {
        int dx[4] = {0, 0, 0, 0};
        // Forward staple: U_mu(x) * U_nu(x+mu) * U_mu(x+nu)^dagger.
        // Each unit displacement flips the site parity, hence 1-parity
        // for the shifted links.
        Matrix<Cmplx,3> U1;
        arg.origin.load((Float*)(U1.data),linkIndex2(x,dx,X), mu, parity);

        Matrix<Cmplx,3> U2;
        dx[mu]++;
        arg.origin.load((Float*)(U2.data),linkIndex2(x,dx,X), nu, 1-parity);

        Matrix<Cmplx,3> U3;
        dx[mu]--;
        dx[nu]++;
        arg.origin.load((Float*)(U3.data),linkIndex2(x,dx,X), mu, 1-parity);

        Matrix<Cmplx,3> tmpS;

        tmpS = U1 * U2;
        tmpS = tmpS * conj(U3);

        staple = staple + tmpS;

        // Backward staple: U_mu(x-mu)^dagger * U_nu(x-mu) * U_mu(x-mu+nu).
        // After the decrements dx = -mu, so U1/U2 sit at the backward
        // neighbour (parity flipped); U3 sits at x-mu+nu (parity restored).
        dx[mu]--;
        dx[nu]--;
        arg.origin.load((Float*)(U1.data),linkIndex2(x,dx,X), mu, 1-parity);
        arg.origin.load((Float*)(U2.data),linkIndex2(x,dx,X), nu, 1-parity);

        dx[nu]++;
        arg.origin.load((Float*)(U3.data),linkIndex2(x,dx,X), mu, parity);

        tmpS = conj(U1);
        tmpS = tmpS * U2;
        tmpS = tmpS * U3;

        staple = staple + tmpS;
      }
    }
  }
227 
  // One APE smearing sweep.  For every spatial link U_dir(x) the kernel
  // forms U' = (1-alpha) * U + (alpha/6) * (sum of staples), projects U'
  // back onto SU(3) with polarSu3, and stores the result in the
  // destination field.  Launched with one thread per lattice site; the
  // flat thread index encodes both parity and the checkerboard site index.
  template<typename Float, typename GaugeOr, typename GaugeDs>
  __global__ void computeAPEStep(GaugeAPEArg<Float,GaugeOr,GaugeDs> arg){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if(idx >= arg.threads) return;
    typedef typename ComplexTypeId<Float>::Type Cmplx;

    // Threads [0, threads/2) handle even-parity sites, the rest odd.
    int parity = 0;
    if(idx >= arg.threads/2) {
      parity = 1;
      idx -= arg.threads/2;
    }

    int X[4];
    for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];

    int x[4];
    getCoords2(x, idx, X, parity);
#ifdef MULTI_GPU
    // Shift into the extended (haloed) volume for neighbour addressing.
    for(int dr=0; dr<4; ++dr) {
      x[dr] += arg.border[dr];
      X[dr] += 2*arg.border[dr];
    }
#endif

    int dx[4] = {0, 0, 0, 0};
    for (int dir=0; dir < 3; dir++) { //Only spatial dimensions are smeared
      Matrix<Cmplx,3> U, S;

      computeStaple<Float,GaugeOr,GaugeDs,Cmplx>(arg,idx,parity,dir,S);

      arg.origin.load((Float*)(U.data),linkIndex2(x,dx,X), dir, parity);

      // APE blend: 6 staples per link (3 orthogonal directions, forward
      // and backward), hence the alpha/6 normalisation.
      U = U * (1. - arg.alpha);
      S = S * (arg.alpha/6.);

      U = U + S;

      // Project the smeared link back onto SU(3) before saving.
      polarSu3<Cmplx,Float>(&U, arg.tolerance);
      arg.dest.save((Float*)(U.data),linkIndex2(x,dx,X), dir, parity);
    }
  }
269 
270  template<typename Float, typename GaugeOr, typename GaugeDs>
271  class GaugeAPE : Tunable {
272  GaugeAPEArg<Float,GaugeOr,GaugeDs> arg;
274 
275  private:
276  unsigned int sharedBytesPerThread() const { return 0; }
277  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
278 
279  bool tuneSharedBytes() const { return false; } // Don't tune shared memory
280  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
281  unsigned int minThreads() const { return arg.threads; }
282 
283  public:
284  GaugeAPE(GaugeAPEArg<Float,GaugeOr, GaugeDs> &arg, QudaFieldLocation location)
285  : arg(arg), location(location) {}
286  virtual ~GaugeAPE () {}
287 
288  void apply(const cudaStream_t &stream){
289  if(location == QUDA_CUDA_FIELD_LOCATION){
290 #if (__COMPUTE_CAPABILITY__ >= 200)
291  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
292  computeAPEStep<<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
293 #else
294  errorQuda("GaugeAPE not supported on pre-Fermi architecture");
295 #endif
296  }else{
297  errorQuda("CPU not supported yet\n");
298  //computeAPEStepCPU(arg);
299  }
300  }
301 
302  TuneKey tuneKey() const {
303  std::stringstream vol, aux;
304  vol << arg.X[0] << "x";
305  vol << arg.X[1] << "x";
306  vol << arg.X[2] << "x";
307  vol << arg.X[3];
308  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
309  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
310  }
311 
312 
313  std::string paramString(const TuneParam &param) const {
314  std::stringstream ps;
315  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << ")";
316  ps << "shared=" << param.shared_bytes;
317  return ps.str();
318  }
319 
320  void preTune(){}
321  void postTune(){}
322  long long flops() const { return (1)*6*arg.threads; }
323  long long bytes() const { return (1)*6*arg.threads*sizeof(Float); } // Only correct if there is no link reconstruction
324 
325  }; // GaugeAPE
326 
327  template<typename Float,typename GaugeOr, typename GaugeDs>
328  void APEStep(GaugeOr origin, GaugeDs dest, const GaugeField& dataOr, Float alpha, QudaFieldLocation location) {
329  if (dataOr.Precision() == QUDA_DOUBLE_PRECISION) {
330  GaugeAPEArg<Float,GaugeOr,GaugeDs> arg(origin, dest, dataOr, alpha, DOUBLE_TOL);
331  GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(arg, location);
332  gaugeAPE.apply(0);
333  } else {
334  GaugeAPEArg<Float,GaugeOr,GaugeDs> arg(origin, dest, dataOr, alpha, SINGLE_TOL);
335  GaugeAPE<Float,GaugeOr,GaugeDs> gaugeAPE(arg, location);
336  gaugeAPE.apply(0);
337  }
338  cudaDeviceSynchronize();
339  }
340 
341  template<typename Float>
342  void APEStep(GaugeField &dataDs, const GaugeField& dataOr, Float alpha, QudaFieldLocation location) {
343 
344  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
345  // Need to fix this!!
346 
347  if(dataDs.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
348  if(dataOr.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
349  if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_NO) {
350  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO) {
351  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
352  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
353  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
354  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
355  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
356  }else{
357  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
358  }
359  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_12){
360  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
361  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
362  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
363  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
364  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
365  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
366  }else{
367  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
368  }
369  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_8){
370  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
371  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
372  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
373  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
374  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
375  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
376  }else{
377  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
378  }
379  } else {
380  errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
381  }
382  } else if(dataOr.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
383  if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_NO) {
384  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO) {
385  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
386  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
387  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
388  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
389  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 18>(dataDs), dataOr, alpha, location);
390  }else{
391  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
392  }
393  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_12){
394  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
395  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
396  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
397  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
398  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
399  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 12>(dataDs), dataOr, alpha, location);
400  }else{
401  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
402  }
403  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_8){
404  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
405  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
406  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
407  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
408  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
409  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 2, 8>(dataDs), dataOr, alpha, location);
410  }else{
411  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
412  }
413  } else {
414  errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
415  }
416  } else {
417  errorQuda("Invalid Gauge Order origin field\n");
418  }
419  } else if(dataDs.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
420  if(dataOr.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
421  if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_NO) {
422  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO) {
423  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
424  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
425  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
426  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
427  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
428  }else{
429  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
430  }
431  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_12){
432  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
433  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
434  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
435  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
436  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
437  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
438  }else{
439  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
440  }
441  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_8){
442  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
443  APEStep(FloatNOrder<Float, 18, 2, 18>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
444  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
445  APEStep(FloatNOrder<Float, 18, 2, 12>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
446  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
447  APEStep(FloatNOrder<Float, 18, 2, 8>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
448  }else{
449  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
450  }
451  } else {
452  errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
453  }
454  } else if(dataOr.Order() == QUDA_FLOAT4_GAUGE_ORDER) {
455  if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_NO) {
456  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO) {
457  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
458  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
459  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
460  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
461  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 18>(dataDs), dataOr, alpha, location);
462  }else{
463  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
464  }
465  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_12){
466  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
467  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
468  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
469  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
470  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
471  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 12>(dataDs), dataOr, alpha, location);
472  }else{
473  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
474  }
475  } else if(dataDs.Reconstruct() == QUDA_RECONSTRUCT_8){
476  if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_NO){
477  APEStep(FloatNOrder<Float, 18, 4, 18>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
478  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_12){
479  APEStep(FloatNOrder<Float, 18, 4, 12>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
480  }else if(dataOr.Reconstruct() == QUDA_RECONSTRUCT_8){
481  APEStep(FloatNOrder<Float, 18, 4, 8>(dataOr), FloatNOrder<Float, 18, 4, 8>(dataDs), dataOr, alpha, location);
482  }else{
483  errorQuda("Reconstruction type %d of origin gauge field not supported", dataOr.Reconstruct());
484  }
485  } else {
486  errorQuda("Reconstruction type %d of destination gauge field not supported", dataDs.Reconstruct());
487  }
488  } else {
489  errorQuda("Invalid Gauge Order origin field\n");
490  }
491  } else {
492  errorQuda("Invalid Gauge Order destination field\n");
493  }
494  }
495 #endif
496 
497  void APEStep(GaugeField &dataDs, const GaugeField& dataOr, double alpha, QudaFieldLocation location) {
498 
499 #ifdef GPU_GAUGE_TOOLS
500 
501  if(dataOr.Precision() != dataDs.Precision()) {
502  errorQuda("Oriign and destination fields must have the same precision\n");
503  }
504 
505  if(dataDs.Precision() == QUDA_HALF_PRECISION){
506  errorQuda("Half precision not supported\n");
507  }
508 
509  if (dataDs.Precision() == QUDA_SINGLE_PRECISION){
510  APEStep<float>(dataDs, dataOr, (float) alpha, location);
511  } else if(dataDs.Precision() == QUDA_DOUBLE_PRECISION) {
512  APEStep<double>(dataDs, dataOr, alpha, location);
513  } else {
514  errorQuda("Precision %d not supported", dataDs.Precision());
515  }
516  return;
517 #else
518  errorQuda("Gauge tools are not build");
519 #endif
520  }
521 
522 
523 }
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:640
int y[4]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
cudaStream_t * stream
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099
::std::string string
Definition: gtest.h:1979
__device__ __host__ T getDeterminant(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:385
QudaGaugeParam param
Definition: pack_test.cpp:17
QudaPrecision Precision() const
const QudaFieldLocation location
Definition: pack_test.cpp:46
#define DOUBLE_TOL
Definition: gauge_ape.cu:7
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40
FloatingPoint< float > Float
Definition: gtest.h:7350
cpuColorSpinorField * in
#define SINGLE_TOL
Definition: gauge_ape.cu:8
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
Definition: complex_quda.h:65
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
int x[4]
int dx[4]
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha, QudaFieldLocation location)
Definition: gauge_ape.cu:497
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
__device__ __host__ void computeMatrixInverse(const Matrix< T, 3 > &u, Matrix< T, 3 > *uinv)
Definition: quda_matrix.h:555
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Definition: util_quda.cpp:32
const QudaParity parity
Definition: dslash_test.cpp:29