QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
llfat_quda.cu
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <cuda_runtime.h>
3 #include <cuda.h>
4 
5 #include <quda_internal.h>
6 #include <gauge_field.h>
7 #include <llfat_quda.h>
8 #include <index_helper.cuh>
9 #include <gauge_field_order.h>
10 #include <fast_intdiv.h>
11 #include <tune_quda.h>
12 
13 #define MIN_COEFF 1e-7
14 
15 namespace quda {
16 
17 #ifdef GPU_FATLINK
18 
19  template <typename Float, typename Link, typename Gauge>
20  struct LinkArg {
21  unsigned int threads;
22 
23  int_fastdiv X[4];
24  int_fastdiv E[4];
25  int border[4];
26 
31  int odd_bit;
32 
33  Gauge u;
34  Link link;
35  Float coeff;
36 
37  LinkArg(Link link, Gauge u, Float coeff, const GaugeField &link_meta, const GaugeField &u_meta)
38  : threads(link_meta.VolumeCB()), link(link), u(u), coeff(coeff)
39  {
40  for (int d=0; d<4; d++) {
41  X[d] = link_meta.X()[d];
42  E[d] = u_meta.X()[d];
43  border[d] = (E[d] - X[d]) / 2;
44  }
45  }
46  };
47 
48  template <typename Float, int dir, typename Arg>
49  __device__ void longLinkDir(Arg &arg, int idx, int parity) {
50  int x[4];
51  int dx[4] = {0, 0, 0, 0};
52 
53  int *y = arg.u.coords;
54  getCoords(x, idx, arg.X, parity);
55  for (int d=0; d<4; d++) x[d] += arg.border[d];
56 
57  typedef Matrix<complex<Float>,3> Link;
58 
59  Link a = arg.u(dir, linkIndex(y, x, arg.E), parity);
60 
61  dx[dir]++;
62  Link b = arg.u(dir, linkIndexShift(y, x, dx, arg.E), 1-parity);
63 
64  dx[dir]++;
65  Link c = arg.u(dir, linkIndexShift(y, x, dx, arg.E), parity);
66  dx[dir]-=2;
67 
68  arg.link(dir, idx, parity) = arg.coeff * a * b * c;
69  }
70 
71  template <typename Float, typename Arg>
72  __global__ void computeLongLink(Arg arg) {
73 
74  int idx = blockIdx.x*blockDim.x + threadIdx.x;
75  int parity = blockIdx.y*blockDim.y + threadIdx.y;
76  int dir = blockIdx.z*blockDim.z + threadIdx.z;
77  if (idx >= arg.threads) return;
78  if (dir >= 4) return;
79 
80  switch(dir) {
81  case 0: longLinkDir<Float, 0>(arg, idx, parity); break;
82  case 1: longLinkDir<Float, 1>(arg, idx, parity); break;
83  case 2: longLinkDir<Float, 2>(arg, idx, parity); break;
84  case 3: longLinkDir<Float, 3>(arg, idx, parity); break;
85  }
86  return;
87  }
88 
89  template <typename Float, typename Arg>
90  class LongLink : public TunableVectorYZ {
91  Arg &arg;
92  const GaugeField &meta;
93  unsigned int minThreads() const { return arg.threads; }
94  bool tuneGridDim() const { return false; }
95 
96  public:
97  LongLink(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2,4), arg(arg), meta(meta) {}
98  virtual ~LongLink() {}
99 
100  void apply(const cudaStream_t &stream) {
101  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
102  computeLongLink<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
103  }
104 
105  TuneKey tuneKey() const {
106  std::stringstream aux;
107  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
108  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
109  }
110 
111  long long flops() const { return 2*4*arg.threads*198; }
112  long long bytes() const { return 2*4*arg.threads*(3*arg.u.Bytes()+arg.link.Bytes()); }
113  };
114 
115  void computeLongLink(GaugeField &lng, const GaugeField &u, double coeff)
116  {
117  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
118  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
119  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
120  typedef LinkArg<double,L,L> Arg;
121  Arg arg(L(lng), L(u), coeff, lng, u);
122  LongLink<double,Arg> longLink(arg,lng);
123  longLink.apply(0);
124  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
125  if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) {
126  typedef typename gauge_mapper<double, QUDA_RECONSTRUCT_12, 18, QUDA_STAGGERED_PHASE_MILC>::type G;
127  typedef LinkArg<double, L, G> Arg;
128  Arg arg(L(lng), G(u), coeff, lng, u);
129  LongLink<double, Arg> longLink(arg, lng);
130  longLink.apply(0);
131  } else {
132  errorQuda("Staggered phase type %d not supported", u.StaggeredPhase());
133  }
134  } else {
135  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
136  }
137  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
138  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
139  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
140  typedef LinkArg<float,L,L> Arg;
141  Arg arg(L(lng), L(u), coeff, lng, u) ;
142  LongLink<float,Arg> longLink(arg,lng);
143  longLink.apply(0);
144  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
145  if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) {
146  typedef typename gauge_mapper<float, QUDA_RECONSTRUCT_12, 18, QUDA_STAGGERED_PHASE_MILC>::type G;
147  typedef LinkArg<float, L, G> Arg;
148  Arg arg(L(lng), G(u), coeff, lng, u);
149  LongLink<float, Arg> longLink(arg, lng);
150  longLink.apply(0);
151  } else {
152  errorQuda("Staggered phase type %d not supported", u.StaggeredPhase());
153  }
154  } else {
155  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
156  }
157  } else {
158  errorQuda("Unsupported precision %d\n", u.Precision());
159  }
160  return;
161  }
162 
163  template <typename Float, typename Arg>
164  __global__ void computeOneLink(Arg arg) {
165 
166  int idx = blockIdx.x*blockDim.x + threadIdx.x;
167  int parity = blockIdx.y * blockDim.y + threadIdx.y;
168  int dir = blockIdx.z * blockDim.z + threadIdx.z;
169  if (idx >= arg.threads) return;
170  if (dir >= 4) return;
171 
172  int *x = arg.u.coords;
173  getCoords(x, idx, arg.X, parity);
174  for (int d=0; d<4; d++) x[d] += arg.border[d];
175 
176  typedef Matrix<complex<Float>,3> Link;
177 
178  Link a = arg.u(dir, linkIndex(x,x,arg.E), parity);
179 
180  arg.link(dir, idx, parity) = arg.coeff*a;
181 
182  return;
183  }
184 
185  template <typename Float, typename Arg>
186  class OneLink : public TunableVectorYZ {
187  Arg &arg;
188  const GaugeField &meta;
189  unsigned int minThreads() const { return arg.threads; }
190  bool tuneGridDim() const { return false; }
191 
192  public:
193  OneLink(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2,4), arg(arg), meta(meta) {}
194  virtual ~OneLink() {}
195 
196  void apply(const cudaStream_t &stream) {
197  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
198  computeOneLink<Float><<<tp.grid,tp.block>>>(arg);
199  }
200 
201  TuneKey tuneKey() const {
202  std::stringstream aux;
203  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
204  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
205  }
206 
207  long long flops() const { return 2*4*arg.threads*18; }
208  long long bytes() const { return 2*4*arg.threads*(arg.u.Bytes()+arg.link.Bytes()); }
209  };
210 
211  void computeOneLink(GaugeField &fat, const GaugeField &u, double coeff)
212  {
213  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
214  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
215  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
216  typedef LinkArg<double,L,L> Arg;
217  Arg arg(L(fat), L(u), coeff, fat, u);
218  OneLink<double,Arg> oneLink(arg,fat);
219  oneLink.apply(0);
220  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
221  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
222  typedef LinkArg<double,L,G> Arg;
223  Arg arg(L(fat), G(u), coeff, fat, u);
224  OneLink<double,Arg> oneLink(arg,fat);
225  oneLink.apply(0);
226  } else {
227  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
228  }
229  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
230  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
231  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
232  typedef LinkArg<float,L,L> Arg;
233  Arg arg(L(fat), L(u), coeff, fat, u);
234  OneLink<float,Arg> oneLink(arg,fat);
235  oneLink.apply(0);
236  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
237  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
238  typedef LinkArg<float,L,G> Arg;
239  Arg arg(L(fat), G(u), coeff, fat, u);
240  OneLink<float,Arg> oneLink(arg,fat);
241  oneLink.apply(0);
242  } else {
243  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
244  }
245  } else {
246  errorQuda("Unsupported precision %d\n", u.Precision());
247  }
248  return;
249  }
250 
251  template <typename Float, typename Fat, typename Staple, typename Mulink, typename Gauge>
252  struct StapleArg {
253  unsigned int threads;
254 
255  int_fastdiv X[4];
256  int_fastdiv E[4];
257  int border[4];
258 
259  int_fastdiv inner_X[4];
260  int inner_border[4];
261 
266  int odd_bit;
267 
268  Gauge u;
269  Fat fat;
270  Staple staple;
271  Mulink mulink;
272  Float coeff;
273 
274  int n_mu;
275  int mu_map[4];
276 
277  StapleArg(Fat fat, Staple staple, Mulink mulink, Gauge u, Float coeff,
278  const GaugeField &fat_meta, const GaugeField &u_meta)
279  : threads(1), fat(fat), staple(staple), mulink(mulink), u(u), coeff(coeff),
280  odd_bit( (commDimPartitioned(0)+commDimPartitioned(1) +
282  for (int d=0; d<4; d++) {
283  X[d] = (fat_meta.X()[d] + u_meta.X()[d]) / 2;
284  E[d] = u_meta.X()[d];
285  border[d] = (E[d] - X[d]) / 2;
286  threads *= X[d];
287 
288  inner_X[d] = fat_meta.X()[d];
289  inner_border[d] = (E[d] - inner_X[d]) / 2;
290  }
291  threads /= 2; // account for parity in y dimension
292  }
293  };
294 
295  template<typename Float, int mu, int nu, typename Arg>
296  __device__ inline void computeStaple(Matrix<complex<Float>,3> &staple, Arg &arg, int x[], int parity) {
297  typedef Matrix<complex<Float>,3> Link;
298  int *y = arg.u.coords, *y_mu = arg.mulink.coords, dx[4] = {0, 0, 0, 0};
299 
300  /* Computes the upper staple :
301  * mu (B)
302  * +-------+
303  * nu | |
304  * (A) | |(C)
305  * X X
306  */
307  {
308  /* load matrix A*/
309  Link a = arg.u(nu, linkIndex(y, x, arg.E), parity);
310 
311  /* load matrix B*/
312  dx[nu]++;
313  Link b = arg.mulink(mu, linkIndexShift(y_mu, x, dx, arg.E), 1-parity);
314  dx[nu]--;
315 
316  /* load matrix C*/
317  dx[mu]++;
318  Link c = arg.u(nu, linkIndexShift(y, x, dx, arg.E), 1-parity);
319  dx[mu]--;
320 
321  staple = a * b * conj(c);
322  }
323 
324  /* Computes the lower staple :
325  * X X
326  * nu | |
327  * (A) | | (C)
328  * +-------+
329  * mu (B)
330  */
331  {
332  /* load matrix A*/
333  dx[nu]--;
334  Link a = arg.u(nu, linkIndexShift(y, x, dx, arg.E), 1-parity);
335 
336  /* load matrix B*/
337  Link b = arg.mulink(mu, linkIndexShift(y_mu, x, dx, arg.E), 1-parity);
338 
339  /* load matrix C*/
340  dx[mu]++;
341  Link c = arg.u(nu, linkIndexShift(y, x, dx, arg.E), parity);
342  dx[mu]--;
343  dx[nu]++;
344 
345  staple = staple + conj(a)*b*c;
346  }
347  }
348 
349  template<typename Float, bool save_staple, typename Arg>
350  __global__ void computeStaple(Arg arg, int nu)
351  {
352  int idx = blockIdx.x*blockDim.x + threadIdx.x;
353  int parity = blockIdx.y*blockDim.y + threadIdx.y;
354  if (idx >= arg.threads) return;
355 
356  int mu_idx = blockIdx.z*blockDim.z + threadIdx.z;
357  if (mu_idx >= arg.n_mu) return;
358  int mu;
359  switch(mu_idx) {
360  case 0: mu = arg.mu_map[0]; break;
361  case 1: mu = arg.mu_map[1]; break;
362  case 2: mu = arg.mu_map[2]; break;
363  }
364 
365  int x[4];
366  getCoords(x, idx, arg.X, (parity+arg.odd_bit)%2);
367  for (int d=0; d<4; d++) x[d] += arg.border[d];
368 
369  typedef Matrix<complex<Float>,3> Link;
370  Link staple;
371  switch(mu) {
372  case 0:
373  switch(nu) {
374  case 1: computeStaple<Float,0,1>(staple, arg, x, parity); break;
375  case 2: computeStaple<Float,0,2>(staple, arg, x, parity); break;
376  case 3: computeStaple<Float,0,3>(staple, arg, x, parity); break;
377  } break;
378  case 1:
379  switch(nu) {
380  case 0: computeStaple<Float,1,0>(staple, arg, x, parity); break;
381  case 2: computeStaple<Float,1,2>(staple, arg, x, parity); break;
382  case 3: computeStaple<Float,1,3>(staple, arg, x, parity); break;
383  } break;
384  case 2:
385  switch(nu) {
386  case 0: computeStaple<Float,2,0>(staple, arg, x, parity); break;
387  case 1: computeStaple<Float,2,1>(staple, arg, x, parity); break;
388  case 3: computeStaple<Float,2,3>(staple, arg, x, parity); break;
389  } break;
390  case 3:
391  switch(nu) {
392  case 0: computeStaple<Float,3,0>(staple, arg, x, parity); break;
393  case 1: computeStaple<Float,3,1>(staple, arg, x, parity); break;
394  case 2: computeStaple<Float,3,2>(staple, arg, x, parity); break;
395  } break;
396  }
397 
398  // exclude inner halo
399  if ( !(x[0] < arg.inner_border[0] || x[0] >= arg.inner_X[0] + arg.inner_border[0] ||
400  x[1] < arg.inner_border[1] || x[1] >= arg.inner_X[1] + arg.inner_border[1] ||
401  x[2] < arg.inner_border[2] || x[2] >= arg.inner_X[2] + arg.inner_border[2] ||
402  x[3] < arg.inner_border[3] || x[3] >= arg.inner_X[3] + arg.inner_border[3]) ) {
403  // convert to inner coords
404  int inner_x[] = {x[0]-arg.inner_border[0], x[1]-arg.inner_border[1], x[2]-arg.inner_border[2], x[3]-arg.inner_border[3]};
405  Link fat = arg.fat(mu, linkIndex(inner_x, arg.inner_X), parity);
406  fat += arg.coeff * staple;
407  arg.fat(mu, linkIndex(inner_x, arg.inner_X), parity) = fat;
408  }
409 
410  if (save_staple) arg.staple(mu, linkIndex(x, arg.E), parity) = staple;
411  return;
412  }
413 
414  template <typename Float, typename Arg>
415  class Staple : public TunableVectorYZ {
416  Arg &arg;
417  const GaugeField &meta;
418  unsigned int minThreads() const { return arg.threads; }
419  bool tuneGridDim() const { return false; }
420  int nu;
421  int dir1;
422  int dir2;
423  bool save_staple;
424 
425  public:
426  Staple(Arg &arg, int nu, int dir1, int dir2, bool save_staple, const GaugeField &meta)
427  : TunableVectorYZ(2,(3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 ))),
428  arg(arg), meta(meta), nu(nu), dir1(dir1), dir2(dir2), save_staple(save_staple)
429  {
430  // compute the map for z thread index to mu index in the kernel
431  // mu != nu 3 -> n_mu = 3
432  // mu != nu != rho 2 -> n_mu = 2
433  // mu != nu != rho != sig 1 -> n_mu = 1
434  arg.n_mu = 3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 );
435  int j=0;
436  for (int i=0; i<4; i++) {
437  if (i==nu || i==dir1 || i==dir2) continue; // skip these dimensions
438  arg.mu_map[j++] = i;
439  }
440  assert(j == arg.n_mu);
441  }
442  virtual ~Staple() {}
443 
444  void apply(const cudaStream_t &stream) {
445  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
446  if (save_staple)
447  computeStaple<Float,true><<<tp.grid,tp.block>>>(arg, nu);
448  else
449  computeStaple<Float,false><<<tp.grid,tp.block>>>(arg, nu);
450  }
451 
452  TuneKey tuneKey() const {
453  std::stringstream aux;
454  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
455  aux << ",nu=" << nu << ",dir1=" << dir1 << ",dir2=" << dir2 << ",save=" << save_staple;
456  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
457  }
458 
459  void preTune() { arg.fat.save(); arg.staple.save(); }
460  void postTune() { arg.fat.load(); arg.staple.load(); }
461 
462  long long flops() const {
463  return 2*arg.n_mu*arg.threads*( 4*198 + 18 + 36 );
464  }
465  long long bytes() const {
466  return arg.n_mu*2*meta.VolumeCB()*arg.fat.Bytes()*2 // fat load/store is only done on interior
467  + arg.n_mu*2*arg.threads*(4*arg.u.Bytes() + 2*arg.mulink.Bytes() + (save_staple ? arg.staple.Bytes() : 0));
468  }
469  };
470 
471  // Compute the staple field for direction nu,excluding the directions dir1 and dir2.
472  void computeStaple(GaugeField &fat, GaugeField &staple, const GaugeField &mulink, const GaugeField &u,
473  int nu, int dir1, int dir2, double coeff, bool save_staple) {
474 
475  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
476  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
477  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
478  typedef StapleArg<double,L,L,L,L> Arg;
479  Arg arg(L(fat), L(staple), L(mulink), L(u), coeff, fat, u);
480  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
481  stapler.apply(0);
482  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
483  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
484  if (mulink.Reconstruct() == QUDA_RECONSTRUCT_NO) {
485  typedef StapleArg<double,L,L,L,G> Arg;
486  Arg arg(L(fat), L(staple), L(mulink), G(u), coeff, fat, u);
487  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
488  stapler.apply(0);
489  } else if (mulink.Reconstruct() == QUDA_RECONSTRUCT_12) {
490  typedef StapleArg<double,L,L,G,G> Arg;
491  Arg arg(L(fat), L(staple), G(mulink), G(u), coeff, fat, u);
492  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
493  stapler.apply(0);
494  } else {
495  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
496  }
497  } else {
498  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
499  }
500  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
501  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
502  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
503  typedef StapleArg<float,L,L,L,L> Arg;
504  Arg arg(L(fat), L(staple), L(mulink), L(u), coeff, fat, u);
505  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
506  stapler.apply(0);
507  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
508  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
509  if (mulink.Reconstruct() == QUDA_RECONSTRUCT_NO) {
510  typedef StapleArg<double,L,L,L,G> Arg;
511  Arg arg(L(fat), L(staple), L(mulink), G(u), coeff, fat, u);
512  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
513  stapler.apply(0);
514  } else if (mulink.Reconstruct() == QUDA_RECONSTRUCT_12) {
515  typedef StapleArg<double,L,L,G,G> Arg;
516  Arg arg(L(fat), L(staple), G(mulink), G(u), coeff, fat, u);
517  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
518  stapler.apply(0);
519  } else {
520  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
521  }
522  } else {
523  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
524  }
525  } else {
526  errorQuda("Unsupported precision %d\n", u.Precision());
527  }
528  }
529 
530 #endif //GPU_FATLINK
531 
532  void fatLongKSLink(cudaGaugeField* fat, cudaGaugeField* lng, const cudaGaugeField& u, const double *coeff)
533  {
534 
535 #ifdef GPU_FATLINK
538  gParam.setPrecision(gParam.Precision());
540  cudaGaugeField staple(gParam);
541  cudaGaugeField staple1(gParam);
542 
543  if( ((fat->X()[0] % 2 != 0) || (fat->X()[1] % 2 != 0) || (fat->X()[2] % 2 != 0) || (fat->X()[3] % 2 != 0))
544  && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)){
545  errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
546  u.Reconstruct());
547  }
548 
549  computeOneLink(*fat, u, coeff[0]-6.0*coeff[5]);
550 
551  // if this pointer is not NULL, compute the long link
552  if (lng) computeLongLink(*lng, u, coeff[1]);
553 
554  // Check the coefficients. If all of the following are zero, return.
555  if (fabs(coeff[2]) < MIN_COEFF && fabs(coeff[3]) < MIN_COEFF &&
556  fabs(coeff[4]) < MIN_COEFF && fabs(coeff[5]) < MIN_COEFF) return;
557 
558  for (int nu = 0; nu < 4; nu++) {
559  computeStaple(*fat, staple, u, u, nu, -1, -1, coeff[2], 1);
560 
561  if (coeff[5] != 0.0) computeStaple(*fat, staple, staple, u, nu, -1, -1, coeff[5], 0);
562 
563  for (int rho = 0; rho < 4; rho++) {
564  if (rho != nu) {
565 
566  computeStaple(*fat, staple1, staple, u, rho, nu, -1, coeff[3], 1);
567 
568  if (fabs(coeff[4]) > MIN_COEFF) {
569  for (int sig = 0; sig < 4; sig++) {
570  if (sig != nu && sig != rho) {
571  computeStaple(*fat, staple, staple1, u, sig, nu, rho, coeff[4], 0);
572  }
573  } //sig
574  } // MIN_COEFF
575  }
576  } //rho
577  } //nu
578 
580  checkCudaError();
581 #else
582  errorQuda("Fat-link computation not enabled");
583 #endif
584 
585  return;
586  }
587 
588 #undef MIN_COEFF
589 
590 } // namespace quda
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
Definition: llfat_quda.cu:532
int commDimPartitioned(int dir)
double mu
Definition: test_util.cpp:1648
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
int E[4]
Definition: test_util.cpp:35
#define qudaDeviceSynchronize()
__host__ __device__ void computeStaple(Arg &arg, int idx, int parity, int dir, Link &staple)
Definition: gauge_ape.cuh:36
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
#define MIN_COEFF
Definition: llfat_quda.cu:13
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:131
GaugeFieldParam gParam
QudaPrecision Precision() const
Definition: lattice_field.h:58
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType reconstruct
Definition: gauge_field.h:16
QudaFieldCreate create
Definition: gauge_field.h:26
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
#define checkCudaError()
Definition: util_quda.h:161
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
const int * X() const