QUDA  0.9.0
llfat_quda.cu
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <cuda_runtime.h>
3 #include <cuda.h>
4 
5 #include <quda_internal.h>
6 #include <gauge_field.h>
7 #include <llfat_quda.h>
8 #include <index_helper.cuh>
9 #include <gauge_field_order.h>
10 #include <fast_intdiv.h>
11 
12 #define MIN_COEFF 1e-7
13 
14 namespace quda {
15 
16 #ifdef GPU_FATLINK
17 
18  template <typename Float, typename Link, typename Gauge>
19  struct LinkArg {
20  unsigned int threads;
21 
22  int_fastdiv X[4];
23  int_fastdiv E[4];
24  int border[4];
25 
30  int odd_bit;
31 
32  Gauge u;
33  Link link;
34  Float coeff;
35 
36  LinkArg(Link link, Gauge u, Float coeff, const GaugeField &link_meta, const GaugeField &u_meta)
37  : threads(link_meta.VolumeCB()), link(link), u(u), coeff(coeff)
38  {
39  for (int d=0; d<4; d++) {
40  X[d] = link_meta.X()[d];
41  E[d] = u_meta.X()[d];
42  border[d] = (E[d] - X[d]) / 2;
43  }
44  }
45  };
46 
47  template <typename Float, int dir, typename Arg>
48  __device__ void longLinkDir(Arg &arg, int idx, int parity) {
49  int x[4];
50  int dx[4] = {0, 0, 0, 0};
51 
52  int *y = arg.u.coords;
53  getCoords(x, idx, arg.X, parity);
54  for (int d=0; d<4; d++) x[d] += arg.border[d];
55 
56  typedef Matrix<complex<Float>,3> Link;
57 
58  Link a = arg.u(dir, linkIndex(y, x, arg.E), parity);
59 
60  dx[dir]++;
61  Link b = arg.u(dir, linkIndexShift(y, x, dx, arg.E), 1-parity);
62 
63  dx[dir]++;
64  Link c = arg.u(dir, linkIndexShift(y, x, dx, arg.E), parity);
65  dx[dir]-=2;
66 
67  arg.link(dir, idx, parity) = arg.coeff * a * b * c;
68  }
69 
70  template <typename Float, typename Arg>
71  __global__ void computeLongLink(Arg arg) {
72 
73  int idx = blockIdx.x*blockDim.x + threadIdx.x;
74  int parity = blockIdx.y*blockDim.y + threadIdx.y;
75  int dir = blockIdx.z*blockDim.z + threadIdx.z;
76  if (idx >= arg.threads) return;
77  if (dir >= 4) return;
78 
79  switch(dir) {
80  case 0: longLinkDir<Float, 0>(arg, idx, parity); break;
81  case 1: longLinkDir<Float, 1>(arg, idx, parity); break;
82  case 2: longLinkDir<Float, 2>(arg, idx, parity); break;
83  case 3: longLinkDir<Float, 3>(arg, idx, parity); break;
84  }
85  return;
86  }
87 
88  template <typename Float, typename Arg>
89  class LongLink : public TunableVectorYZ {
90  Arg &arg;
91  const GaugeField &meta;
92  unsigned int minThreads() const { return arg.threads; }
93  bool tuneGridDim() const { return false; }
94 
95  public:
96  LongLink(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2,4), arg(arg), meta(meta) {}
97  virtual ~LongLink() {}
98 
99  void apply(const cudaStream_t &stream) {
100  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
101  computeLongLink<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
102  }
103 
104  TuneKey tuneKey() const {
105  std::stringstream aux;
106  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
107  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
108  }
109 
110  long long flops() const { return 2*4*arg.threads*198; }
111  long long bytes() const { return 2*4*arg.threads*(3*arg.u.Bytes()+arg.link.Bytes()); }
112  };
113 
114  void computeLongLink(GaugeField &lng, const GaugeField &u, double coeff)
115  {
116  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
117  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
118  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
119  typedef LinkArg<double,L,L> Arg;
120  Arg arg(L(lng), L(u), coeff, lng, u);
121  LongLink<double,Arg> longLink(arg,lng);
122  longLink.apply(0);
123  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
124  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
125  typedef LinkArg<double,L,G> Arg;
126  Arg arg(L(lng), G(u), coeff, lng, u);
127  LongLink<double,Arg> longLink(arg,lng);
128  longLink.apply(0);
129  } else {
130  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
131  }
132  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
133  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
134  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
135  typedef LinkArg<float,L,L> Arg;
136  Arg arg(L(lng), L(u), coeff, lng, u) ;
137  LongLink<float,Arg> longLink(arg,lng);
138  longLink.apply(0);
139  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
140  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
141  typedef LinkArg<float,L,G> Arg;
142  Arg arg(L(lng), G(u), coeff, lng, u);
143  LongLink<float,Arg> longLink(arg,lng);
144  longLink.apply(0);
145 
146  } else {
147  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
148  }
149  } else {
150  errorQuda("Unsupported precision %d\n", u.Precision());
151  }
152  return;
153  }
154 
155  template <typename Float, typename Arg>
156  __global__ void computeOneLink(Arg arg) {
157 
158  int idx = blockIdx.x*blockDim.x + threadIdx.x;
159  int parity = blockIdx.y * blockDim.y + threadIdx.y;
160  int dir = blockIdx.z * blockDim.z + threadIdx.z;
161  if (idx >= arg.threads) return;
162  if (dir >= 4) return;
163 
164  int *x = arg.u.coords;
165  getCoords(x, idx, arg.X, parity);
166  for (int d=0; d<4; d++) x[d] += arg.border[d];
167 
168  typedef Matrix<complex<Float>,3> Link;
169 
170  Link a = arg.u(dir, linkIndex(x,x,arg.E), parity);
171 
172  arg.link(dir, idx, parity) = arg.coeff*a;
173 
174  return;
175  }
176 
177  template <typename Float, typename Arg>
178  class OneLink : public TunableVectorYZ {
179  Arg &arg;
180  const GaugeField &meta;
181  unsigned int minThreads() const { return arg.threads; }
182  bool tuneGridDim() const { return false; }
183 
184  public:
185  OneLink(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2,4), arg(arg), meta(meta) {}
186  virtual ~OneLink() {}
187 
188  void apply(const cudaStream_t &stream) {
189  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
190  computeOneLink<Float><<<tp.grid,tp.block>>>(arg);
191  }
192 
193  TuneKey tuneKey() const {
194  std::stringstream aux;
195  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
196  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
197  }
198 
199  long long flops() const { return 2*4*arg.threads*18; }
200  long long bytes() const { return 2*4*arg.threads*(arg.u.Bytes()+arg.link.Bytes()); }
201  };
202 
203  void computeOneLink(GaugeField &fat, const GaugeField &u, double coeff)
204  {
205  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
206  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
207  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
208  typedef LinkArg<double,L,L> Arg;
209  Arg arg(L(fat), L(u), coeff, fat, u);
210  OneLink<double,Arg> oneLink(arg,fat);
211  oneLink.apply(0);
212  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
213  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
214  typedef LinkArg<double,L,G> Arg;
215  Arg arg(L(fat), G(u), coeff, fat, u);
216  OneLink<double,Arg> oneLink(arg,fat);
217  oneLink.apply(0);
218  } else {
219  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
220  }
221  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
222  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
223  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
224  typedef LinkArg<float,L,L> Arg;
225  Arg arg(L(fat), L(u), coeff, fat, u);
226  OneLink<float,Arg> oneLink(arg,fat);
227  oneLink.apply(0);
228  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
229  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
230  typedef LinkArg<float,L,G> Arg;
231  Arg arg(L(fat), G(u), coeff, fat, u);
232  OneLink<float,Arg> oneLink(arg,fat);
233  oneLink.apply(0);
234  } else {
235  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
236  }
237  } else {
238  errorQuda("Unsupported precision %d\n", u.Precision());
239  }
240  return;
241  }
242 
243  template <typename Float, typename Fat, typename Staple, typename Mulink, typename Gauge>
244  struct StapleArg {
245  unsigned int threads;
246 
247  int_fastdiv X[4];
248  int_fastdiv E[4];
249  int border[4];
250 
251  int_fastdiv inner_X[4];
252  int inner_border[4];
253 
258  int odd_bit;
259 
260  Gauge u;
261  Fat fat;
262  Staple staple;
263  Mulink mulink;
264  Float coeff;
265 
266  int n_mu;
267  int mu_map[4];
268 
269  StapleArg(Fat fat, Staple staple, Mulink mulink, Gauge u, Float coeff,
270  const GaugeField &fat_meta, const GaugeField &u_meta)
271  : threads(1), fat(fat), staple(staple), mulink(mulink), u(u), coeff(coeff),
272  odd_bit( (commDimPartitioned(0)+commDimPartitioned(1) +
274  for (int d=0; d<4; d++) {
275  X[d] = (fat_meta.X()[d] + u_meta.X()[d]) / 2;
276  E[d] = u_meta.X()[d];
277  border[d] = (E[d] - X[d]) / 2;
278  threads *= X[d];
279 
280  inner_X[d] = fat_meta.X()[d];
281  inner_border[d] = (E[d] - inner_X[d]) / 2;
282  }
283  threads /= 2; // account for parity in y dimension
284  }
285  };
286 
287  template<typename Float, int mu, int nu, typename Arg>
288  __device__ inline void computeStaple(Matrix<complex<Float>,3> &staple, Arg &arg, int x[], int parity) {
289  typedef Matrix<complex<Float>,3> Link;
290  int *y = arg.u.coords, *y_mu = arg.mulink.coords, dx[4] = {0, 0, 0, 0};
291 
292  /* Computes the upper staple :
293  * mu (B)
294  * +-------+
295  * nu | |
296  * (A) | |(C)
297  * X X
298  */
299  {
300  /* load matrix A*/
301  Link a = arg.u(nu, linkIndex(y, x, arg.E), parity);
302 
303  /* load matrix B*/
304  dx[nu]++;
305  Link b = arg.mulink(mu, linkIndexShift(y_mu, x, dx, arg.E), 1-parity);
306  dx[nu]--;
307 
308  /* load matrix C*/
309  dx[mu]++;
310  Link c = arg.u(nu, linkIndexShift(y, x, dx, arg.E), 1-parity);
311  dx[mu]--;
312 
313  staple = a * b * conj(c);
314  }
315 
316  /* Computes the lower staple :
317  * X X
318  * nu | |
319  * (A) | | (C)
320  * +-------+
321  * mu (B)
322  */
323  {
324  /* load matrix A*/
325  dx[nu]--;
326  Link a = arg.u(nu, linkIndexShift(y, x, dx, arg.E), 1-parity);
327 
328  /* load matrix B*/
329  Link b = arg.mulink(mu, linkIndexShift(y_mu, x, dx, arg.E), 1-parity);
330 
331  /* load matrix C*/
332  dx[mu]++;
333  Link c = arg.u(nu, linkIndexShift(y, x, dx, arg.E), parity);
334  dx[mu]--;
335  dx[nu]++;
336 
337  staple = staple + conj(a)*b*c;
338  }
339  }
340 
341  template<typename Float, bool save_staple, typename Arg>
342  __global__ void computeStaple(Arg arg, int nu)
343  {
344  int idx = blockIdx.x*blockDim.x + threadIdx.x;
345  int parity = blockIdx.y*blockDim.y + threadIdx.y;
346  if (idx >= arg.threads) return;
347 
348  int mu_idx = blockIdx.z*blockDim.z + threadIdx.z;
349  if (mu_idx >= arg.n_mu) return;
350  int mu;
351  switch(mu_idx) {
352  case 0: mu = arg.mu_map[0]; break;
353  case 1: mu = arg.mu_map[1]; break;
354  case 2: mu = arg.mu_map[2]; break;
355  }
356 
357  int x[4];
358  getCoords(x, idx, arg.X, (parity+arg.odd_bit)%2);
359  for (int d=0; d<4; d++) x[d] += arg.border[d];
360 
361  typedef Matrix<complex<Float>,3> Link;
362  Link staple;
363  switch(mu) {
364  case 0:
365  switch(nu) {
366  case 1: computeStaple<Float,0,1>(staple, arg, x, parity); break;
367  case 2: computeStaple<Float,0,2>(staple, arg, x, parity); break;
368  case 3: computeStaple<Float,0,3>(staple, arg, x, parity); break;
369  } break;
370  case 1:
371  switch(nu) {
372  case 0: computeStaple<Float,1,0>(staple, arg, x, parity); break;
373  case 2: computeStaple<Float,1,2>(staple, arg, x, parity); break;
374  case 3: computeStaple<Float,1,3>(staple, arg, x, parity); break;
375  } break;
376  case 2:
377  switch(nu) {
378  case 0: computeStaple<Float,2,0>(staple, arg, x, parity); break;
379  case 1: computeStaple<Float,2,1>(staple, arg, x, parity); break;
380  case 3: computeStaple<Float,2,3>(staple, arg, x, parity); break;
381  } break;
382  case 3:
383  switch(nu) {
384  case 0: computeStaple<Float,3,0>(staple, arg, x, parity); break;
385  case 1: computeStaple<Float,3,1>(staple, arg, x, parity); break;
386  case 2: computeStaple<Float,3,2>(staple, arg, x, parity); break;
387  } break;
388  }
389 
390  // exclude inner halo
391  if ( !(x[0] < arg.inner_border[0] || x[0] >= arg.inner_X[0] + arg.inner_border[0] ||
392  x[1] < arg.inner_border[1] || x[1] >= arg.inner_X[1] + arg.inner_border[1] ||
393  x[2] < arg.inner_border[2] || x[2] >= arg.inner_X[2] + arg.inner_border[2] ||
394  x[3] < arg.inner_border[3] || x[3] >= arg.inner_X[3] + arg.inner_border[3]) ) {
395  // convert to inner coords
396  int inner_x[] = {x[0]-arg.inner_border[0], x[1]-arg.inner_border[1], x[2]-arg.inner_border[2], x[3]-arg.inner_border[3]};
397  Link fat = arg.fat(mu, linkIndex(inner_x, arg.inner_X), parity);
398  fat += arg.coeff * staple;
399  arg.fat(mu, linkIndex(inner_x, arg.inner_X), parity) = fat;
400  }
401 
402  if (save_staple) arg.staple(mu, linkIndex(x, arg.E), parity) = staple;
403  return;
404  }
405 
406  template <typename Float, typename Arg>
407  class Staple : public TunableVectorYZ {
408  Arg &arg;
409  const GaugeField &meta;
410  unsigned int minThreads() const { return arg.threads; }
411  bool tuneGridDim() const { return false; }
412  int nu;
413  int dir1;
414  int dir2;
415  bool save_staple;
416 
417  public:
418  Staple(Arg &arg, int nu, int dir1, int dir2, bool save_staple, const GaugeField &meta)
419  : TunableVectorYZ(2,(3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 ))),
420  arg(arg), meta(meta), nu(nu), dir1(dir1), dir2(dir2), save_staple(save_staple)
421  {
422  // compute the map for z thread index to mu index in the kernel
423  // mu != nu 3 -> n_mu = 3
424  // mu != nu != rho 2 -> n_mu = 2
425  // mu != nu != rho != sig 1 -> n_mu = 1
426  arg.n_mu = 3 - ( (dir1 > -1) ? 1 : 0 ) - ( (dir2 > -1) ? 1 : 0 );
427  int j=0;
428  for (int i=0; i<4; i++) {
429  if (i==nu || i==dir1 || i==dir2) continue; // skip these dimensions
430  arg.mu_map[j++] = i;
431  }
432  assert(j == arg.n_mu);
433  }
434  virtual ~Staple() {}
435 
436  void apply(const cudaStream_t &stream) {
437  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
438  if (save_staple)
439  computeStaple<Float,true><<<tp.grid,tp.block>>>(arg, nu);
440  else
441  computeStaple<Float,false><<<tp.grid,tp.block>>>(arg, nu);
442  }
443 
444  TuneKey tuneKey() const {
445  std::stringstream aux;
446  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
447  aux << ",nu=" << nu << ",dir1=" << dir1 << ",dir2=" << dir2 << ",save=" << save_staple;
448  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
449  }
450 
451  void preTune() { arg.fat.save(); arg.staple.save(); }
452  void postTune() { arg.fat.load(); arg.staple.load(); }
453 
454  long long flops() const {
455  return 2*arg.n_mu*arg.threads*( 4*198 + 18 + 36 );
456  }
457  long long bytes() const {
458  return arg.n_mu*2*meta.VolumeCB()*arg.fat.Bytes()*2 // fat load/store is only done on interior
459  + arg.n_mu*2*arg.threads*(4*arg.u.Bytes() + 2*arg.mulink.Bytes() + (save_staple ? arg.staple.Bytes() : 0));
460  }
461  };
462 
463  // Compute the staple field for direction nu,excluding the directions dir1 and dir2.
464  void computeStaple(GaugeField &fat, GaugeField &staple, const GaugeField &mulink, const GaugeField &u,
465  int nu, int dir1, int dir2, double coeff, bool save_staple) {
466 
467  if (u.Precision() == QUDA_DOUBLE_PRECISION) {
468  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_NO>::type L;
469  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
470  typedef StapleArg<double,L,L,L,L> Arg;
471  Arg arg(L(fat), L(staple), L(mulink), L(u), coeff, fat, u);
472  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
473  stapler.apply(0);
474  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
475  typedef typename gauge_mapper<double,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
476  if (mulink.Reconstruct() == QUDA_RECONSTRUCT_NO) {
477  typedef StapleArg<double,L,L,L,G> Arg;
478  Arg arg(L(fat), L(staple), L(mulink), G(u), coeff, fat, u);
479  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
480  stapler.apply(0);
481  } else if (mulink.Reconstruct() == QUDA_RECONSTRUCT_12) {
482  typedef StapleArg<double,L,L,G,G> Arg;
483  Arg arg(L(fat), L(staple), G(mulink), G(u), coeff, fat, u);
484  Staple<double,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
485  stapler.apply(0);
486  } else {
487  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
488  }
489  } else {
490  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
491  }
492  } else if (u.Precision() == QUDA_SINGLE_PRECISION) {
493  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_NO>::type L;
494  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
495  typedef StapleArg<float,L,L,L,L> Arg;
496  Arg arg(L(fat), L(staple), L(mulink), L(u), coeff, fat, u);
497  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
498  stapler.apply(0);
499  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
500  typedef typename gauge_mapper<float,QUDA_RECONSTRUCT_12,18,QUDA_STAGGERED_PHASE_MILC>::type G;
501  if (mulink.Reconstruct() == QUDA_RECONSTRUCT_NO) {
502  typedef StapleArg<double,L,L,L,G> Arg;
503  Arg arg(L(fat), L(staple), L(mulink), G(u), coeff, fat, u);
504  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
505  stapler.apply(0);
506  } else if (mulink.Reconstruct() == QUDA_RECONSTRUCT_12) {
507  typedef StapleArg<double,L,L,G,G> Arg;
508  Arg arg(L(fat), L(staple), G(mulink), G(u), coeff, fat, u);
509  Staple<float,Arg> stapler(arg, nu, dir1, dir2, save_staple, fat);
510  stapler.apply(0);
511  } else {
512  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
513  }
514  } else {
515  errorQuda("Reconstruct %d is not supported\n", u.Reconstruct());
516  }
517  } else {
518  errorQuda("Unsupported precision %d\n", u.Precision());
519  }
520  }
521 
522 #endif //GPU_FATLINK
523 
524  void fatLongKSLink(cudaGaugeField* fat, cudaGaugeField* lng, const cudaGaugeField& u, const double *coeff)
525  {
526 
527 #ifdef GPU_FATLINK
532  cudaGaugeField staple(gParam);
533  cudaGaugeField staple1(gParam);
534 
535  if( ((fat->X()[0] % 2 != 0) || (fat->X()[1] % 2 != 0) || (fat->X()[2] % 2 != 0) || (fat->X()[3] % 2 != 0))
536  && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)){
537  errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
538  u.Reconstruct());
539  }
540 
541  computeOneLink(*fat, u, coeff[0]-6.0*coeff[5]);
542 
543  // if this pointer is not NULL, compute the long link
544  if (lng) computeLongLink(*lng, u, coeff[1]);
545 
546  // Check the coefficients. If all of the following are zero, return.
547  if (fabs(coeff[2]) < MIN_COEFF && fabs(coeff[3]) < MIN_COEFF &&
548  fabs(coeff[4]) < MIN_COEFF && fabs(coeff[5]) < MIN_COEFF) return;
549 
550  for (int nu = 0; nu < 4; nu++) {
551  computeStaple(*fat, staple, u, u, nu, -1, -1, coeff[2], 1);
552 
553  if (coeff[5] != 0.0) computeStaple(*fat, staple, staple, u, nu, -1, -1, coeff[5], 0);
554 
555  for (int rho = 0; rho < 4; rho++) {
556  if (rho != nu) {
557 
558  computeStaple(*fat, staple1, staple, u, rho, nu, -1, coeff[3], 1);
559 
560  if (fabs(coeff[4]) > MIN_COEFF) {
561  for (int sig = 0; sig < 4; sig++) {
562  if (sig != nu && sig != rho) {
563  computeStaple(*fat, staple, staple1, u, sig, nu, rho, coeff[4], 0);
564  }
565  } //sig
566  } // MIN_COEFF
567  }
568  } //rho
569  } //nu
570 
572  checkCudaError();
573 #else
574  errorQuda("Fat-link computation not enabled");
575 #endif
576 
577  return;
578  }
579 
580 #undef MIN_COEFF
581 
582 } // namespace quda
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
Definition: llfat_quda.cu:524
dim3 dim3 blockDim
int commDimPartitioned(int dir)
double mu
Definition: test_util.cpp:1643
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void setPrecision(QudaPrecision precision)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:113
cudaStream_t * stream
QudaPrecision precision
Definition: lattice_field.h:54
int E[4]
Definition: test_util.cpp:36
#define b
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define MIN_COEFF
Definition: llfat_quda.cu:12
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
GaugeFieldParam gParam
double fabs(double)
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
QudaReconstructType reconstruct
Definition: gauge_field.h:14
QudaFieldCreate create
Definition: gauge_field.h:25
const void * c
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
const int * X() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)