QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash_domain_wall_m5.cuh
Go to the documentation of this file.
1 #pragma once
2 
4 #include <math_helper.cuh>
5 
6 namespace quda
7 {
8 
  /**
     Structure containing the per-slice (z)Mobius coefficients along the
     fifth dimension, sized by the compile-time maximum Ls.
   */
  template <typename real> struct coeff_5 {
    complex<real> a[QUDA_MAX_DWF_LS]; // xpay coefficients
    complex<real> b[QUDA_MAX_DWF_LS]; // diagonal Mobius coefficients
    complex<real> c[QUDA_MAX_DWF_LS]; // off-diagonal Mobius coefficients
  };
17 
  // fixed byte size of the device constant-memory coefficient buffer;
  // the constructor checks sizeof(coeff_5<real>) fits within this
  constexpr int size = 4096;
  static __constant__ char mobius_d[size]; // buffer used for Mobius coefficients for GPU kernel
20 
21  // helper trait for determining if we are using variable coefficients
  // helper trait for determining if we are using variable coefficients
  // (i.e. per-slice values stored in the coeff_5 buffer rather than the
  // scalar a/b/c members of the arg struct)
  template <Dslash5Type type> struct is_variable {
    static constexpr bool value = false; // default: constant coefficients
  };
  // operator types that require per-slice (variable) coefficients
  template <> struct is_variable<DSLASH5_MOBIUS_PRE> {
    static constexpr bool value = true;
  };
  template <> struct is_variable<DSLASH5_MOBIUS> {
    static constexpr bool value = true;
  };
  template <> struct is_variable<M5_INV_ZMOBIUS> {
    static constexpr bool value = true;
  };
34 
  /**
     Helper class for grabbing the constant struct, whether we are on the
     GPU or CPU.  This primary (constant-coefficient) version returns the
     scalar a/b/c members stored directly in the argument struct; the
     fifth-dimension index s is ignored.
   */
  template <typename real, bool is_variable, typename Arg> class coeff_type
  {
public:
    const Arg &arg; // reference to the kernel argument struct holding the scalars
    __device__ __host__ coeff_type(const Arg &arg) : arg(arg) {}
    __device__ __host__ real a(int s) { return arg.a; } // xpay coefficient (s-independent)
    __device__ __host__ real b(int s) { return arg.b; } // diagonal coefficient (s-independent)
    __device__ __host__ real c(int s) { return arg.c; } // off-diagonal coefficient (s-independent)
  };
48 
  /**
     Specialization of the coefficient helper for variable coefficients:
     the per-slice complex coefficients are read from the __constant__
     buffer when executing on the device, or from the host heap buffer
     otherwise.
   */
  template <typename real, typename Arg> class coeff_type<real, true, Arg>
  {
    // return a pointer to the coefficient struct appropriate for where
    // this code is executing (device constant memory vs host buffer)
    inline __device__ __host__ const coeff_5<real> *coeff() const
    {
#ifdef __CUDA_ARCH__
      return reinterpret_cast<const coeff_5<real> *>(mobius_d);
#else
      return arg.mobius_h;
#endif
    }

public:
    const Arg &arg; // reference to the kernel argument struct (host-side buffer owner)
    __device__ __host__ inline coeff_type(const Arg &arg) : arg(arg) {}
    __device__ __host__ complex<real> a(int s) { return coeff()->a[s]; } // xpay coefficient for slice s
    __device__ __host__ complex<real> b(int s) { return coeff()->b[s]; } // diagonal coefficient for slice s
    __device__ __host__ complex<real> c(int s) { return coeff()->c[s]; } // off-diagonal coefficient for slice s
  };
74 
  /**
     Parameter structure for applying the D5 / M5-inverse fifth-dimension
     operators.  The host-side constructor derives the (z)Mobius
     coefficients for the requested Dslash5Type and stages them into
     device constant memory (for GPU kernels) and a host heap buffer (for
     CPU kernels).
   */
  template <typename Float, int nColor> struct Dslash5Arg {
    typedef typename mapper<Float>::type real; // compute precision mapped from storage precision

    F out; // output vector field
    const F in; // input vector field
    const F x; // auxiliary input vector field
    const int nParity; // number of parities we're working on
    const int volume_cb; // checkerboarded volume
    const int volume_4d_cb; // 4-d checkerboarded volume
    const int_fastdiv Ls; // length of 5th dimension

    const real m_f; // fermion mass parameter
    const real m_5; // Wilson mass shift

    const bool dagger; // dagger
    const bool xpay; // whether we are doing xpay or not

    real b; // real constant Mobius coefficient
    real c; // real constant Mobius coefficient
    real a; // real xpay coefficient

    coeff_5<real> *mobius_h; // constant buffer used for Mobius coefficients for CPU kernel

    /**
       @param[out] out Output field
       @param[in] in Input field
       @param[in] x Auxiliary input field (used when xpay is enabled)
       @param[in] m_f Fermion mass parameter
       @param[in] m_5 Wilson mass shift
       @param[in] b_5_ Array of b_5 Mobius coefficients (length Ls)
       @param[in] c_5_ Array of c_5 Mobius coefficients (length Ls)
       @param[in] a_ xpay scale factor (xpay is disabled when zero)
       @param[in] dagger Whether we are applying the conjugate operator
       @param[in] type Which fifth-dimension operator variant to apply
     */
    Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5,
               const Complex *b_5_, const Complex *c_5_, double a_, bool dagger, Dslash5Type type) :
      out(out),
      in(in),
      x(x),
      nParity(in.SiteSubset()),
      volume_cb(in.VolumeCB()),
      volume_4d_cb(volume_cb / in.X(4)),
      Ls(in.X(4)),
      m_f(m_f),
      m_5(m_5),
      a(a_),
      dagger(dagger),
      xpay(a_ == 0.0 ? false : true),
      type(type)
    {
      if (in.Nspin() != 4) errorQuda("nSpin = %d not support", in.Nspin());
      if (!in.isNative() || !out.isNative())
        errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder());

      // the coefficient struct must fit in the fixed constant-memory buffer
      if (sizeof(coeff_5<real>) > size) errorQuda("Coefficient buffer too large at %lu bytes\n", sizeof(coeff_5<real>));
      mobius_h = new coeff_5<real>;
      auto *a_5 = mobius_h->a;
      auto *b_5 = mobius_h->b;
      auto *c_5 = mobius_h->c;

      switch (type) {
      case DSLASH5_DWF: break; // plain domain wall needs no derived coefficients
      case DSLASH5_MOBIUS_PRE:
        for (int s = 0; s < Ls; s++) {
          b_5[s] = b_5_[s];
          c_5[s] = 0.5 * c_5_[s];

          // xpay
          // a_5 = (0.5 / (b_5 (m_5 + 4) + 1))^2 * a, i.e. kappa_b^2 scaled by a
          a_5[s] = 0.5 / (b_5_[s] * (m_5 + 4.0) + 1.0);
          a_5[s] *= a_5[s] * static_cast<real>(a);
        }
        break;
      case DSLASH5_MOBIUS:
        for (int s = 0; s < Ls; s++) {
          b_5[s] = 1.0;
          c_5[s] = 0.5 * (c_5_[s] * (m_5 + 4.0) - 1.0) / (b_5_[s] * (m_5 + 4.0) + 1.0);

          // axpy
          a_5[s] = 0.5 / (b_5_[s] * (m_5 + 4.0) + 1.0);
          a_5[s] *= a_5[s] * static_cast<real>(a);
        }
        break;
      case M5_INV_DWF:
        // constant-coefficient inverse: geometric factor and normalization
        b = 2.0 * (0.5 / (5.0 + m_5)); // 2 * kappa_5
        c = 0.5 / (1.0 + std::pow(b, (int)Ls) * m_f);
        break;
      case M5_INV_MOBIUS:
        // constant Mobius inverse derived from the s=0 coefficients
        b = -(c_5_[0].real() * (4.0 + m_5) - 1.0) / (b_5_[0].real() * (4.0 + m_5) + 1.0);
        c = 0.5 / (1.0 + std::pow(b, (int)Ls) * m_f);
        a *= pow(0.5 / (b_5_[0].real() * (m_5 + 4.0) + 1.0), 2);
        break;
      case M5_INV_ZMOBIUS: {
        // variable complex coefficients: accumulate the product k of the
        // per-slice geometric factors for the overall normalization
        complex<double> k = 1.0;
        for (int s = 0; s < Ls; s++) {
          b_5[s] = -(c_5_[s] * (4.0 + m_5) - 1.0) / (b_5_[s] * (4.0 + m_5) + 1.0);
          k *= b_5[s];
        }
        c_5[0] = 0.5 / (1.0 + k * m_f);

        for (int s = 0; s < Ls; s++) { // axpy coefficients
          a_5[s] = 0.5 / (b_5_[s] * (m_5 + 4.0) + 1.0);
          a_5[s] *= a_5[s] * static_cast<real>(a);
        }
      } break;
      default: errorQuda("Unknown Dslash5Type %d", type);
      }

      // asynchronously stage the coefficients into device constant memory
      cudaMemcpyToSymbolAsync(mobius_d, mobius_h, sizeof(coeff_5<real>), 0, cudaMemcpyHostToDevice, streams[Nstream - 1]);
    }

    // NOTE(review): frees the host buffer; assumes the async copy above has
    // completed by the time the arg is destroyed — confirm stream ordering
    virtual ~Dslash5Arg() { delete mobius_h; }
  };
182 
  /**
     @brief Apply the D5 operator at a given site: hop to the neighboring
     fifth-dimension slices (with -m_f weighting when wrapping the
     boundary), then apply the type-dependent diagonal / xpay terms.
     @param[in,out] arg Argument struct containing meta data and accessors
     @param[in] parity Parity we are acting on
     @param[in] x_cb Checkerboarded 4-d space-time index
     @param[in] s Fifth-dimension coordinate
  */
  template <typename Float, int nColor, bool dagger, bool xpay, Dslash5Type type, typename Arg>
  __device__ __host__ inline void dslash5(Arg &arg, int parity, int x_cb, int s)
  {
    typedef typename mapper<Float>::type real;

    Vector out;

    { // forwards direction
      // neighbor one slice forward in the fifth dimension, with periodic wrap
      const int fwd_idx = ((s + 1) % arg.Ls) * arg.volume_4d_cb + x_cb;
      const Vector in = arg.in(fwd_idx, parity);
      constexpr int proj_dir = dagger ? +1 : -1;
      if (s == arg.Ls - 1) {
        // crossing the fifth-dimension boundary picks up the -m_f mass factor
        out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir);
      } else {
        out += in.project(4, proj_dir).reconstruct(4, proj_dir);
      }
    }

    { // backwards direction
      // neighbor one slice backward in the fifth dimension, with periodic wrap
      const int back_idx = ((s + arg.Ls - 1) % arg.Ls) * arg.volume_4d_cb + x_cb;
      const Vector in = arg.in(back_idx, parity);
      constexpr int proj_dir = dagger ? -1 : +1;
      if (s == 0) {
        out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir);
      } else {
        out += in.project(4, proj_dir).reconstruct(4, proj_dir);
      }
    }

    if (type == DSLASH5_DWF && xpay) {
      // out = x + a * D5 in
      Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
      out = x + arg.a * out;
    } else if (type == DSLASH5_MOBIUS_PRE) {
      // out = c * D5 in + b * in, optionally followed by xpay with x
      Vector diagonal = arg.in(s * arg.volume_4d_cb + x_cb, parity);
      out = coeff.c(s) * out + coeff.b(s) * diagonal;

      if (xpay) {
        Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
        out = x + coeff.a(s) * out;
      }
    } else if (type == DSLASH5_MOBIUS) {
      // out = c * D5 in + in, optionally followed by axpy with x
      Vector diagonal = arg.in(s * arg.volume_4d_cb + x_cb, parity);
      out = coeff.c(s) * out + diagonal;

      if (xpay) { // really axpy
        Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
        out = coeff.a(s) * x + out;
      }
    }

    arg.out(s * arg.volume_4d_cb + x_cb, parity) = out;
  }
244 
  /**
     @brief CPU kernel for applying the D5 operator: serially visits every
     (parity, fifth-dimension slice, 4-d site) combination and applies the
     per-site dslash5 routine.
  */
  template <typename Float, int nColor, bool dagger, bool xpay, Dslash5Type type, typename Arg>
  {
    for (int parity = 0; parity < arg.nParity; parity++) {
      for (int s = 0; s < arg.Ls; s++) {
        for (int x_cb = 0; x_cb < arg.volume_4d_cb; x_cb++) { // 4-d volume
          dslash5<Float, nColor, dagger, xpay, type>(arg, parity, x_cb, s);
        } // 4-d volumeCB
      } // ls
    } // parity
  }
260 
265  template <typename Float, int nColor, bool dagger, bool xpay, Dslash5Type type, typename Arg>
266  __global__ void dslash5GPU(Arg arg)
267  {
268  int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
269  int s = blockIdx.y * blockDim.y + threadIdx.y;
270  int parity = blockIdx.z * blockDim.z + threadIdx.z;
271 
272  if (x_cb >= arg.volume_4d_cb) return;
273  if (s >= arg.Ls) return;
274  if (parity >= arg.nParity) return;
275 
276  dslash5<Float, nColor, dagger, xpay, type>(arg, parity, x_cb, s);
277  }
278 
  /**
     @brief Apply the M5 inverse operator at a given site on the lattice.
     This is the original algorithm, valid when the coefficients are real
     constants: each chirality of the inverse is a geometric series in
     k = arg.b along the fifth dimension, normalized by inv = arg.c, with
     a -m_f factor whenever the series wraps the fifth-dimension boundary.
     @param[in,out] arg Argument struct containing meta data and accessors
     @param[in] parity Parity we are acting on
     @param[in] x_cb Checkerboarded 4-d space-time index
     @param[in] s_ Fifth-dimension coordinate of the output
     @return The resulting vector at (parity, x_cb, s_)
  */
  template <typename real, int nColor, bool dagger, Dslash5Type type, bool shared, typename Vector, typename Arg>
  __device__ __host__ inline Vector constantInv(Arg &arg, int parity, int x_cb, int s_)
  {
    const auto k = arg.b;   // geometric factor (2 * kappa_5)
    const auto inv = arg.c; // overall normalization of the inverse

    // if using shared-memory caching then load spinor field for my site into cache
    if (shared) {
      cache.save(arg.in(s_ * arg.volume_4d_cb + x_cb, parity));
      cache.sync();
    }

    Vector out;

    for (int s = 0; s < arg.Ls; s++) {

      Vector in = shared ? cache.load(threadIdx.x, s, parity) : arg.in(s * arg.volume_4d_cb + x_cb, parity);

      { // right-handed contribution from slice s
        // fifth-dimension separation; wrapping picks up the -m_f factor
        int exp = s_ < s ? arg.Ls - s + s_ : s_ - s;
        real factorR = inv * __fast_pow(k, exp) * (s_ < s ? -arg.m_f : static_cast<real>(1.0));
        constexpr int proj_dir = dagger ? -1 : +1;
        out += factorR * (in.project(4, proj_dir)).reconstruct(4, proj_dir);
      }

      { // left-handed contribution from slice s
        int exp = s_ > s ? arg.Ls - s_ + s : s - s_;
        real factorL = inv * __fast_pow(k, exp) * (s_ > s ? -arg.m_f : static_cast<real>(1.0));
        constexpr int proj_dir = dagger ? +1 : -1;
        out += factorL * (in.project(4, proj_dir)).reconstruct(4, proj_dir);
      }
    }

    return out;
  }
330 
  /**
     @brief Apply the M5 inverse operator at a given site on the lattice.
     This is an alternative algorithm applicable to variable b and c
     coefficients (e.g. zMobius): the right- and left-handed inverses are
     built by recursing around the fifth dimension, multiplying the
     running coefficient by the per-slice b coefficient at each step.
     @param[in,out] arg Argument struct containing meta data and accessors
     @param[in] parity Parity we are acting on
     @param[in] x_cb Checkerboarded 4-d space-time index
     @param[in] s_ Fifth-dimension coordinate of the output
     @return The resulting vector at (parity, x_cb, s_)
  */
  template <typename real, int nColor, bool dagger, Dslash5Type type, bool shared, typename Vector, typename Arg>
  __device__ __host__ inline Vector variableInv(Arg &arg, int parity, int x_cb, int s_)
  {
    constexpr int nSpin = 4;
    typedef ColorSpinor<real, nColor, nSpin / 2> HalfVector; // single-chirality (projected) spinor
    Vector in = arg.in(s_ * arg.volume_4d_cb + x_cb, parity);
    Vector out;

    { // first do R
      constexpr int proj_dir = dagger ? -1 : +1;

      // cache this thread's projected spinor for reuse across the block
      if (shared) {
        cache.save(in.project(4, proj_dir));
        cache.sync();
      }

      int s = s_;
      auto R = coeff.c(0); // running coefficient, updated per slice below
      HalfVector r;
      for (int s_count = 0; s_count < arg.Ls; s_count++) {
        // wrapping the fifth-dimension boundary (s_ < s) applies -m_f
        auto factorR = (s_ < s ? -arg.m_f * R : R);

        if (shared) {
          r += factorR * cache.load(threadIdx.x, s, parity);
        } else {
          Vector in = arg.in(s * arg.volume_4d_cb + x_cb, parity);
          r += factorR * in.project(4, proj_dir);
        }

        R *= coeff.b(s);
        s = (s + arg.Ls - 1) % arg.Ls; // step backwards with wrap-around
      }

      out += r.reconstruct(4, proj_dir);
    }

    { // second do L
      constexpr int proj_dir = dagger ? +1 : -1;
      if (shared) {
        cache.sync(); // ensure we finish R before overwriting cache
        cache.save(in.project(4, proj_dir));
        cache.sync();
      }

      int s = s_;
      auto L = coeff.c(0); // running coefficient for the left-handed sweep
      HalfVector l;
      for (int s_count = 0; s_count < arg.Ls; s_count++) {
        auto factorL = (s_ > s ? -arg.m_f * L : L);

        if (shared) {
          l += factorL * cache.load(threadIdx.x, s, parity);
        } else {
          Vector in = arg.in(s * arg.volume_4d_cb + x_cb, parity);
          l += factorL * in.project(4, proj_dir);
        }

        L *= coeff.b(s);
        s = (s + 1) % arg.Ls; // step forwards with wrap-around
      }

      out += l.reconstruct(4, proj_dir);
    }

    return out;
  }
420 
  /**
     @brief Apply the M5 inverse operator at a given site on the lattice,
     dispatching to the variable-coefficient or constant-coefficient
     algorithm, then optionally applying xpay before storing the result.
     @param[in,out] arg Argument struct containing meta data and accessors
     @param[in] parity Parity we are acting on
     @param[in] x_cb Checkerboarded 4-d space-time index
     @param[in] s Fifth-dimension coordinate
  */
  template <typename Float, int nColor, bool dagger, bool xpay, Dslash5Type type, bool shared, bool var_inverse, typename Arg>
  __device__ __host__ inline void dslash5inv(Arg &arg, int parity, int x_cb, int s)
  {
    constexpr int nSpin = 4;
    typedef typename mapper<Float>::type real;

    Vector out;
    if (var_inverse) { // zMobius, must call variableInv
      out = variableInv<real, nColor, dagger, type, shared, Vector>(arg, parity, x_cb, s);
    } else {
      out = constantInv<real, nColor, dagger, type, shared, Vector>(arg, parity, x_cb, s);
    }

    if (xpay) {
      // out = x + a(s) * M5^{-1} in
      Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
      out = x + coeff.a(s) * out;
    }

    arg.out(s * arg.volume_4d_cb + x_cb, parity) = out;
  }
454 
462  template <typename Float, int nColor, bool dagger, bool xpay, Dslash5Type type, bool shared, bool var_inverse, typename Arg>
463  __global__ void dslash5invGPU(Arg arg)
464  {
465  int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
466  int s = blockIdx.y * blockDim.y + threadIdx.y;
467  int parity = blockIdx.z * blockDim.z + threadIdx.z;
468 
469  if (x_cb >= arg.volume_4d_cb) return;
470  if (s >= arg.Ls) return;
471  if (parity >= arg.nParity) return;
472 
473  dslash5inv<Float, nColor, dagger, xpay, type, shared, var_inverse>(arg, parity, x_cb, s);
474  }
475 
476 } // namespace quda
__device__ __host__ coeff_type(const Arg &arg)
static __constant__ char mobius_d[size]
__global__ void dslash5GPU(Arg arg)
GPU kernel for applying the D5 operator.
complex< real > b[QUDA_MAX_DWF_LS]
coeff_5< real > * mobius_h
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:96
#define errorQuda(...)
Definition: util_quda.h:121
__device__ __host__ coeff_type(const Arg &arg)
__device__ __host__ real __fast_pow(real a, int b)
Definition: math_helper.cuh:15
cudaStream_t * streams
Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5, const Complex *b_5_, const Complex *c_5_, double a_, bool dagger, Dslash5Type type)
Helper class for grabbing the constant struct, whether we are on the GPU or CPU.
const int Nstream
Definition: quda_internal.h:83
static int R[4]
Helper math routines used in QUDA.
__device__ __host__ const coeff_5< real > * coeff() const
Helper function for grabbing the constant struct, whether we are on the GPU or CPU.
Structure containing zMobius / Zolotarev coefficients.
__global__ void dslash5invGPU(Arg arg)
CPU kernel for applying the M5 inverse operator.
mapper< Float >::type real
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
int Ls
Definition: test_util.cpp:38
__device__ __host__ Vector variableInv(Arg &arg, int parity, int x_cb, int s_)
Apply the M5 inverse operator at a given site on the lattice. This is an alternative algorithm that i...
__device__ __host__ real b(int s)
__device__ __host__ void dslash5inv(Arg &arg, int parity, int x_cb, int s)
Apply the M5 inverse operator at a given site on the lattice.
__device__ __host__ complex< real > a(int s)
const int nColor
Definition: covdev_test.cpp:75
cpuColorSpinorField * in
__device__ __host__ real a(int s)
constexpr int size
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
__device__ Vector load(int x, int y, int z)
Load a vector from the shared memory cache.
complex< real > a[QUDA_MAX_DWF_LS]
int X[4]
Definition: covdev_test.cpp:70
std::complex< double > Complex
Definition: quda_internal.h:46
const int_fastdiv Ls
__device__ __host__ real c(int s)
__device__ void sync()
Synchronize the cache.
complex< real > c[QUDA_MAX_DWF_LS]
cpuColorSpinorField * out
const int nParity
Definition: spinor_noise.cu:25
Dslash5Type
Definition: dslash_quda.h:396
__shared__ float s[]
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
__device__ __host__ Vector constantInv(Arg &arg, int parity, int x_cb, int s_)
Apply the M5 inverse operator at a given site on the lattice. This is the original algorithm as descr...
Class which wraps around a shared memory cache for a Vector type, where each thread in the thread blo...
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
VectorXcd Vector
colorspinor_mapper< Float, 4, nColor >::type F
__device__ void save(const Vector &a)
Save the vector into the 3-d shared memory cache. Implicitly store the vector at coordinates given by...
__device__ __host__ complex< real > c(int s)
Parameter structure for applying the Dslash.
QudaDagType dagger
Definition: test_util.cpp:1620
__device__ __host__ complex< real > b(int s)
QudaParity parity
Definition: covdev_test.cpp:54
QudaFieldOrder FieldOrder() const
void dslash5CPU(Arg &arg)
CPU kernel for applying the D5 operator.
__device__ __host__ void dslash5(Arg &arg, int parity, int x_cb, int s)
Apply the D5 operator at given site.