18 constexpr
int size = 4096;
23 static constexpr
bool value =
false;
26 static constexpr
bool value =
true;
29 static constexpr
bool value =
true;
32 static constexpr
bool value =
true;
39 template <
typename real,
bool is_variable,
typename Arg>
class coeff_type 44 __device__ __host__ real
a(
int s) {
return arg.a; }
45 __device__ __host__ real
b(
int s) {
return arg.b; }
46 __device__ __host__ real
c(
int s) {
return arg.c; }
52 template <
typename real,
typename Arg>
class coeff_type<real, true,
Arg>
70 __device__ __host__ complex<real>
a(
int s) {
return coeff()->a[
s]; }
71 __device__ __host__ complex<real>
b(
int s) {
return coeff()->b[
s]; }
72 __device__ __host__ complex<real>
c(
int s) {
return coeff()->c[
s]; }
78 template <
typename Float,
int nColor>
struct Dslash5Arg {
109 nParity(in.SiteSubset()),
110 volume_cb(in.VolumeCB()),
111 volume_4d_cb(volume_cb / in.
X(4)),
117 xpay(a_ == 0.0 ? false : true),
126 auto *a_5 = mobius_h->
a;
127 auto *b_5 = mobius_h->
b;
128 auto *c_5 = mobius_h->
c;
133 for (
int s = 0;
s <
Ls;
s++) {
135 c_5[
s] = 0.5 * c_5_[
s];
138 a_5[
s] = 0.5 / (b_5_[
s] * (m_5 + 4.0) + 1.0);
139 a_5[
s] *= a_5[
s] *
static_cast<real
>(
a);
143 for (
int s = 0;
s <
Ls;
s++) {
145 c_5[
s] = 0.5 * (c_5_[
s] * (m_5 + 4.0) - 1.0) / (b_5_[
s] * (m_5 + 4.0) + 1.0);
148 a_5[
s] = 0.5 / (b_5_[
s] * (m_5 + 4.0) + 1.0);
149 a_5[
s] *= a_5[
s] *
static_cast<real
>(
a);
153 b = 2.0 * (0.5 / (5.0 + m_5));
154 c = 0.5 / (1.0 +
std::pow(b, (
int)Ls) * m_f);
157 b = -(c_5_[0].real() * (4.0 + m_5) - 1.0) / (b_5_[0].real() * (4.0 + m_5) + 1.0);
158 c = 0.5 / (1.0 +
std::pow(b, (
int)Ls) * m_f);
159 a *=
pow(0.5 / (b_5_[0].real() * (m_5 + 4.0) + 1.0), 2);
163 for (
int s = 0;
s <
Ls;
s++) {
164 b_5[
s] = -(c_5_[
s] * (4.0 + m_5) - 1.0) / (b_5_[
s] * (4.0 + m_5) + 1.0);
167 c_5[0] = 0.5 / (1.0 + k * m_f);
169 for (
int s = 0;
s <
Ls;
s++) {
170 a_5[
s] = 0.5 / (b_5_[
s] * (m_5 + 4.0) + 1.0);
171 a_5[
s] *= a_5[
s] *
static_cast<real
>(
a);
174 default:
errorQuda(
"Unknown Dslash5Type %d", type);
190 template <
typename Float,
int nColor,
bool dagger,
bool xpay, Dslash5Type type,
typename Arg>
200 const int fwd_idx = ((s + 1) % arg.Ls) * arg.volume_4d_cb + x_cb;
201 const Vector
in = arg.in(fwd_idx, parity);
202 constexpr
int proj_dir =
dagger ? +1 : -1;
203 if (s == arg.Ls - 1) {
204 out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir);
206 out += in.project(4, proj_dir).reconstruct(4, proj_dir);
211 const int back_idx = ((s + arg.Ls - 1) % arg.Ls) * arg.volume_4d_cb + x_cb;
212 const Vector
in = arg.in(back_idx, parity);
213 constexpr
int proj_dir =
dagger ? -1 : +1;
215 out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir);
217 out += in.project(4, proj_dir).reconstruct(4, proj_dir);
222 Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
223 out = x + arg.a *
out;
225 Vector diagonal = arg.in(s * arg.volume_4d_cb + x_cb, parity);
226 out = coeff.
c(s) * out + coeff.
b(s) * diagonal;
229 Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
230 out = x + coeff.
a(s) *
out;
233 Vector diagonal = arg.in(s * arg.volume_4d_cb + x_cb, parity);
234 out = coeff.
c(s) * out + diagonal;
237 Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
238 out = coeff.
a(s) * x +
out;
242 arg.out(s * arg.volume_4d_cb + x_cb, parity) =
out;
249 template <
typename Float,
int nColor,
bool dagger,
bool xpay, Dslash5Type type,
typename Arg>
253 for (
int s = 0;
s < arg.Ls;
s++) {
254 for (
int x_cb = 0; x_cb < arg.volume_4d_cb; x_cb++) {
255 dslash5<Float, nColor, dagger, xpay, type>(
arg,
parity, x_cb,
s);
265 template <
typename Float,
int nColor,
bool dagger,
bool xpay, Dslash5Type type,
typename Arg>
268 int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
269 int s = blockIdx.y * blockDim.y + threadIdx.y;
270 int parity = blockIdx.z * blockDim.z + threadIdx.z;
272 if (x_cb >= arg.volume_4d_cb)
return;
273 if (s >= arg.Ls)
return;
274 if (parity >= arg.
nParity)
return;
276 dslash5<Float, nColor, dagger, xpay, type>(
arg,
parity, x_cb,
s);
294 template <
typename real,
int nColor,
bool dagger, Dslash5Type type,
bool shared,
typename Vector,
typename Arg>
297 const auto k = arg.b;
298 const auto inv = arg.c;
303 cache.
save(arg.in(s_ * arg.volume_4d_cb + x_cb, parity));
309 for (
int s = 0;
s < arg.Ls;
s++) {
311 Vector in = shared ? cache.
load(threadIdx.x,
s, parity) : arg.in(
s * arg.volume_4d_cb + x_cb, parity);
314 int exp = s_ <
s ? arg.Ls -
s + s_ : s_ -
s;
315 real factorR = inv *
__fast_pow(k, exp) * (s_ < s ? -arg.m_f :
static_cast<real
>(1.0));
316 constexpr
int proj_dir =
dagger ? -1 : +1;
317 out += factorR * (in.project(4, proj_dir)).reconstruct(4, proj_dir);
321 int exp = s_ >
s ? arg.Ls - s_ +
s :
s - s_;
322 real factorL = inv *
__fast_pow(k, exp) * (s_ >
s ? -arg.m_f :
static_cast<real
>(1.0));
323 constexpr
int proj_dir =
dagger ? +1 : -1;
324 out += factorL * (in.project(4, proj_dir)).reconstruct(4, proj_dir);
351 template <
typename real,
int nColor,
bool dagger, Dslash5Type type,
bool shared,
typename Vector,
typename Arg>
354 constexpr
int nSpin = 4;
357 Vector in = arg.in(s_ * arg.volume_4d_cb + x_cb, parity);
363 constexpr
int proj_dir =
dagger ? -1 : +1;
366 cache.
save(in.project(4, proj_dir));
373 for (
int s_count = 0; s_count < arg.Ls; s_count++) {
374 auto factorR = (s_ < s ? -arg.m_f *
R :
R);
377 r += factorR * cache.
load(threadIdx.x, s, parity);
379 Vector in = arg.in(s * arg.volume_4d_cb + x_cb, parity);
380 r += factorR * in.project(4, proj_dir);
384 s = (s + arg.Ls - 1) % arg.Ls;
387 out += r.reconstruct(4, proj_dir);
391 constexpr
int proj_dir =
dagger ? +1 : -1;
394 cache.
save(in.project(4, proj_dir));
401 for (
int s_count = 0; s_count < arg.Ls; s_count++) {
402 auto factorL = (s_ > s ? -arg.m_f * L : L);
405 l += factorL * cache.
load(threadIdx.x, s, parity);
407 Vector in = arg.in(s * arg.volume_4d_cb + x_cb, parity);
408 l += factorL * in.project(4, proj_dir);
412 s = (s + 1) % arg.Ls;
415 out += l.reconstruct(4, proj_dir);
432 template <
typename Float,
int nColor,
bool dagger,
bool xpay, Dslash5Type type,
bool shared,
bool var_inverse,
typename Arg>
435 constexpr
int nSpin = 4;
442 out = variableInv<real, nColor, dagger, type, shared, Vector>(
arg,
parity, x_cb,
s);
444 out = constantInv<real, nColor, dagger, type, shared, Vector>(
arg,
parity, x_cb,
s);
448 Vector x = arg.x(s * arg.volume_4d_cb + x_cb, parity);
449 out = x + coeff.
a(s) *
out;
452 arg.out(s * arg.volume_4d_cb + x_cb, parity) =
out;
462 template <
typename Float,
int nColor,
bool dagger,
bool xpay, Dslash5Type type,
bool shared,
bool var_inverse,
typename Arg>
465 int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
466 int s = blockIdx.y * blockDim.y + threadIdx.y;
467 int parity = blockIdx.z * blockDim.z + threadIdx.z;
469 if (x_cb >= arg.volume_4d_cb)
return;
470 if (s >= arg.Ls)
return;
471 if (parity >= arg.
nParity)
return;
473 dslash5inv<Float, nColor, dagger, xpay, type, shared, var_inverse>(
arg,
parity, x_cb,
s);
__device__ __host__ coeff_type(const Arg &arg)
static __constant__ char mobius_d[size]
__global__ void dslash5GPU(Arg arg)
GPU kernel for applying the D5 operator.
complex< real > b[QUDA_MAX_DWF_LS]
coeff_5< real > * mobius_h
__host__ __device__ ValueType exp(ValueType x)
__device__ __host__ coeff_type(const Arg &arg)
__device__ __host__ real __fast_pow(real a, int b)
Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5, const Complex *b_5_, const Complex *c_5_, double a_, bool dagger, Dslash5Type type)
Helper class for grabbing the constant struct, whether we are on the GPU or CPU.
Helper math routines used in QUDA.
__device__ __host__ const coeff_5< real > * coeff() const
Helper function for grabbing the constant struct, whether we are on the GPU or CPU.
Structure containing zMobius / Zolotarev coefficients.
__global__ void dslash5invGPU(Arg arg)
CPU kernel for applying the M5 inverse operator.
mapper< Float >::type real
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
__device__ __host__ Vector variableInv(Arg &arg, int parity, int x_cb, int s_)
Apply the M5 inverse operator at a given site on the lattice. This is an alternative algorithm that i...
__device__ __host__ real b(int s)
__device__ __host__ void dslash5inv(Arg &arg, int parity, int x_cb, int s)
Apply the M5 inverse operator at a given site on the lattice.
__device__ __host__ complex< real > a(int s)
__device__ __host__ real a(int s)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
__device__ Vector load(int x, int y, int z)
Load a vector from the shared memory cache.
complex< real > a[QUDA_MAX_DWF_LS]
std::complex< double > Complex
__device__ __host__ real c(int s)
__device__ void sync()
Synchronize the cache.
complex< real > c[QUDA_MAX_DWF_LS]
cpuColorSpinorField * out
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
__device__ __host__ Vector constantInv(Arg &arg, int parity, int x_cb, int s_)
Apply the M5 inverse operator at a given site on the lattice. This is the original algorithm as descr...
Class which wraps around a shared memory cache for a Vector type, where each thread in the thread blo...
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor_mapper< Float, 4, nColor >::type F
__device__ void save(const Vector &a)
Save the vector into the 3-d shared memory cache. Implicitly store the vector at coordinates given by...
__device__ __host__ complex< real > c(int s)
Parameter structure for applying the Dslash.
__device__ __host__ complex< real > b(int s)
QudaFieldOrder FieldOrder() const
void dslash5CPU(Arg &arg)
CPU kernel for applying the D5 operator.
__device__ __host__ void dslash5(Arg &arg, int parity, int x_cb, int s)
Apply the D5 operator at given site.