QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash_ndeg_twisted_mass_preconditioned.cuh
Go to the documentation of this file.
1 #pragma once
2 
6 
7 namespace quda
8 {
9 
10  template <typename Float, int nColor, QudaReconstructType reconstruct_>
11  struct NdegTwistedMassArg : WilsonArg<Float, nColor, reconstruct_> {
12  typedef typename mapper<Float>::type real;
13  real a;
14  real b;
15  real c;
16  real a_inv;
17  real b_inv;
18  real c_inv;
19  bool asymmetric;
21  NdegTwistedMassArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double b,
22  double c, bool xpay, const ColorSpinorField &x, int parity, bool dagger, bool asymmetric,
23  const int *comm_override) :
24  WilsonArg<Float, nColor, reconstruct_>(out, in, U, xpay ? 1.0 : 0.0, x, parity, dagger, comm_override),
25  a(a),
26  b(dagger ? -b : b), // if dagger flip the chiral twist
27  c(c),
28  a_inv(1.0 / (a * (1.0 + b * b - c * c))),
29  b_inv(dagger ? b : -b),
30  c_inv(-c),
31  asymmetric(asymmetric)
32  {
33  // set parameters for twisting in the packing kernel
34  if (dagger && !asymmetric) {
38  }
39  }
40  };
41 
47  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool asymmetric, bool xpay,
48  KernelType kernel_type, typename Arg>
49  __device__ __host__ inline void ndegTwistedMass(Arg &arg, int idx, int flavor, int parity)
50  {
51  typedef typename mapper<Float>::type real;
53  typedef ColorSpinor<real, nColor, 2> HalfVector;
54 
55  bool active
56  = kernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only)
57  int thread_dim; // which dimension is thread working on (fused kernel only)
58  int coord[nDim];
59  int x_cb = getCoords<nDim, QUDA_4D_PC, kernel_type>(coord, arg, idx, parity, thread_dim);
60 
61  const int my_spinor_parity = nParity == 2 ? parity : 0;
62  Vector out;
63 
64  if (!dagger || asymmetric) // defined in dslash_wilson.cuh
65  applyWilson<Float, nDim, nColor, nParity, dagger, kernel_type>(
66  out, arg, coord, x_cb, flavor, parity, idx, thread_dim, active);
67  else // defined in dslash_twisted_mass_preconditioned
68  applyWilsonTM<Float, nDim, nColor, nParity, dagger, 2, kernel_type>(
69  out, arg, coord, x_cb, flavor, parity, idx, thread_dim, active);
70 
71  int my_flavor_idx = x_cb + flavor * arg.dc.volume_4d_cb;
72 
73  if (xpay && kernel_type == INTERIOR_KERNEL) {
74 
75  if (!dagger || asymmetric) { // apply inverse twist which is undone below
76  // use consistent load order across s to ensure better cache locality
77  Vector x0 = arg.x(x_cb + 0 * arg.dc.volume_4d_cb, my_spinor_parity);
78  Vector x1 = arg.x(x_cb + 1 * arg.dc.volume_4d_cb, my_spinor_parity);
79  if (flavor == 0)
80  out += arg.a_inv * (x0 + arg.b_inv * x0.igamma(4) + arg.c_inv * x1);
81  else
82  out += arg.a_inv * (x1 - arg.b_inv * x1.igamma(4) + arg.c_inv * x0);
83  } else {
84  Vector x = arg.x(my_flavor_idx, my_spinor_parity);
85  out += x; // just directly add since twist already applied in the dslash
86  }
87 
88  } else if (kernel_type != INTERIOR_KERNEL && active) {
89  // if we're not the interior kernel, then we must sum the partial
90  Vector x = arg.out(my_flavor_idx, my_spinor_parity);
91  out += x;
92  }
93 
94  if (isComplete<kernel_type>(arg, coord) && active) {
95  if (!dagger || asymmetric) { // apply A^{-1} to D*in
97  // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it
98  cache.save(out);
99  cache.sync(); // safe to sync in here since other threads will exit
100 
101  if (flavor == 0)
102  out = arg.a * (out + arg.b * out.igamma(4) + arg.c * cache.load(threadIdx.x, 1, threadIdx.z));
103  else
104  out = arg.a * (out - arg.b * out.igamma(4) + arg.c * cache.load(threadIdx.x, 0, threadIdx.z));
105  }
106  }
107 
108  if (kernel_type != EXTERIOR_KERNEL_ALL || active) arg.out(my_flavor_idx, my_spinor_parity) = out;
109  }
110 
111  // CPU kernel for applying the non-degenerate twisted-mass operator to a vector
112  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
114  {
115  if (arg.asymmetric) {
116  for (int parity = 0; parity < nParity; parity++) {
117  // for full fields then set parity from loop else use arg setting
118  parity = nParity == 2 ? parity : arg.parity;
119 
120  for (int x_cb = 0; x_cb < arg.threads; x_cb++) { // 4-d volume
121  for (int flavor = 0; flavor < 2; flavor++) {
122  ndegTwistedMass<Float, nDim, nColor, nParity, dagger, true, xpay, kernel_type>(arg, x_cb, flavor, parity);
123  }
124  } // 4-d volumeCB
125  } // parity
126  } else {
127  for (int parity = 0; parity < nParity; parity++) {
128  // for full fields then set parity from loop else use arg setting
129  parity = nParity == 2 ? parity : arg.parity;
130 
131  for (int x_cb = 0; x_cb < arg.threads; x_cb++) { // 4-d volume
132  for (int flavor = 0; flavor < 2; flavor++) {
133  ndegTwistedMass<Float, nDim, nColor, nParity, dagger, false, xpay, kernel_type>(arg, x_cb, flavor, parity);
134  }
135  } // 4-d volumeCB
136  } // parity
137  }
138  }
139 
140  // GPU Kernel for applying the non-degenerate twisted-mass operator to a vector
141  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
143  {
144  int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
145  if (x_cb >= arg.threads) return;
146 
147  // for this operator flavor can be spread onto different blocks
148  int flavor = blockIdx.y * blockDim.y + threadIdx.y;
149 
150  // for full fields set parity from z thread index else use arg setting
151  int parity = nParity == 2 ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;
152 
153  if (arg.asymmetric) {
154  // constrain template instantiation for compilation (asymmetric implies dagger and !xpay)
155  switch (parity) {
156  case 0:
157  ndegTwistedMass<Float, nDim, nColor, nParity, true, true, false, kernel_type>(arg, x_cb, flavor, 0);
158  break;
159  case 1:
160  ndegTwistedMass<Float, nDim, nColor, nParity, true, true, false, kernel_type>(arg, x_cb, flavor, 1);
161  break;
162  }
163  } else {
164  switch (parity) {
165  case 0:
166  ndegTwistedMass<Float, nDim, nColor, nParity, dagger, false, xpay, kernel_type>(arg, x_cb, flavor, 0);
167  break;
168  case 1:
169  ndegTwistedMass<Float, nDim, nColor, nParity, dagger, false, xpay, kernel_type>(arg, x_cb, flavor, 1);
170  break;
171  }
172  }
173  }
174 
175 } // namespace quda
KernelType kernel_type
NdegTwistedMassArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double b, double c, bool xpay, const ColorSpinorField &x, int parity, bool dagger, bool asymmetric, const int *comm_override)
__device__ __host__ void ndegTwistedMass(Arg &arg, int idx, int flavor, int parity)
Apply the twisted-mass dslash out(x) = M*in = a * D * in + (1 + i*b*gamma_5*tau_3 + c*tau_1)*x Note t...
Parameter structure for driving the Wilson operator.
const int nColor
Definition: covdev_test.cpp:75
__device__ Vector load(int x, int y, int z)
Load a vector from the shared memory cache.
__global__ void ndegTwistedMassPreconditionedGPU(Arg arg)
__device__ void sync()
Synchronize the cache.
Class which wraps around a shared memory cache for a Vector type, where each thread in the thread blo...
void ndegTwistedMassPreconditionedCPU(Arg arg)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
VectorXcd Vector
__device__ void save(const Vector &a)
Save the vector into the 3-d shared memory cache. Implicitly store the vector at coordinates given by...