QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash_domain_wall_4d.cuh
Go to the documentation of this file.
1 #pragma once
2 
4 
5 namespace quda
6 {
7 
8  constexpr int size = 4096;
9  static __constant__ char mobius_d[size]; // constant buffer used for Mobius coefficients for GPU kernel
10 
11  template <typename Float, int nColor, QudaReconstructType reconstruct_>
12  struct DomainWall4DArg : WilsonArg<Float, nColor, reconstruct_> {
13  typedef typename mapper<Float>::type real;
14  int Ls;
15  complex<real> a_5[QUDA_MAX_DWF_LS];
21  inline __device__ __host__ complex<real> a5(int s)
22  {
23 #ifdef __CUDA_ARCH__
24  return reinterpret_cast<const complex<real> *>(mobius_d)[s];
25 #else
26  return a_5[s];
27 #endif
28  }
29 
30  DomainWall4DArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5,
31  const Complex *b_5, const Complex *c_5, bool xpay, const ColorSpinorField &x, int parity, bool dagger,
32  const int *comm_override) :
33  WilsonArg<Float, nColor, reconstruct_>(out, in, U, xpay ? a : 0.0, x, parity, dagger, comm_override),
34  Ls(in.X(4))
35  {
36  if (b_5 == nullptr || c_5 == nullptr)
37  for (int s = 0; s < Ls; s++) a_5[s] = a; // 4-d Shamir
38  else
39  for (int s = 0; s < Ls; s++) a_5[s] = 0.5 * a / (b_5[s] * (m_5 + 4.0) + 1.0); // 4-d Mobius
40  }
41  };
42 
43  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
44  __device__ __host__ inline void domainWall4D(Arg &arg, int idx, int s, int parity)
45  {
46  typedef typename mapper<Float>::type real;
48 
49  bool active
50  = kernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only)
51  int thread_dim; // which dimension is thread working on (fused kernel only)
52  int coord[nDim];
53  int x_cb = getCoords<nDim, QUDA_4D_PC, kernel_type>(coord, arg, idx, parity, thread_dim);
54 
55  const int my_spinor_parity = nParity == 2 ? parity : 0;
56  Vector out;
57  applyWilson<Float, nDim, nColor, nParity, dagger, kernel_type>(
58  out, arg, coord, x_cb, s, parity, idx, thread_dim, active);
59 
60  int xs = x_cb + s * arg.dc.volume_4d_cb;
61  if (xpay && kernel_type == INTERIOR_KERNEL) {
62  Vector x = arg.x(xs, my_spinor_parity);
63  out = x + arg.a5(s) * out;
64  } else if (kernel_type != INTERIOR_KERNEL && active) {
65  Vector x = arg.out(xs, my_spinor_parity);
66  out = x + (xpay ? arg.a5(s) * out : out);
67  }
68 
69  if (kernel_type != EXTERIOR_KERNEL_ALL || active) arg.out(xs, my_spinor_parity) = out;
70  }
71 
72  // CPU Kernel for applying 4-d Wilson operator to a 5-d vector (replicated along fifth dimension)
73  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
75  {
76  for (int parity = 0; parity < nParity; parity++) {
77  // for full fields then set parity from loop else use arg setting
78  parity = nParity == 2 ? parity : arg.parity;
79 
80  for (int x_cb = 0; x_cb < arg.threads; x_cb++) { // 4-d volume
81  for (int s = 0; s < arg.Ls; s++) {
82  domainWall4D<Float, nDim, nColor, nParity, dagger, xpay, kernel_type>(arg, x_cb, s, parity);
83  }
84  } // 4-d volumeCB
85  } // parity
86  }
87 
88  // GPU Kernel for applying 4-d Wilson operator to a 5-d vector (replicated along fifth dimension)
89  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
90  __global__ void domainWall4DGPU(Arg arg)
91  {
92  int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
93  if (x_cb >= arg.threads) return;
94 
95  // for this operator Ls is mapped to the y thread dimension
96  int s = blockIdx.y * blockDim.y + threadIdx.y;
97  if (s >= arg.Ls) return;
98 
99  // for full fields set parity from z thread index else use arg setting
100  int parity = nParity == 2 ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;
101 
102  switch (parity) {
103  case 0: domainWall4D<Float, nDim, nColor, nParity, dagger, xpay, kernel_type>(arg, x_cb, s, 0); break;
104  case 1: domainWall4D<Float, nDim, nColor, nParity, dagger, xpay, kernel_type>(arg, x_cb, s, 1); break;
105  }
106  }
107 
108 } // namespace quda
static __constant__ char mobius_d[size]
KernelType kernel_type
__device__ __host__ complex< real > a5(int s)
Helper function for grabbing the constant struct, whether we are on the GPU or CPU.
DomainWall4DArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5, bool xpay, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override)
Parameter structure for driving the Wilson operator.
__global__ void domainWall4DGPU(Arg arg)
const int nColor
Definition: covdev_test.cpp:75
constexpr int size
mapper< Float >::type real
int X[4]
Definition: covdev_test.cpp:70
std::complex< double > Complex
Definition: quda_internal.h:46
__device__ __host__ void domainWall4D(Arg &arg, int idx, int s, int parity)
complex< real > a_5[QUDA_MAX_DWF_LS]
__shared__ float s[]
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void domainWall4DCPU(Arg &arg)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
VectorXcd Vector