QUDA  0.9.0
laplace.cu
Go to the documentation of this file.
1 #include <transfer.h>
2 #include <gauge_field_order.h>
4 #include <index_helper.cuh>
5 #include <stencil.h>
6 #include <color_spinor.h>
7 
12 namespace quda {
13 
17  template <typename Float, int nColor, QudaReconstructType reconstruct, bool xpay>
18  struct LaplaceArg {
21 
22  F out; // output vector field
23  const F in; // input vector field
24  const F x; // input vector when doing xpay
25  const G U; // the gauge field
26  const Float kappa; // kappa parameter = 1/(8+m)
27  const int parity; // only use this for single parity fields
28  const int nParity; // number of parities we're working on
29  const int nFace; // hard code to 1 for now
30  const int dim[5]; // full lattice dimensions
31  const int commDim[4]; // whether a given dimension is partitioned or not
32  const int volumeCB; // checkerboarded volume
33 
34  __host__ __device__ static constexpr bool isXpay() { return xpay; }
35 
37  Float kappa, const ColorSpinorField *x, int parity)
38  : out(out), in(in), U(U), kappa(kappa), x(xpay ? *x : in), parity(parity), nParity(in.SiteSubset()), nFace(1),
39  dim{ (3-nParity) * in.X(0), in.X(1), in.X(2), in.X(3), 1 },
41  volumeCB(in.VolumeCB())
42  {
43  if (in.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER || !U.isNative())
44  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
45  }
46  };
47 
58  template <typename Float, int nDim, int nColor, typename Vector, typename Arg>
59  __device__ __host__ inline void applyLaplace(Vector &out, Arg &arg, int x_cb, int parity) {
60  typedef Matrix<complex<Float>,nColor> Link;
61  const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;
62 
63  int coord[5];
64  getCoords(coord, x_cb, arg.dim, parity);
65  coord[4] = 0;
66 
67 #pragma unroll
68  for (int d = 0; d<nDim; d++) // loop over dimension
69  {
70  //Forward gather - compute fwd offset for vector fetch
71  const int fwd_idx = linkIndexP1(coord, arg.dim, d);
72 
73  if ( arg.commDim[d] && (coord[d] + arg.nFace >= arg.dim[d]) ) {
74  const int ghost_idx = ghostFaceIndex<1>(coord, arg.dim, d, arg.nFace);
75 
76  const Link U = arg.U(d, x_cb, parity);
77  const Vector in = arg.in.Ghost(d, 1, ghost_idx, their_spinor_parity);
78 
79  out += U * in;
80  } else {
81 
82  const Link U = arg.U(d, x_cb, parity);
83  const Vector in = arg.in(fwd_idx, their_spinor_parity);
84 
85  out += U * in;
86  }
87 
88  //Backward gather - compute back offset for spinor and gauge fetch
89  const int back_idx = linkIndexM1(coord, arg.dim, d);
90  const int gauge_idx = back_idx;
91 
92  if ( arg.commDim[d] && (coord[d] - arg.nFace < 0) ) {
93  const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, d, arg.nFace);
94 
95  const Link U = arg.U.Ghost(d, ghost_idx, 1-parity);
96  const Vector in = arg.in.Ghost(d, 0, ghost_idx, their_spinor_parity);
97 
98  out += conj(U) * in;
99  } else {
100 
101  const Link U = arg.U(d, gauge_idx, 1-parity);
102  const Vector in = arg.in(back_idx, their_spinor_parity);
103 
104  out += conj(U) * in;
105  }
106  } //nDim
107 
108  }
109 
110 
111  //out(x) = M*in = (-D + m) * in(x-mu)
112  template <typename Float, int nDim, int nColor, typename Arg>
113  __device__ __host__ inline void laplace(Arg &arg, int x_cb, int parity)
114  {
116  Vector out;
117 
118  applyLaplace<Float,nDim,nColor>(out, arg, x_cb, parity);
119 
120  if (arg.isXpay()) {
121  Vector x = arg.x(x_cb, parity);
122  out = x + arg.kappa * out;
123  }
124  arg.out(x_cb, arg.nParity == 2 ? parity : 0) = out;
125  }
126 
127  // CPU kernel for applying the Laplace operator to a vector
128  template <typename Float, int nDim, int nColor, typename Arg>
129  void laplaceCPU(Arg arg)
130  {
131 
132  for (int parity= 0; parity < arg.nParity; parity++) {
133  // for full fields then set parity from loop else use arg setting
134  parity = (arg.nParity == 2) ? parity : arg.parity;
135 
136  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
137  laplace<Float,nDim,nColor>(arg, x_cb, parity);
138  } // 4-d volumeCB
139  } // parity
140 
141  }
142 
143  // GPU Kernel for applying the Laplace operator to a vector
144  template <typename Float, int nDim, int nColor, typename Arg>
145  __global__ void laplaceGPU(Arg arg)
146  {
147  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
148 
149  // for full fields set parity from y thread index else use arg setting
150  int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
151 
152  if (x_cb >= arg.volumeCB) return;
153  if (parity >= arg.nParity) return;
154 
155  laplace<Float,nDim,nColor>(arg, x_cb, parity);
156  }
157 
158  template <typename Float, int nDim, int nColor, typename Arg>
159  class Laplace : public TunableVectorY {
160 
161  protected:
162  Arg &arg;
164 
165  long long flops() const
166  {
167  return (2*nDim*(8*nColor*nColor)-2*nColor + (arg.isXpay() ? 2*2*nColor : 0) )*arg.nParity*(long long)meta.VolumeCB();
168  }
169  long long bytes() const
170  {
171  return arg.out.Bytes() + 2*nDim*arg.in.Bytes() + arg.nParity*2*nDim*arg.U.Bytes()*meta.VolumeCB() +
172  (arg.isXpay() ? arg.x.Bytes() : 0);
173  }
174  bool tuneGridDim() const { return false; }
175  unsigned int minThreads() const { return arg.volumeCB; }
176  unsigned int maxBlockSize() const { return deviceProp.maxThreadsPerBlock / arg.nParity; }
177 
178  public:
180  {
181  strcpy(aux, meta.AuxString());
182 #ifdef MULTI_GPU
183  char comm[5];
184  comm[0] = (arg.commDim[0] ? '1' : '0');
185  comm[1] = (arg.commDim[1] ? '1' : '0');
186  comm[2] = (arg.commDim[2] ? '1' : '0');
187  comm[3] = (arg.commDim[3] ? '1' : '0');
188  comm[4] = '\0';
189  strcat(aux,",comm=");
190  strcat(aux,comm);
191 #endif
192  if (arg.isXpay()) strcat(aux,",xpay");
193  }
194  virtual ~Laplace() { }
195 
196  void apply(const cudaStream_t &stream) {
198  laplaceCPU<Float,nDim,nColor>(arg);
199  } else {
200  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
201  laplaceGPU<Float,nDim,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
202  }
203  }
204 
205  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
206  };
207 
208 
209  template <typename Float, int nColor, QudaReconstructType recon>
211  double kappa, const ColorSpinorField *x, int parity)
212  {
213  constexpr int nDim = 4;
214  if (x) {
217  laplace.apply(0);
218  } else {
221  laplace.apply(0);
222  }
223  }
224 
225  // template on the gauge reconstruction
226  template <typename Float, int nColor>
227  void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
228  double kappa, const ColorSpinorField *x, int parity)
229  {
230  if (U.Reconstruct()== QUDA_RECONSTRUCT_NO) {
231  ApplyLaplace<Float,nColor,QUDA_RECONSTRUCT_NO>(out, in, U, kappa, x, parity);
232  } else if (U.Reconstruct()== QUDA_RECONSTRUCT_12) {
233  ApplyLaplace<Float,nColor,QUDA_RECONSTRUCT_12>(out, in, U, kappa, x, parity);
234  } else if (U.Reconstruct()== QUDA_RECONSTRUCT_8) {
235  ApplyLaplace<Float,nColor,QUDA_RECONSTRUCT_8>(out, in, U, kappa, x, parity);
236  } else {
237  errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct());
238  }
239  }
240 
241  // template on the number of colors
242  template <typename Float>
243  void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
244  double kappa, const ColorSpinorField *x, int parity)
245  {
246  if (in.Ncolor() == 3) {
247  ApplyLaplace<Float,3>(out, in, U, kappa, x, parity);
248  } else {
249  errorQuda("Unsupported number of colors %d\n", U.Ncolor());
250  }
251  }
252 
253  // this is the Worker pointer that may have issue additional work
254  // while we're waiting on communication to finish
255  namespace dslash {
256  extern Worker* aux_worker;
257  }
258 
259  //Apply the Laplace operator
260  //out(x) = M*in = - kappa*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
261  //Uses the kappa normalization for the Wilson operator.
262  void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
263  double kappa, const ColorSpinorField *x, int parity)
264  {
265  if (in.V() == out.V()) errorQuda("Aliasing pointers");
266  if (in.FieldOrder() != out.FieldOrder())
267  errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());
268 
269  // check all precisions match
270  checkPrecision(out, in, U);
271 
272  // check all locations match
273  checkLocation(out, in, U);
274 
275  const int nFace = 1;
276  in.exchangeGhost((QudaParity)(1-parity), nFace, 0); // last parameter is dummy
277 
279 
280  if (U.Precision() == QUDA_DOUBLE_PRECISION) {
281  ApplyLaplace<double>(out, in, U, kappa, x, parity);
282  } else if (U.Precision() == QUDA_SINGLE_PRECISION) {
283  ApplyLaplace<float>(out, in, U, kappa, x, parity);
284  } else {
285  errorQuda("Unsupported precision %d\n", U.Precision());
286  }
287 
288  in.bufferIndex = (1 - in.bufferIndex);
289  }
290 
291 
292 } // namespace quda
virtual void apply(const cudaStream_t &stream)=0
dim3 dim3 blockDim
__device__ __host__ void applyLaplace(Vector &out, Arg &arg, int x_cb, int parity)
Definition: laplace.cu:59
void xpay(ColorSpinorField &x, const double &a, ColorSpinorField &y)
Definition: blas_quda.cu:173
bool tuneGridDim() const
Definition: laplace.cu:174
Laplace(Arg &arg, const ColorSpinorField &meta)
Definition: laplace.cu:179
__global__ void laplaceGPU(Arg arg)
Definition: laplace.cu:145
cudaDeviceProp deviceProp
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:90
const int nFace
Definition: laplace.cu:29
const int volumeCB
Definition: laplace.cu:32
cudaStream_t * stream
void laplaceCPU(Arg arg)
Definition: laplace.cu:129
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
virtual ~Laplace()
Definition: laplace.cu:194
void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const
This is a unified ghost exchange function for doing a complete halo exchange regardless of the type o...
const int parity
Definition: laplace.cu:27
__device__ __host__ void laplace(Arg &arg, int x_cb, int parity)
Definition: laplace.cu:113
long long flops() const
Definition: laplace.cu:165
const F in
Definition: laplace.cu:23
__host__ static __device__ constexpr bool isXpay()
Definition: laplace.cu:34
Parameter structure for driving the Laplace operator.
Definition: laplace.cu:18
LaplaceArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Float kappa, const ColorSpinorField *x, int parity)
Definition: laplace.cu:36
static int bufferIndex
VOLATILE spinorFloat kappa
unsigned int minThreads() const
Definition: laplace.cu:175
Worker * aux_worker
Definition: dslash_quda.cu:78
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
const int nColor
Definition: covdev_test.cpp:77
TuneKey tuneKey() const
Definition: laplace.cu:205
cpuColorSpinorField * in
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
Main header file for host and device accessors to GaugeFields.
const int commDim[4]
Definition: laplace.cu:31
enum QudaParity_s QudaParity
QudaFieldLocation Location() const
cpuColorSpinorField * out
const int dim[5]
Definition: laplace.cu:30
if(err !=cudaSuccess)
colorspinor_mapper< Float, 1, nColor >::type F
Definition: laplace.cu:19
const ColorSpinorField & meta
Definition: laplace.cu:163
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
VectorXcd Vector
const int nParity
Definition: laplace.cu:28
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double kappa, const ColorSpinorField *x, int parity)
Driver for applying the Laplace stencil.
Definition: laplace.cu:210
long long bytes() const
Definition: laplace.cu:169
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
void apply(const cudaStream_t &stream)
Definition: laplace.cu:196
QudaParity parity
Definition: covdev_test.cpp:53
gauge_mapper< Float, reconstruct >::type G
Definition: laplace.cu:20
QudaFieldOrder FieldOrder() const
const Float kappa
Definition: laplace.cu:26
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
int comm_dim_partitioned(int dim)
unsigned int maxBlockSize() const
Definition: laplace.cu:176
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)