QUDA  0.9.0
covDev.cu
Go to the documentation of this file.
1 #include <transfer.h>
2 #include <gauge_field_order.h>
4 #include <index_helper.cuh>
5 #include <stencil.h>
6 #include <color_spinor.h>
7 
12 namespace quda {
13 
14 #ifdef GPU_CONTRACT
15 
19  template <typename Float, int nSpin, int nColor, QudaReconstructType reconstruct>
20  struct CovDevArg {
21  typedef typename colorspinor_mapper<Float,nSpin,nColor>::type F;
22  typedef typename gauge_mapper<Float,reconstruct>::type G;
23 
24  F out; // output vector field
25  const F in; // input vector field
26  const G U; // the gauge field
27  const int parity; // only use this for single parity fields
28  const int nParity; // number of parities we're working on
29  const int nFace; // hard code to 1 for now
30  const int dim[5]; // full lattice dimensions
31  const int commDim[4]; // whether a given dimension is partitioned or not
32  const int volumeCB; // checkerboarded volume
33  const int mu; // direction of the covariant derivative
34 
35  CovDevArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const int parity, const int mu)
36  : out(out), in(in), U(U), parity(parity), mu(mu), nParity(in.SiteSubset()), nFace(1),
37  dim{ (3-nParity) * in.X(0), in.X(1), in.X(2), in.X(3), 1 },
39  volumeCB(in.VolumeCB())
40  {
41  if (!U.isNative())
42  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
43  }
44  };
45 
56  template <typename Float, int nDim, int nColor, int mu, typename Vector, typename Arg>
57  __device__ __host__ inline void applyCovDev(Vector &out, Arg &arg, int x_cb, int parity) {
58  typedef Matrix<complex<Float>,nColor> Link;
59  const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;
60 
61  int coord[5];
62  getCoords(coord, x_cb, arg.dim, parity);
63  coord[4] = 0;
64 
65  const int d = mu%4;
66 
67  if (mu < 4) {
68  //Forward gather - compute fwd offset for vector fetch
69  const int fwd_idx = linkIndexP1(coord, arg.dim, d);
70 
71  if ( arg.commDim[d] && (coord[d] + arg.nFace >= arg.dim[d]) ) {
72  const int ghost_idx = ghostFaceIndex<1>(coord, arg.dim, d, arg.nFace);
73 
74  const Link U = arg.U(d, x_cb, parity);
75  const Vector in = arg.in.Ghost(d, 1, ghost_idx, their_spinor_parity);
76 
77  out += U * in;
78  } else {
79 
80  const Link U = arg.U(d, x_cb, parity);
81  const Vector in = arg.in(fwd_idx, their_spinor_parity);
82 
83  out += U * in;
84  }
85  } else {
86  //Backward gather - compute back offset for spinor and gauge fetch
87  const int back_idx = linkIndexM1(coord, arg.dim, d);
88  const int gauge_idx = back_idx;
89 
90  if ( arg.commDim[d] && (coord[d] - arg.nFace < 0) ) {
91  const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, d, arg.nFace);
92 
93  const Link U = arg.U.Ghost(d, ghost_idx, 1-parity);
94  const Vector in = arg.in.Ghost(d, 0, ghost_idx, their_spinor_parity);
95 
96  out += conj(U) * in;
97  } else {
98 
99  const Link U = arg.U(d, gauge_idx, 1-parity);
100  const Vector in = arg.in(back_idx, their_spinor_parity);
101 
102  out += conj(U) * in;
103  }
104  } // Forward/backward derivative
105 
106  }
107 
108 
109  //out(x) = M*in
110  template <typename Float, int nDim, int nSpin, int nColor, typename Arg>
111  __device__ __host__ inline void covDev(Arg &arg, int x_cb, int parity)
112  {
113  typedef ColorSpinor<Float,nColor,nSpin> Vector;
114  Vector out;
115 
116  switch (arg.mu) {
117  case 0: applyCovDev<Float,nDim,nColor,0>(out, arg, x_cb, parity); break;
118  case 1: applyCovDev<Float,nDim,nColor,1>(out, arg, x_cb, parity); break;
119  case 2: applyCovDev<Float,nDim,nColor,2>(out, arg, x_cb, parity); break;
120  case 3: applyCovDev<Float,nDim,nColor,3>(out, arg, x_cb, parity); break;
121  case 4: applyCovDev<Float,nDim,nColor,4>(out, arg, x_cb, parity); break;
122  case 5: applyCovDev<Float,nDim,nColor,5>(out, arg, x_cb, parity); break;
123  case 6: applyCovDev<Float,nDim,nColor,6>(out, arg, x_cb, parity); break;
124  case 7: applyCovDev<Float,nDim,nColor,7>(out, arg, x_cb, parity); break;
125  }
126  arg.out(x_cb, parity) = out;
127  }
128 
129  // CPU kernel for applying the Laplace operator to a vector
130  template <typename Float, int nDim, int nSpin, int nColor, typename Arg>
131  void covDevCPU(Arg arg)
132  {
133 
134  for (int parity= 0; parity < arg.nParity; parity++) {
135  // for full fields then set parity from loop else use arg setting
136  parity = (arg.nParity == 2) ? parity : arg.parity;
137 
138  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
139  covDev<Float,nDim,nSpin,nColor>(arg, x_cb, parity);
140  } // 4-d volumeCB
141  } // parity
142 
143  }
144 
145  // GPU Kernel for applying the Laplace operator to a vector
146  template <typename Float, int nDim, int nSpin, int nColor, typename Arg>
147  __global__ void covDevGPU(Arg arg)
148  {
149  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
150 
151  // for full fields set parity from y thread index else use arg setting
152  int parity = blockDim.y*blockIdx.y + threadIdx.y;
153 
154  if (x_cb >= arg.volumeCB) return;
155  if (parity >= arg.nParity) return;
156 
157  covDev<Float,nDim,nSpin,nColor>(arg, x_cb, parity);
158  }
159 
160  template <typename Float, int nDim, int nSpin, int nColor, typename Arg>
161  class CovDev : public TunableVectorY {
162 
163  protected:
164  Arg &arg;
165  const ColorSpinorField &meta;
166 
167  long long flops() const
168  {
169  return 8*nColor*nColor*arg.nParity*(long long)meta.VolumeCB();
170  }
171  long long bytes() const
172  {
173  return arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.U.Bytes()*meta.VolumeCB();
174  }
175  bool tuneGridDim() const { return false; }
176  unsigned int minThreads() const { return arg.volumeCB; }
177 
178  public:
179  CovDev(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
180  {
181  strcpy(aux, meta.AuxString());
182 #ifdef MULTI_GPU
183  char comm[5];
184  comm[0] = (arg.commDim[0] ? '1' : '0');
185  comm[1] = (arg.commDim[1] ? '1' : '0');
186  comm[2] = (arg.commDim[2] ? '1' : '0');
187  comm[3] = (arg.commDim[3] ? '1' : '0');
188  comm[4] = '\0';
189  strcat(aux,",comm=");
190  strcat(aux,comm);
191 #endif
192  }
193  virtual ~CovDev() { }
194 
195  void apply(const cudaStream_t &stream) {
196  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
197  covDevCPU<Float,nDim,nSpin,nColor>(arg);
198  } else {
199  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
200  covDevGPU<Float,nDim,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
201  }
202  }
203 
204  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
205  };
206 
207 
208  template <typename Float, int nColor, QudaReconstructType recon>
209  void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int parity, int mu)
210  {
211  constexpr int nDim = 4;
212  if (in.Nspin() == 1) {
213  constexpr int nSpin = 1;
214  CovDevArg<Float,nSpin,nColor,recon> arg(out, in, U, parity, mu);
215  CovDev<Float,nDim,nSpin,nColor,CovDevArg<Float,nSpin,nColor,recon> > myCovDev(arg, in);
216  myCovDev.apply(0);
217  } else if (in.Nspin() == 4) {
218  constexpr int nSpin = 4;
219  CovDevArg<Float,nSpin,nColor,recon> arg(out, in, U, parity, mu);
220  CovDev<Float,nDim,nSpin,nColor,CovDevArg<Float,nSpin,nColor,recon> > myCovDev(arg, in);
221  myCovDev.apply(0);
222  } else {
223  errorQuda("Unsupported nSpin=%d", in.Nspin());
224  }
225  }
226 
227  // template on the gauge reconstruction
228  template <typename Float, int nColor>
229  void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int parity, int mu)
230  {
231  if (U.Reconstruct()== QUDA_RECONSTRUCT_NO) {
232  ApplyCovDev<Float,nColor,QUDA_RECONSTRUCT_NO>(out, in, U, parity, mu);
233  } else if (U.Reconstruct()== QUDA_RECONSTRUCT_12) {
234  ApplyCovDev<Float,nColor,QUDA_RECONSTRUCT_12>(out, in, U, parity, mu);
235  } else if (U.Reconstruct()== QUDA_RECONSTRUCT_8) {
236  ApplyCovDev<Float,nColor,QUDA_RECONSTRUCT_8> (out, in, U, parity, mu);
237  } else {
238  errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct());
239  }
240  }
241 
242  // template on the number of colors
243  template <typename Float>
244  void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int parity, int mu)
245  {
246  if (in.Ncolor() == 3) {
247  ApplyCovDev<Float,3>(out, in, U, parity, mu);
248  } else {
249  errorQuda("Unsupported number of colors %d\n", U.Ncolor());
250  }
251  }
252 
253  // this is the Worker pointer that may have issue additional work
254  // while we're waiting on communication to finish
255  namespace dslash {
256  extern Worker* aux_worker;
257  }
258 
259 #endif // GPU_CONTRACT
260 
261  //Apply the covariant derivative operator
262  //out(x) = U_{\mu}(x)in(x+mu) for mu = 0...3
263  //out(x) = U^\dagger_mu'(x-mu')in(x-mu') for mu = 4...7 and we set mu' = mu-4
265  {
266 #ifdef GPU_CONTRACT
267  if (in.V() == out.V()) errorQuda("Aliasing pointers");
268  if (in.FieldOrder() != out.FieldOrder())
269  errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());
270 
271  // check all precision match
272  checkPrecision(out, in, U);
273 
274  // check all locations match
275  checkLocation(out, in, U);
276 
277  const int nFace = 1;
278  in.exchangeGhost((QudaParity)(1-parity), nFace, 0); // last parameter is dummy
279 
281 
282  if (U.Precision() == QUDA_DOUBLE_PRECISION) {
283  ApplyCovDev<double>(out, in, U, parity, mu);
284  } else if (U.Precision() == QUDA_SINGLE_PRECISION) {
285  ApplyCovDev<float>(out, in, U, parity, mu);
286  } else {
287  errorQuda("Unsupported precision %d\n", U.Precision());
288  }
289 
290  in.bufferIndex = (1 - in.bufferIndex);
291 #else
292  errorQuda("Contraction kernels have not been built");
293 #endif
294  }
295 
296 } // namespace quda
virtual void apply(const cudaStream_t &stream)=0
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const
This is a unified ghost exchange function for doing a complete halo exchange regardless of the type o...
static int bufferIndex
Worker * aux_worker
Definition: dslash_quda.cu:78
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int parity, int mu)
Driver for applying the covariant derivative.
Definition: covDev.cu:264
const int nColor
Definition: covdev_test.cpp:77
cpuColorSpinorField * in
for(int s=0;s< param.dc.Ls;s++)
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
Main header file for host and device accessors to GaugeFields.
enum QudaParity_s QudaParity
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
VectorXcd Vector
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
QudaParity parity
Definition: covdev_test.cpp:53
QudaFieldOrder FieldOrder() const
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
void covDev(cudaColorSpinorField *out, cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int mu, TimeProfile &profile)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)