QUDA  0.9.0
clover_deriv_quda.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cuda.h>
4 #include <quda_internal.h>
5 #include <tune_quda.h>
6 #include <gauge_field.h>
7 #include <gauge_field_order.h>
8 #include <quda_matrix.h>
9 #include <index_helper.cuh>
10 #include <cassert>
11 
// On older toolkits (pre-CUDA 8) pass mu/nu as runtime arguments to
// computeForce rather than as template parameters; with CUDA >= 8 the
// fully-unrolled templated variant is used instead (see computeForce below).
#if (CUDA_VERSION < 8000)
#define DYNAMIC_MU_NU
#endif

// Use shared memory for the force accumulator matrix
#define SHARED_ACCUMULATOR


#ifdef DYNAMIC_MU_NU

// When using dynamic mu/nu indexing, to avoid local spills use shared
// memory for the per thread indexing arrays.
// FIXME for reasons I don't understand, the shared array breaks in multi-GPU mode
//#define SHARED_ARRAY

#endif // DYNAMIC_MU_NU
59 
60 namespace quda {
61 
#ifdef SHARED_ACCUMULATOR

/**
   Declare the per-thread force accumulator U as 18 reals (one 3x3
   complex matrix) held in dynamic shared memory, strided by the block
   size so that lane i of each thread lives at s[i*block + tid], and
   zero-initialize it.
   Bug fix: the zero-init loop previously hard-coded the name "force"
   instead of the macro parameter U, so the macro only compiled when
   invoked as DECLARE_LINK(force).  It now uses U as intended; the sole
   call site DECLARE_LINK(force) is unaffected.
 */
#define DECLARE_LINK(U) \
  extern __shared__ int s[]; \
  real *U = (real*)s; \
  { \
    const int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x; \
    const int block = blockDim.x * blockDim.y * blockDim.z; \
    for (int i=0; i<18; i++) U[i*block + tid] = 0.0; \
  }

// in this mode a "link accumulator" parameter is a raw shared-memory pointer
#define LINK real*

  /**
     y += a*x, where x is the block-strided shared-memory accumulator
     (layout written by DECLARE_LINK) and y is a register-resident Link
     matrix of 9 complex entries.
   */
  template <typename real, typename Link>
  __device__ inline void axpy(real a, const real *x, Link &y) {
    const int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    const int block = blockDim.x * blockDim.y * blockDim.z;
#pragma unroll
    for (int i=0; i<9; i++) {
      y.data[i] += a * complex<real>(x[(2*i+0)*block + tid], x[(2*i+1)*block + tid]);
    }
  }

  /** Accumulate a Link matrix into the block-strided shared-memory accumulator. */
  template <typename real, typename Link>
  __device__ inline void operator+=(real *y, const Link &x) {
    const int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    const int block = blockDim.x * blockDim.y * blockDim.z;
#pragma unroll
    for (int i=0; i<9; i++) {
      y[(2*i+0)*block + tid] += x.data[i].real();
      y[(2*i+1)*block + tid] += x.data[i].imag();
    }
  }

  /** Subtract a Link matrix from the block-strided shared-memory accumulator. */
  template <typename real, typename Link>
  __device__ inline void operator-=(real *y, const Link &x) {
    const int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    const int block = blockDim.x * blockDim.y * blockDim.z;
#pragma unroll
    for (int i=0; i<9; i++) {
      y[(2*i+0)*block + tid] -= x.data[i].real();
      y[(2*i+1)*block + tid] -= x.data[i].imag();
    }
  }

#else

// register-resident accumulator: declare a plain Link matrix
#define DECLARE_LINK(U) Link U;

#define LINK Link &

  /** y += a*x for register-resident Link accumulators. */
  template <typename real, typename Link>
  __device__ inline void axpy(real a, const Link &x, Link &y) { y += a*x; }

#endif
117 
#if defined(SHARED_ARRAY) && defined(SHARED_ACCUMULATOR)

// Declare the per-thread 4-element coordinate/shift array d in dynamic
// shared memory, placed after the 18*block reals used by the force
// accumulator (hence the 18*block*sizeof(real)/sizeof(int) offset).
// Each thread gets one int (zeroed) per idx slot, viewed as 4 bytes —
// assumes sizeof(int) == 4 so the unsigned char* covers 4 entries;
// comments must stay outside the macro, since // would eat the
// line-continuation backslash.
#define DECLARE_ARRAY(d, idx) \
 unsigned char *d; \
 { \
 extern __shared__ int s[]; \
 int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x; \
 int block = blockDim.x*blockDim.y*blockDim.z; \
 int offset = 18*block*sizeof(real)/sizeof(int) + idx*block + tid; \
 s[offset] = 0; \
 d = (unsigned char*)&s[offset]; \
 }
#elif defined(SHARED_ARRAY)
#error Cannot use SHARED_ARRAY with SHARED_ACCUMULATOR
#else
// Fallback: a plain zero-initialized local array (idx is unused here).
#define DECLARE_ARRAY(d, idx) \
 int d[4] = {0, 0, 0, 0};
#endif
136 
137 
138 #ifdef GPU_CLOVER_DIRAC
139 
140  template<class Float, typename Force, typename Gauge, typename Oprod>
141  struct CloverDerivArg
142  {
143  int X[4];
144  int E[4];
145  int border[4];
146  Float coeff;
147  int parity;
148  int volumeCB;
149 
150  Force force;
151  Gauge gauge;
152  Oprod oprod;
153 
154  CloverDerivArg(const Force& force, const Gauge& gauge, const Oprod& oprod,
155  const int *X_, const int *E_,
156  double coeff, int parity) :
157  coeff(coeff), parity(parity), volumeCB(force.volumeCB),
158  force(force), gauge(gauge), oprod(oprod)
159  {
160  for(int dir=0; dir<4; ++dir) {
161  this->X[dir] = X_[dir];
162  this->E[dir] = E_[dir];
163  this->border[dir] = (E_[dir] - X_[dir])/2;
164  }
165  }
166  };
167 
168 
// Accumulate into `force` the four "leaves" of gauge-link / outer-product
// staples associated with the (mu,nu) plane at the site selected by
// xIndex.  yIndex selects which parity's force contribution is computed
// (0 = arg.parity site, 1 = opposite-parity site).  With DYNAMIC_MU_NU
// the plane indices arrive as runtime arguments; otherwise they are
// template parameters so each (mu,nu) instantiation is fully unrolled.
#ifdef DYNAMIC_MU_NU
 template <typename real, typename Arg, typename Link>
 __device__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex, int mu, int nu) {
#else
 template <typename real, typename Arg, int mu, int nu, typename Link>
 __device__ __forceinline__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex) {
#endif

 int otherparity = (1-arg.parity);

 // linear index of the unordered (mu,nu) pair into the tensor-geometry
 // oprod field: tidx = max*(max-1)/2 + min for max = max(mu,nu)
 const int tidx = mu > nu ? (mu-1)*mu/2 + nu : (nu-1)*nu/2 + mu;

 if (yIndex == 0) { // do "this" force

 // x = extended-lattice coordinates of this arg.parity site
 DECLARE_ARRAY(x, 1);
 getCoordsExtended(x, xIndex, arg.X, arg.parity, arg.border);

 // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
 {
 // d = temporary shift vector; each d[..]++ / d[..]-- pair below
 // moves to a neighboring site and then restores the origin
 DECLARE_ARRAY(d,0);

 // load U(x)_(+mu)
 Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);

 // load U(x+mu)_(+nu)
 d[mu]++;
 Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
 d[mu]--;

 // load U(x+nu)_(+mu)
 d[nu]++;
 Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
 d[nu]--;

 // load U(x)_(+nu)
 Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);

 // load Oprod
 Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);

 // sign flips with the ordering of (mu,nu) since oprod stores only
 // the antisymmetric combination
 if (nu < mu) force -= U1*U2*conj(U3)*conj(U4)*Oprod1;
 else force += U1*U2*conj(U3)*conj(U4)*Oprod1;

 d[mu]++; d[nu]++;
 Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
 d[mu]--; d[nu]--;

 if (nu < mu) force -= U1*U2*Oprod2*conj(U3)*conj(U4);
 else force += U1*U2*Oprod2*conj(U3)*conj(U4);
 }

 // lower (x-nu) leaf for "this" parity
 {
 DECLARE_ARRAY(d,0);

 // load U(x-nu)(+nu)
 d[nu]--;
 Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
 d[nu]++;

 // load U(x-nu)(+mu)
 d[nu]--;
 Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
 d[nu]++;

 // load U(x+mu-nu)(nu)
 d[mu]++; d[nu]--;
 Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
 d[mu]--; d[nu]++;

 // load U(x)_(+mu)
 Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);

 d[mu]++; d[nu]--;
 Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
 d[mu]--; d[nu]++;

 if (nu < mu) force += conj(U1)*U2*Oprod1*U3*conj(U4);
 else force -= conj(U1)*U2*Oprod1*U3*conj(U4);

 // Oprod at the origin (d has been restored to zero shift)
 Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);

 if (nu < mu) force += Oprod4*conj(U1)*U2*U3*conj(U4);
 else force -= Oprod4*conj(U1)*U2*U3*conj(U4);
 }

 } else { // else do other force

 // y = extended-lattice coordinates of the opposite-parity site
 DECLARE_ARRAY(y, 1);
 getCoordsExtended(y, xIndex, arg.X, otherparity, arg.border);

 // upper leaf for the opposite parity
 {
 DECLARE_ARRAY(d,0);

 // load U(x)_(+mu)
 Link U1 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);

 // load U(x+mu)_(+nu)
 d[mu]++;
 Link U2 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
 d[mu]--;

 // load U(x+nu)_(+mu)
 d[nu]++;
 Link U3 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
 d[nu]--;

 // load U(x)_(+nu)
 Link U4 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);

 // load opposite parity Oprod
 d[nu]++;
 Link Oprod3 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
 d[nu]--;

 if (nu < mu) force -= U1*U2*conj(U3)*Oprod3*conj(U4);
 else force += U1*U2*conj(U3)*Oprod3*conj(U4);

 // load Oprod(x+mu)
 d[mu]++;
 Link Oprod4 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
 d[mu]--;

 if (nu < mu) force -= U1*Oprod4*U2*conj(U3)*conj(U4);
 else force += U1*Oprod4*U2*conj(U3)*conj(U4);
 }

 // Lower leaf
 // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
 {
 DECLARE_ARRAY(d,0);

 // load U(x-nu)(+nu)
 d[nu]--;
 Link U1 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
 d[nu]++;

 // load U(x-nu)(+mu)
 d[nu]--;
 Link U2 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
 d[nu]++;

 // load U(x+mu-nu)(nu)
 d[mu]++; d[nu]--;
 Link U3 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
 d[mu]--; d[nu]++;

 // load U(x)_(+mu)
 Link U4 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);

 // load Oprod(x+mu)
 d[mu]++;
 Link Oprod1 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
 d[mu]--;

 if (nu < mu) force += conj(U1)*U2*U3*Oprod1*conj(U4);
 else force -= conj(U1)*U2*U3*Oprod1*conj(U4);

 d[nu]--;
 Link Oprod2 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
 d[nu]++;

 if (nu < mu) force += conj(U1)*Oprod2*U2*U3*conj(U4);
 else force -= conj(U1)*Oprod2*U2*U3*conj(U4);
 }

 }

 }
337 
 // Kernel entry point.  Expected launch geometry: x dimension spans the
 // checkerboard volume (one thread per site), y in {0,1} selects which
 // parity's force is updated, z in [0,4) selects the direction mu.
 // When SHARED_ACCUMULATOR is defined the force accumulator lives in
 // dynamic shared memory (18 reals per thread; see DECLARE_LINK), so
 // the launch must supply the matching shared_bytes — provided here by
 // the tuned launch parameters in CloverDerivative::apply.
 template<typename real, typename Arg>
 __global__ void cloverDerivativeKernel(Arg arg)
 {
 int index = threadIdx.x + blockIdx.x*blockDim.x;
 if (index >= arg.volumeCB) return;

 // y index determines whether we're updating arg.parity or (1-arg.parity)
 int yIndex = threadIdx.y + blockIdx.y*blockDim.y;
 if (yIndex >= 2) return;

 // mu index is mapped from z thread index
 int mu = threadIdx.z + blockIdx.z*blockDim.z;
 if (mu >= 4) return;

 typedef complex<real> Complex;
 typedef Matrix<Complex,3> Link;

 // declares and zeroes the accumulator; NOTE the variable must be named
 // "force" for the subsequent axpy to match
 DECLARE_LINK(force);

#ifdef DYNAMIC_MU_NU
 // runtime loop over the three planes containing mu
 for (int nu=0; nu<4; nu++) {
 if (mu==nu) continue;
 computeForce<real,Arg,Link>(force, arg, index, yIndex, mu, nu);
 }
#else
 // fully-unrolled dispatch: one templated instantiation per (mu,nu) pair
 switch(mu) {
 case 0:
 computeForce<real,Arg,0,1,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,0,2,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,0,3,Link>(force, arg, index, yIndex);
 break;
 case 1:
 computeForce<real,Arg,1,0,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,1,3,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,1,2,Link>(force, arg, index, yIndex);
 break;
 case 2:
 computeForce<real,Arg,2,3,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,2,0,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,2,1,Link>(force, arg, index, yIndex);
 break;
 case 3:
 computeForce<real,Arg,3,2,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,3,1,Link>(force, arg, index, yIndex);
 computeForce<real,Arg,3,0,Link>(force, arg, index, yIndex);
 break;
 }
#endif

 // Write to array: read-modify-write F += coeff * force for the parity
 // selected by yIndex
 Link F;
 arg.force.load((real*)(F.data), index, mu, yIndex == 0 ? arg.parity : 1-arg.parity);
 axpy(arg.coeff, force, F);
 arg.force.save((real*)(F.data), index, mu, yIndex == 0 ? arg.parity : 1-arg.parity);

 return;
 } // cloverDerivativeKernel
395 
396 
 // Autotuned launcher for cloverDerivativeKernel.  TunableVectorY(2)
 // vectorizes over the two-valued y (parity-selector) dimension; the z
 // (mu) dimension is tuned manually in advanceBlockDim below.
 template<typename Float, typename Arg>
 class CloverDerivative : public TunableVectorY {

 private:
 Arg arg;
 const GaugeField &meta;

 // shared memory per thread: 18 reals for the force accumulator, plus
 // 8 bytes for the two DECLARE_ARRAY index slots when SHARED_ARRAY is on
#if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY)
 unsigned int sharedBytesPerThread() const { return 18*sizeof(Float) + 8; }
#elif defined(SHARED_ACCUMULATOR)
 unsigned int sharedBytesPerThread() const { return 18*sizeof(Float); }
#else
 unsigned int sharedBytesPerThread() const { return 0; }
#endif
 unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

 unsigned int minThreads() const { return arg.volumeCB; }
 bool tuneGridDim() const { return false; } // don't tune the grid dimension

 public:
 CloverDerivative(const Arg &arg, const GaugeField &meta) : TunableVectorY(2), arg(arg), meta(meta) {
 writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
 arg.volumeCB,sizeof(Float),arg.force.stride,
 arg.gauge.stride,arg.oprod.stride);
 }
 virtual ~CloverDerivative() {}

 void apply(const cudaStream_t &stream){
 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
 cloverDerivativeKernel<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
 } // apply

 // Advance the tuning candidate: after the base dimensions are exhausted,
 // step block.z through 1..4 (the mu dimension), resizing grid.z to cover
 // all four directions.
 bool advanceBlockDim(TuneParam &param) const {
 dim3 block = param.block;
 dim3 grid = param.grid;
 // NOTE(review): 'rtn' is used below but no declaration is visible in
 // this listing — a line (presumably
 // 'bool rtn = TunableVectorY::advanceBlockDim(param);') appears to
 // have been dropped by the extractor; confirm against the repository.
 param.block.z = block.z;
 param.grid.z = grid.z;

 if (!rtn) {
 if (param.block.z < 4) {
 param.block.z++;
 param.grid.z = (4 + param.block.z - 1) / param.block.z;
 rtn = true;
 } else {
 param.block.z = 1;
 param.grid.z = 4;
 rtn = false;
 }
 }
 return rtn;
 }

 void initTuneParam(TuneParam &param) const {
 // NOTE(review): the base-class call (presumably
 // 'TunableVectorY::initTuneParam(param);') appears to be missing from
 // this listing — confirm against the repository.
 param.block.y = 1;
 param.block.z = 1;
 param.grid.y = 2;
 param.grid.z = 4;
 }

 void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

 // The force field is updated so we must preserve its initial state
 void preTune() { arg.force.save(); }
 void postTune(){ arg.force.load(); }

 long long flops() const { return 16 * 198 * 3 * 4 * 2 * (long long)arg.volumeCB; }
 long long bytes() const { return ((8*arg.gauge.Bytes() + 4*arg.oprod.Bytes())*3 + 2*arg.force.Bytes()) * 4 * 2 * arg.volumeCB; }

 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
 };
469 
470 
471  template<typename Float>
472  void cloverDerivative(cudaGaugeField &force,
473  cudaGaugeField &gauge,
474  cudaGaugeField &oprod,
475  double coeff, int parity) {
476 
477  if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO)
478  errorQuda("Force field does not support reconstruction");
479 
480  if (force.Order() != oprod.Order())
481  errorQuda("Force and Oprod orders must match");
482 
483  if (force.Reconstruct() != QUDA_RECONSTRUCT_NO)
484  errorQuda("Force field does not support reconstruction");
485 
486  if (force.Order() == QUDA_FLOAT2_GAUGE_ORDER){
487  typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
488  typedef gauge::FloatNOrder<Float, 18, 2, 18> O;
489 
490  if (gauge.isNative()) {
491  if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
492  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
493  typedef CloverDerivArg<Float,F,G,O> Arg;
494  Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
495  CloverDerivative<Float, Arg> deriv(arg, gauge);
496  deriv.apply(0);
497 #if 0
498  } else if (gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
499  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
500  typedef CloverDerivArg<Float,F,G,O> Arg;
501  Arg arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(), coeff, parity);
502  CloverDerivative<Float, Arg> deriv(arg, gauge);
503  deriv.apply(0);
504 #endif
505  } else {
506  errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
507  }
508  } else {
509  errorQuda("Gauge order %d not supported", gauge.Order());
510  }
511  } else {
512  errorQuda("Force order %d not supported", force.Order());
513  } // force / oprod order
514 
516  }
517 #endif // GPU_CLOVER
518 
/**
   Compute the derivative of the clover term with respect to the gauge
   field and accumulate the resulting force.  Public entry point:
   validates geometries, maps the parity enum to the 0/1 device
   convention, and dispatches on precision.
   (The first line of this signature was dropped from the listing; it is
   reconstructed verbatim from the documented prototype.)
 */
void cloverDerivative(cudaGaugeField &force,
                      cudaGaugeField &gauge,
                      cudaGaugeField &oprod,
                      double coeff, QudaParity parity)
{
#ifdef GPU_CLOVER_DIRAC
  assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
  assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);

  // oprod must have the same extended dimensions as the gauge field
  for (int d=0; d<4; d++) {
    if (oprod.X()[d] != gauge.X()[d])
      errorQuda("Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.X()[d], oprod.X()[d]);
  }

  // map the QUDA parity enum onto the 0/1 convention used on the device
  int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;

  if (force.Precision() == QUDA_DOUBLE_PRECISION) {
    cloverDerivative<double>(force, gauge, oprod, coeff, device_parity);
#if 0
  } else if (force.Precision() == QUDA_SINGLE_PRECISION) {
    cloverDerivative<float>(force, gauge, oprod, coeff, device_parity);
#endif
  } else {
    errorQuda("Precision %d not supported", force.Precision());
  }

  return;
#else
  errorQuda("Clover has not been built");
#endif
}
550 
551 
552 } // namespace quda
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:131
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
dim3 dim3 blockDim
static __device__ __host__ void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[])
#define DECLARE_ARRAY(d, idx)
#define LINK
double mu
Definition: test_util.cpp:1643
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:382
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:357
int E[4]
Definition: test_util.cpp:36
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
enum QudaParity_s QudaParity
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define DECLARE_LINK(U)
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:96
__device__ void axpy(real a, const real *x, Link &y)
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
const int * X() const