QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
clover_deriv.cuh
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 #include <quda_matrix.h>
3 #include <index_helper.cuh>
4 
5 #if (CUDA_VERSION < 8000)
6 #define DYNAMIC_MU_NU
7 #endif
8 
9 // Use shared memory for the force accumulator matrix
10 #define SHARED_ACCUMULATOR
11 
12 #ifdef DYNAMIC_MU_NU
13 
14 // When using dynamic mu/nu indexing, to avoid local spills use shared
15 // memory for the per thread indexing arrays.
16 // FIXME for reasons I don't understand, the shared array breaks in multi-GPU mode
17 //#define SHARED_ARRAY
18 
19 #endif // DYNAMIC_MU_NU
20 
21 namespace quda
22 {
23 
24 #ifdef SHARED_ACCUMULATOR
25 
26 #define DECLARE_LINK(U) \
27  extern __shared__ int s[]; \
28  real *U = (real *)s; \
29  { \
30  const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; \
31  const int block = blockDim.x * blockDim.y * blockDim.z; \
32  for (int i = 0; i < 18; i++) force[i * block + tid] = 0.0; \
33  }
34 
35 #define LINK real *
36 
37  template <typename real, typename Link> __device__ inline void axpy(real a, const real *x, Link &y)
38  {
39  const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
40  const int block = blockDim.x * blockDim.y * blockDim.z;
41 #pragma unroll
42  for (int i = 0; i < 9; i++) {
43  y.data[i] += a * complex<real>(x[(2 * i + 0) * block + tid], x[(2 * i + 1) * block + tid]);
44  }
45  }
46 
47  template <typename real, typename Link> __device__ inline void operator+=(real *y, const Link &x)
48  {
49  const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
50  const int block = blockDim.x * blockDim.y * blockDim.z;
51 #pragma unroll
52  for (int i = 0; i < 9; i++) {
53  y[(2 * i + 0) * block + tid] += x.data[i].real();
54  y[(2 * i + 1) * block + tid] += x.data[i].imag();
55  }
56  }
57 
58  template <typename real, typename Link> __device__ inline void operator-=(real *y, const Link &x)
59  {
60  const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
61  const int block = blockDim.x * blockDim.y * blockDim.z;
62 #pragma unroll
63  for (int i = 0; i < 9; i++) {
64  y[(2 * i + 0) * block + tid] -= x.data[i].real();
65  y[(2 * i + 1) * block + tid] -= x.data[i].imag();
66  }
67  }
68 
69 #else
70 
71 #define DECLARE_LINK(U) Link U;
72 
73 #define LINK Link &
74 
75  template <typename real, typename Link> __device__ inline void axpy(real a, const Link &x, Link &y) { y += a * x; }
76 
77 #endif
78 
79 #if defined(SHARED_ARRAY) && defined(SHARED_ACCUMULATOR)
80 
81 #define DECLARE_ARRAY(d, idx) \
82  unsigned char *d; \
83  { \
84  extern __shared__ int s[]; \
85  int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; \
86  int block = blockDim.x * blockDim.y * blockDim.z; \
87  int offset = 18 * block * sizeof(real) / sizeof(int) + idx * block + tid; \
88  s[offset] = 0; \
89  d = (unsigned char *)&s[offset]; \
90  }
91 #elif defined(SHARED_ARRAY)
92 #error Cannot use SHARED_ARRAY with SHARED_ACCUMULATOR
93 #else
94 #define DECLARE_ARRAY(d, idx) int d[4] = {0, 0, 0, 0};
95 #endif
96 
97  template <class Float, typename Force, typename Gauge, typename Oprod> struct CloverDerivArg {
98  int X[4];
99  int E[4];
100  int border[4];
101  Float coeff;
102  int parity;
103  int volumeCB;
104 
105  Force force;
106  Gauge gauge;
107  Oprod oprod;
108 
109  CloverDerivArg(const Force &force, const Gauge &gauge, const Oprod &oprod, const int *X_, const int *E_,
110  double coeff, int parity) :
111  coeff(coeff),
112  parity(parity),
113  volumeCB(force.volumeCB),
114  force(force),
115  gauge(gauge),
116  oprod(oprod)
117  {
118  for (int dir = 0; dir < 4; ++dir) {
119  this->X[dir] = X_[dir];
120  this->E[dir] = E_[dir];
121  this->border[dir] = (E_[dir] - X_[dir]) / 2;
122  }
123  }
124  };
125 
126 #ifdef DYNAMIC_MU_NU
127  template <typename real, typename Arg, typename Link>
128  __device__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex, int mu, int nu)
129  {
130 #else
131  template <typename real, typename Arg, int mu, int nu, typename Link>
132  __device__ __forceinline__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex)
133  {
134 #endif
135 
136  int otherparity = (1 - arg.parity);
137 
138  const int tidx = mu > nu ? (mu - 1) * mu / 2 + nu : (nu - 1) * nu / 2 + mu;
139 
140  if (yIndex == 0) { // do "this" force
141 
142  DECLARE_ARRAY(x, 1);
143  getCoordsExtended(x, xIndex, arg.X, arg.parity, arg.border);
144 
145  // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
146  {
147  DECLARE_ARRAY(d, 0);
148 
149  // load U(x)_(+mu)
150  Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
151 
152  // load U(x+mu)_(+nu)
153  d[mu]++;
154  Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
155  d[mu]--;
156 
157  // load U(x+nu)_(+mu)
158  d[nu]++;
159  Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
160  d[nu]--;
161 
162  // load U(x)_(+nu)
163  Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
164 
165  // load Oprod
166  Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
167 
168  if (nu < mu)
169  force -= U1 * U2 * conj(U3) * conj(U4) * Oprod1;
170  else
171  force += U1 * U2 * conj(U3) * conj(U4) * Oprod1;
172 
173  d[mu]++;
174  d[nu]++;
175  Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
176  d[mu]--;
177  d[nu]--;
178 
179  if (nu < mu)
180  force -= U1 * U2 * Oprod2 * conj(U3) * conj(U4);
181  else
182  force += U1 * U2 * Oprod2 * conj(U3) * conj(U4);
183  }
184 
185  {
186  DECLARE_ARRAY(d, 0);
187 
188  // load U(x-nu)(+nu)
189  d[nu]--;
190  Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
191  d[nu]++;
192 
193  // load U(x-nu)(+mu)
194  d[nu]--;
195  Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
196  d[nu]++;
197 
198  // load U(x+mu-nu)(nu)
199  d[mu]++;
200  d[nu]--;
201  Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
202  d[mu]--;
203  d[nu]++;
204 
205  // load U(x)_(+mu)
206  Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
207 
208  d[mu]++;
209  d[nu]--;
210  Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
211  d[mu]--;
212  d[nu]++;
213 
214  if (nu < mu)
215  force += conj(U1) * U2 * Oprod1 * U3 * conj(U4);
216  else
217  force -= conj(U1) * U2 * Oprod1 * U3 * conj(U4);
218 
219  Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
220 
221  if (nu < mu)
222  force += Oprod4 * conj(U1) * U2 * U3 * conj(U4);
223  else
224  force -= Oprod4 * conj(U1) * U2 * U3 * conj(U4);
225  }
226 
227  } else { // else do other force
228 
229  DECLARE_ARRAY(y, 1);
230  getCoordsExtended(y, xIndex, arg.X, otherparity, arg.border);
231 
232  {
233  DECLARE_ARRAY(d, 0);
234 
235  // load U(x)_(+mu)
236  Link U1 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
237 
238  // load U(x+mu)_(+nu)
239  d[mu]++;
240  Link U2 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
241  d[mu]--;
242 
243  // load U(x+nu)_(+mu)
244  d[nu]++;
245  Link U3 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
246  d[nu]--;
247 
248  // load U(x)_(+nu)
249  Link U4 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
250 
251  // load opposite parity Oprod
252  d[nu]++;
253  Link Oprod3 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
254  d[nu]--;
255 
256  if (nu < mu)
257  force -= U1 * U2 * conj(U3) * Oprod3 * conj(U4);
258  else
259  force += U1 * U2 * conj(U3) * Oprod3 * conj(U4);
260 
261  // load Oprod(x+mu)
262  d[mu]++;
263  Link Oprod4 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
264  d[mu]--;
265 
266  if (nu < mu)
267  force -= U1 * Oprod4 * U2 * conj(U3) * conj(U4);
268  else
269  force += U1 * Oprod4 * U2 * conj(U3) * conj(U4);
270  }
271 
272  // Lower leaf
273  // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
274  {
275  DECLARE_ARRAY(d, 0);
276 
277  // load U(x-nu)(+nu)
278  d[nu]--;
279  Link U1 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
280  d[nu]++;
281 
282  // load U(x-nu)(+mu)
283  d[nu]--;
284  Link U2 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
285  d[nu]++;
286 
287  // load U(x+mu-nu)(nu)
288  d[mu]++;
289  d[nu]--;
290  Link U3 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
291  d[mu]--;
292  d[nu]++;
293 
294  // load U(x)_(+mu)
295  Link U4 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
296 
297  // load Oprod(x+mu)
298  d[mu]++;
299  Link Oprod1 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
300  d[mu]--;
301 
302  if (nu < mu)
303  force += conj(U1) * U2 * U3 * Oprod1 * conj(U4);
304  else
305  force -= conj(U1) * U2 * U3 * Oprod1 * conj(U4);
306 
307  d[nu]--;
308  Link Oprod2 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
309  d[nu]++;
310 
311  if (nu < mu)
312  force += conj(U1) * Oprod2 * U2 * U3 * conj(U4);
313  else
314  force -= conj(U1) * Oprod2 * U2 * U3 * conj(U4);
315  }
316  }
317 
318  } // namespace quda
319 
320  template <typename real, typename Arg> __global__ void cloverDerivativeKernel(Arg arg)
321  {
322  int index = threadIdx.x + blockIdx.x * blockDim.x;
323  if (index >= arg.volumeCB) return;
324 
325  // y index determines whether we're updating arg.parity or (1-arg.parity)
326  int yIndex = threadIdx.y + blockIdx.y * blockDim.y;
327  if (yIndex >= 2) return;
328 
329  // mu index is mapped from z thread index
330  int mu = threadIdx.z + blockIdx.z * blockDim.z;
331  if (mu >= 4) return;
332 
333  typedef complex<real> Complex;
334  typedef Matrix<Complex, 3> Link;
335 
336  DECLARE_LINK(force);
337 
338 #ifdef DYNAMIC_MU_NU
339  for (int nu = 0; nu < 4; nu++) {
340  if (mu == nu) continue;
341  computeForce<real, Arg, Link>(force, arg, index, yIndex, mu, nu);
342  }
343 #else
344  switch (mu) {
345  case 0:
346  computeForce<real, Arg, 0, 1, Link>(force, arg, index, yIndex);
347  computeForce<real, Arg, 0, 2, Link>(force, arg, index, yIndex);
348  computeForce<real, Arg, 0, 3, Link>(force, arg, index, yIndex);
349  break;
350  case 1:
351  computeForce<real, Arg, 1, 0, Link>(force, arg, index, yIndex);
352  computeForce<real, Arg, 1, 3, Link>(force, arg, index, yIndex);
353  computeForce<real, Arg, 1, 2, Link>(force, arg, index, yIndex);
354  break;
355  case 2:
356  computeForce<real, Arg, 2, 3, Link>(force, arg, index, yIndex);
357  computeForce<real, Arg, 2, 0, Link>(force, arg, index, yIndex);
358  computeForce<real, Arg, 2, 1, Link>(force, arg, index, yIndex);
359  break;
360  case 3:
361  computeForce<real, Arg, 3, 2, Link>(force, arg, index, yIndex);
362  computeForce<real, Arg, 3, 1, Link>(force, arg, index, yIndex);
363  computeForce<real, Arg, 3, 0, Link>(force, arg, index, yIndex);
364  break;
365  }
366 #endif
367 
368  // Write to array
369  Link F = arg.force(mu, index, yIndex == 0 ? arg.parity : 1 - arg.parity);
370  axpy(arg.coeff, force, F);
371  arg.force(mu, index, yIndex == 0 ? arg.parity : 1 - arg.parity) = F;
372 
373  return;
374  } // cloverDerivativeKernel
375 
376 } // namespace quda
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
Definition: float_vector.h:131
static __device__ __host__ void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[])
double mu
Definition: test_util.cpp:1648
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
#define DECLARE_ARRAY(d, idx)
#define LINK
Main header file for host and device accessors to GaugeFields.
std::complex< double > Complex
Definition: quda_internal.h:46
#define DECLARE_LINK(U)
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
const int volumeCB
Definition: spinor_noise.cu:26
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:96
__device__ void axpy(real a, const real *x, Link &y)
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
CloverDerivArg(const Force &force, const Gauge &gauge, const Oprod &oprod, const int *X_, const int *E_, double coeff, int parity)
__global__ void cloverDerivativeKernel(Arg arg)
__device__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex, int mu, int nu)