QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash_index.cuh
Go to the documentation of this file.
1 
/**
   @brief Compute the full-lattice checkerboard index from an index into
   a ghost face of an extended field (Wilson-type layout).
   @tparam dim      Dimension orthogonal to the face (0..3)
   @tparam nLayers  Number of ghost layers in the face
   @tparam face_num 0 = lower face, 1 = upper face
   @param face_idx  Checkerboard index within the face
   @param param     Kernel parameter struct providing dc.X (local lattice
                    dimensions), R (extended halo widths) and parity
   @return Checkerboard index into the full local volume
*/
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
{
  const auto *X = param.dc.X; // local lattice dimensions
  const auto *R = param.R;    // extended-region (halo) widths

  // face dimensions: the dimension orthogonal to the face collapses to
  // the number of ghost layers
  int face_X = X[0], face_Y = X[1], face_Z = X[2]; // face_T = X[3]
  switch (dim) {
  case 0: face_X = nLayers; break;
  case 1: face_Y = nLayers; break;
  case 2: face_Z = nLayers; break;
  case 3: /* face_T = nLayers */ break;
  }
  const int face_XY = face_X * face_Y;
  const int face_XYZ = face_XY * face_Z;

  // intrinsic parity of the face depends on the offset of its first element
  const int face_parity = (param.parity + face_num * (X[dim] - nLayers)) & 1;

  // reconstruct the full (non-checkerboard) face index: double it, then
  // add the parity offset determined by the first even face dimension
  face_idx *= 2;

  if (!(face_X & 1)) { // face_X even
    // equivalent to
    //   t = face_idx / face_XYZ; z = (face_idx / face_XY) % face_Z;
    //   y = (face_idx / face_X) % face_Y;
    // but with fewer divisions/mods:
    const int q1 = face_idx / face_X;
    const int q2 = q1 / face_Y;
    const int y = q1 - q2 * face_Y;
    const int t = q2 / face_Z;
    const int z = q2 - t * face_Z;
    face_idx += (face_parity + t + z + y) & 1;
  } else if (!(face_Y & 1)) { // face_Y even
    const int t = face_idx / face_XYZ;
    const int z = (face_idx / face_XY) % face_Z;
    face_idx += (face_parity + t + z) & 1;
  } else if (!(face_Z & 1)) { // face_Z even
    const int t = face_idx / face_XYZ;
    face_idx += (face_parity + t) & 1;
  } else { // all face dimensions odd
    face_idx += face_parity;
  }

  // lift the face index into the full local volume: insert the gap
  // between the two faces and shift by the halo width R
  int idx = face_idx;
  const int gap = X[dim] - nLayers;
  switch (dim) {
  case 0:
    idx += (face_idx / face_X + face_num) * gap + (1 - 2 * face_num) * R[0];
    break;
  case 1:
    idx += ((face_idx / face_XY + face_num) * gap + (1 - 2 * face_num) * R[1]) * face_X;
    break;
  case 2:
    idx += ((face_idx / face_XYZ + face_num) * gap + (1 - 2 * face_num) * R[2]) * face_XY;
    break;
  case 3:
    idx += (face_num * gap + (1 - 2 * face_num) * R[3]) * face_XYZ;
    break;
  }

  // return index into the checkerboard
  return idx >> 1;
}
94 
/**
   @brief Compute the full-lattice checkerboard index from an index into
   a ghost face (staggered layout, with a possible fifth dimension).
   @tparam dim      Dimension orthogonal to the face (0..3)
   @tparam nLayers  Number of ghost layers in the face
   @tparam face_num 0 = lower face, 1 = upper face
   @param face_idx_in Checkerboard index within the face
   @param param       Kernel parameter struct providing dc.X, dc.dims,
                      dc.volume_4d, dc.face_* products and parity
   @return Checkerboard index into the full local volume
*/
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
{
  const auto *X = param.dc.X;          // grid dimension
  const auto *dims = param.dc.dims[dim]; // dimensions of the face
  const auto &V4 = param.dc.volume_4d;   // 4-d volume

  // intrinsic parity of the face depends on offset of first element
  const int face_parity = (param.parity + face_num * (X[dim] - nLayers)) & 1;

  // reconstruct full face index from index into the checkerboard
  face_idx_in *= 2;

  // first strip off the fifth-dimension index, then find the 4-d index
  // from the remainder
  const int s = face_idx_in / param.dc.face_XYZT[dim];
  int face_idx = face_idx_in - s * param.dc.face_XYZT[dim];

  // y, z, t here are face coordinates in the face-local ordering
  const int r1 = face_idx / dims[0];
  const int r2 = r1 / dims[1];
  const int y = r1 - r2 * dims[1];
  const int t = r2 / dims[2];
  const int z = r2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  // lift into the full local volume; the idx/V4 corrections implement
  // wrap-around of the strided index within the 4-d volume
  const int gap = X[dim] - nLayers;
  int idx = face_idx;
  switch (dim) {
  case 0: {
    idx += face_num * gap + face_idx * (X[0] - 1);
    idx += (idx / V4) * (1 - V4);
    break;
  }
  case 1: {
    const int slice = face_idx / param.dc.face_X[dim];
    idx += face_num * gap * param.dc.face_X[dim] + slice * (X[1] - 1) * param.dc.face_X[dim];
    idx += (idx / V4) * (X[0] - V4);
    break;
  }
  case 2: {
    const int slice = face_idx / param.dc.face_XY[dim];
    idx += face_num * gap * param.dc.face_XY[dim] + slice * (X[2] - 1) * param.dc.face_XY[dim];
    idx += (idx / V4) * ((X[1] * X[0]) - V4);
    break;
  }
  case 3:
    idx += face_num * gap * param.dc.face_XYZ[dim];
    break;
  }

  // return index into the checkerboard (restoring the 5th dimension)
  return (idx + s * V4) >> 1;
}
162 
163 
/**
   @brief Compute the extended-lattice checkerboard index from an index
   into a ghost face (staggered layout on an extended field).
   @tparam dim      Dimension orthogonal to the face (0..3)
   @tparam nLayers  Number of ghost layers in the face
   @tparam face_num 0 = lower face, 1 = upper face
   @param face_idx  Checkerboard index within the face
   @param param     Kernel parameter struct providing dc.X, R and parity
   @return Checkerboard index into the full local (extended) volume
*/
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
{
  const auto *X = param.dc.X; // local (extended) lattice dimensions
  const auto *R = param.R;    // extended-region (halo) widths

  // dimensions of the face: dims[] holds the three extents spanned by the
  // face (the dimensions other than `dim`), in increasing order
  int dims[3];
  int V = X[0]*X[1]*X[2]*X[3];
  int face_X = X[0], face_Y = X[1], face_Z = X[2]; // face_T = X[3];
  switch (dim) {
  case 0:
    face_X = nLayers;
    dims[0]=X[1]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 1:
    face_Y = nLayers;
    dims[0]=X[0]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 2:
    face_Z = nLayers;
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[3];
    break;
  case 3:
    // face_T = nLayers;
    // BUG FIX: the T face spans (x,y,z), so the third face extent is
    // X[2], not X[3] (the original had dims[2]=X[3], which corrupts the
    // t/z extraction below and hence the parity offset whenever t > 0)
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[2];
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on offset of first element
  int face_parity = (param.parity + face_num *(X[dim] - nLayers)) & 1;

  // reconstruct full face index from index into the checkerboard
  face_idx *= 2;
  // y, z, t here are face coordinates in the face-local ordering
  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  int idx = face_idx;
  int aux;

  // gap between the two interior faces (the halo R is excluded here and
  // added back explicitly per dimension below)
  int gap = X[dim] - nLayers - 2*R[dim];
  switch (dim) {
  case 0:
    aux = face_idx;
    idx += face_num*gap + aux*(X[0]-1);
    idx += (idx/V)*(1-V); // wrap strided index within the volume
    idx += R[0];
    break;
  case 1:
    aux = face_idx / face_X;
    idx += face_num * gap * face_X + aux*(X[1]-1)*face_X;
    idx += idx/V*(X[0]-V);
    idx += R[1]*X[0];
    break;
  case 2:
    aux = face_idx / face_XY;
    idx += face_num * gap * face_XY + aux*(X[2]-1)*face_XY;
    idx += idx/V*(face_XY-V);
    idx += R[2]*face_XY;
    break;
  case 3:
    idx += ((face_num*gap) + R[3])*face_XYZ;
    break;
  }

  // return index into the checkerboard
  return idx >> 1;
}
254 
255 
/**
   @brief Compute the full-lattice coordinates x[] from a checkerboard
   index into an exterior (halo) region, for the staggered halo-update
   kernels.
   @tparam dim     Which exterior kernel region (EXTERIOR_KERNEL_{X,Y,Z,T})
   @tparam nLayers Number of ghost layers
   @tparam Dir     Direction selector: 1 shifts coordinates to the upper
                   face; 2 shifts only coordinates beyond nLayers
                   (presumably "both faces" — TODO confirm against callers)
   @param x     Output coordinate array
   @param idx   Checkerboard index into the halo region
   @param param Kernel parameter struct providing dc.X, dc.Xh and parity
*/
template<KernelType dim, int nLayers, int Dir, typename Param>
static inline __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
{
  const auto *X = param.dc.X;   // full dimensions
  const auto *Xh = param.dc.Xh; // half (checkerboard) dimensions

  // q, r are intermediate quotients; xh is the checkerboard coordinate of
  // the fastest-running (halved) dimension, doubled and parity-corrected
  // at the end of each case
  int q, r, xh;
  switch (dim) {
  case EXTERIOR_KERNEL_X:
    q = idx / Xh[1];
    xh = idx - q * Xh[1];
    r = q / X[2];
    x[2] = q - r * X[2];
    x[0] = r / X[3];
    x[3] = r - x[0] * X[3];
    if (Dir == 2) {
      x[0] += (x[0] >= nLayers) ? (X[0] - 2 * nLayers) : 0;
    } else if (Dir == 1) {
      x[0] += X[0] - nLayers;
    }
    x[1] = 2 * xh + ((x[0] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Y:
    q = idx / Xh[0];
    xh = idx - q * Xh[0];
    r = q / X[2];
    x[2] = q - r * X[2];
    x[1] = r / X[3];
    x[3] = r - x[1] * X[3];
    if (Dir == 2) {
      x[1] += (x[1] >= nLayers) ? (X[1] - 2 * nLayers) : 0;
    } else if (Dir == 1) {
      x[1] += X[1] - nLayers;
    }
    x[0] = 2 * xh + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Z:
    q = idx / Xh[0];
    xh = idx - q * Xh[0];
    r = q / X[1];
    x[1] = q - r * X[1];
    x[2] = r / X[3];
    x[3] = r - x[2] * X[3];
    if (Dir == 2) {
      x[2] += (x[2] >= nLayers) ? (X[2] - 2 * nLayers) : 0;
    } else if (Dir == 1) {
      x[2] += X[2] - nLayers;
    }
    x[0] = 2 * xh + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_T:
    q = idx / Xh[0];
    xh = idx - q * Xh[0];
    r = q / X[1];
    x[1] = q - r * X[1];
    x[3] = r / X[2];
    x[2] = r - x[3] * X[2];
    if (Dir == 2) {
      x[3] += (x[3] >= nLayers) ? (X[3] - 2 * nLayers) : 0;
    } else if (Dir == 1) {
      x[3] += X[3] - nLayers;
    }
    x[0] = 2 * xh + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  }
  return;
}
331 
// Selects which local lattice dimension is assumed even, which determines
// how coordsFromIndex() unpacks a checkerboard index into coordinates
// (the fewer leading odd dimensions, the fewer divisions are needed).
enum IndexType {
  EVEN_X = 0, // X[0] even
  EVEN_Y = 1, // X[1] even
  EVEN_Z = 2, // X[2] even
  EVEN_T = 3  // X[3] even
};
338 
/**
   @brief Compute the lattice coordinates x[] and the full-volume index
   idx from a checkerboard index cb_idx (used by the interior Dslash
   kernels).
   @tparam nDim    4 or 5 (fifth dimension present)
   @tparam pc_type Preconditioning type (QUDA_4D_PC excludes x[4] from
                   the parity computation)
   @tparam idxType Which dimension is even (see IndexType)
   @param[out] idx    Full-volume lattice index
   @param[out] x      Coordinate array
   @param[in,out] cb_idx Checkerboard index (updated only in the disabled
                   tiled-indexing path)
   @param param    Kernel parameter struct providing dc.X, dc.Vh,
                   dc.X3X2X1, dc.X2X1 and parity
*/
template <int nDim, QudaPCType pc_type, IndexType idxType, typename T, typename Param>
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
{
  // The full-volume index is idx = x + y*X + z*X*Y + t*X*Y*Z and the site
  // parity is (x+y+z+t) & 1 (0 = even, 1 = odd); cb_idx = idx/2 runs over
  // the half volume.  To recover idx from (cb_idx, parity) we double
  // cb_idx and add a 0/1 offset that depends on the requested parity and
  // on the coordinates above the first even dimension:
  //
  //  1) X even: y*X + z*X*Y + t*X*Y*Z is even, so
  //     2*cb_idx = 2*(x/2) + y*X + z*X*Y + t*X*Y*Z and the offset is
  //     (parity + y + z + t) & 1.
  //  2) X odd, Y even: compute t = 2*cb_idx/XYZ and z = (2*cb_idx/XY) % Z
  //     first; the offset is (parity + z + t) & 1 (when z+t matches the
  //     requested parity, x+y is even and 2*cb_idx = idx; otherwise
  //     x + y*X is odd and idx = 2*cb_idx + 1).  Only after adding the
  //     offset can y and x be extracted.
  //  3) Analogously for X,Y odd with Z even, and for X,Y,Z all odd
  //     (where only the parity itself is added).

  const auto *X = param.dc.X;

  const int XYZT = param.dc.Vh << 1;  // X[3]*X[2]*X[1]*X[0]
  const int XYZ = param.dc.X3X2X1;    // X[2]*X[1]*X[0]
  const int XY = param.dc.X2X1;       // X[1]*X[0]

  idx = 2 * cb_idx;
  if (idxType == EVEN_X) { // X even
    // equivalent to
    //   t = idx / XYZ; z = (idx / XY) % Z; y = (idx / X) % Y;
    //   idx += (parity + t + z + y) & 1; x = idx % X;
    // but with fewer divisions/mods:
#if DSLASH_TUNE_TILE // tiled indexing - experimental and disabled for now
    // NOTE(review): this disabled path references t, z, y, s which are
    // not in scope here, and `x += oddbit` offsets the pointer rather
    // than x[0] — it does not compile as-is; verify before enabling.
    const auto *block = param.block;
    const auto *grid = param.grid;

    int aux[9];
    aux[0] = idx;
    for (int i=0; i<4; i++) aux[i+1] = aux[i] / block[i];
    for (int i=4; i<8; i++) aux[i+1] = aux[i] / grid[i];

    for (int i=0; i<4; i++) x[i] = aux[i] - aux[i+1] * block[i];
    for (int i=0; i<4; i++) x[i] += block[i]*(aux[i+4] - aux[i+5] * grid[i]);
    x[4] = (nDim == 5) ? aux[8] : 0;

    int oddbit = (pc_type == QUDA_4D_PC ? (param.parity + t + z + y) : (param.parity + s + t + z + y)) & 1;
    x += oddbit;

    // update cb_idx for the swizzled coordinate
    cb_idx = (nDim == 5 ? (((x[4]*X[3]+x[3])*X[2]+x[2])*X[1]+x[1])*X[0]+x[0] : ((x[3]*X[2]+x[2])*X[1]+x[1])*X[0]+x[0]) >> 1;
    idx = 2*cb_idx + oddbit;
#else
    // successive quotients by X[0..3]; remainders give the coordinates
    int q[5];
    q[0] = idx;
    for (int i = 0; i < 4; i++) q[i+1] = q[i] / X[i];

    x[0] = q[0] - q[1] * X[0];
    x[1] = q[1] - q[2] * X[1];
    x[2] = q[2] - q[3] * X[2];
    x[3] = q[3] - (nDim == 5 ? q[4] * X[3] : 0);
    x[4] = (nDim == 5) ? q[4] : 0;

    const int oddbit
      = (pc_type == QUDA_4D_PC ? (param.parity + x[3] + x[2] + x[1]) : (param.parity + x[4] + x[3] + x[2] + x[1])) & 1;

    x[0] += oddbit;
    idx += oddbit;
#endif
  } else if (idxType == EVEN_Y) { // Y even
    x[4] = idx / XYZT;
    x[3] = (idx / XYZ) % X[3];
    x[2] = (idx / XY) % X[2];
    idx += (param.parity + x[3] + x[2]) & 1; // parity fix before extracting y, x
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else if (idxType == EVEN_Z) { // Z even
    x[4] = idx / XYZT;
    x[3] = (idx / XYZ) % X[3];
    idx += (param.parity + x[3]) & 1;
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else { // T even
    x[4] = idx / XYZT;
    idx += (param.parity + x[4]) & 1;
    x[3] = idx / XYZ;
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } // else we do not support all odd local dimensions except fifth dimension
}
486 
487 
/**
   @brief Compute coordinates from a 3-d thread decomposition: x/t are
   folded into the thread x index, with y and z mapped to the thread
   y and z indices (used by the interior Dslash kernels).
   @tparam idxType Only EVEN_X (even X[0]) is supported
   @param[out] idx    Full-volume lattice index
   @param[out] x      Coordinate array
   @param[out] cb_idx Checkerboard index
   @param param       Kernel parameter struct providing x (dimensions)
                      and parity
*/
template <IndexType idxType, typename Int, typename Param>
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int * const x, int &cb_idx, const Param &param) {
  const auto *X = param.x;

  if (idxType == EVEN_X) { // X even
    // thread x runs over the checkerboarded x-t plane; double it, then
    // split into t and (pre-parity) x
    const int xt = blockIdx.x * blockDim.x + threadIdx.x;
    const int xt2 = xt + xt;
    x[3] = xt2 / X[0];
    x[0] = xt2 - x[3] * X[0];
    x[1] = blockIdx.y * blockDim.y + threadIdx.y;
    x[2] = blockIdx.z * blockDim.z + threadIdx.z;
    x[0] += (param.parity + x[3] + x[2] + x[1]) & 1; // restore parity
    idx = ((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0];
    cb_idx = idx >> 1;
  } else {
    // Non-even X is not (yet) supported.
    return;
  }
}
517 
518 
/**
   @brief Determine whether a site lies within `depth` layers of either
   boundary of dimension `dim`.
   @tparam dim  Dimension to test
   @param depth Halo depth
   @param coord Site coordinates
   @param X     Lattice dimensions
   @return true if coord[dim] is within the halo region of dim
*/
template <int dim, typename T>
static inline __device__ bool inBoundary(const int depth, const int coord[], const T X[]){
  return (coord[dim] < depth) || (coord[dim] >= X[dim] - depth);
}
532 
533 
/**
   @brief Compute whether this thread should be active for updating the
   given offsetDim halo.  Used by the fused halo-region update kernels:
   every thread has a prescribed dimension (threadDim), but for edges and
   vertices the thread of the highest partitioned dimension is responsible
   for the entire update, so lower-dimension threads cede when the site
   lies inside a higher partitioned dimension's halo.
   @param threadDim   Dimension this thread is prescribed to update
   @param offsetDim   Dimension of the halo being considered
   @param offset      Signed halo offset (its magnitude is the halo width)
   @param y           Site coordinates
   @param partitioned Per-dimension partitioning flags
   @param X           Lattice dimensions
   @return true if this thread should perform the offsetDim update
*/
template <typename T>
static inline __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
{
  // Threads with threadDim = t can handle t,z,y,x offsets
  // Threads with threadDim = z can handle z,y,x offsets
  // Threads with threadDim = y can handle y,x offsets
  // Threads with threadDim = x can handle x offsets
  if (!partitioned[offsetDim]) return false;
  if (threadDim < offsetDim) return false;

  const int width = (offset > 0) ? offset : -offset;

  // Cede to the thread of any higher partitioned dimension whose halo
  // contains this site.  (The original additionally early-exited when no
  // higher dimension was partitioned; the short-circuit below makes those
  // guards redundant, with identical behavior.)
  switch (threadDim) {
  case 3: // threadDim = T: highest dimension, never cedes
    break;

  case 2: // threadDim = Z
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    break;

  case 1: // threadDim = Y
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    break;

  case 0: // threadDim = X
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    if (partitioned[1] && inBoundary<1>(width, y, X)) return false;
    break;

  default:
    break;
  }
  return true;
}
594 
595 
/**
   @brief Compute the checkerboard face index from full lattice
   coordinates.
   @tparam nDim     4 or 5 (fifth dimension present)
   @tparam nLayers  Number of ghost layers in the face
   @param[out] face_idx Checkerboard index into the face
   @param x        Site coordinates
   @param face_dim Dimension orthogonal to the face
   @param param    Kernel parameter struct providing dc.X
*/
template<int nDim, int nLayers, typename I, typename Param>
static inline __device__ void faceIndexFromCoords(int &face_idx, I * const x, int face_dim, const Param &param)
{
  int D[4] = {param.dc.X[0], param.dc.X[1], param.dc.X[2], param.dc.X[3]};
  int c[5] = {x[0], x[1], x[2], x[3], (nDim == 5 ? x[4] : 0)};

  // fold the face-dimension coordinate onto [0, nLayers): sites in the
  // upper face are shifted down by the gap between the two faces
  if (c[face_dim] >= nLayers) c[face_dim] -= D[face_dim] - nLayers;
  D[face_dim] = nLayers;

  // linearize within the face, then halve for the checkerboard index
  if (nDim == 5)
    face_idx = (((((D[3] * c[4] + c[3]) * D[2] + c[2]) * D[1] + c[1]) * D[0] + c[0]) >> 1);
  else if (nDim == 4)
    face_idx = ((((D[2] * c[3] + c[2]) * D[1] + c[1]) * D[0] + c[0]) >> 1);

  return;
}
619 
620 /*
621  @brief Fast power function that works for negative "a" argument
622  @param a argument we want to raise to some power
623  @param b power that we want to raise a to
624  @return pow(a,b)
625 */
626 __device__ inline float __fast_pow(float a, int b) {
627  float sign = signbit(a) ? -1.0f : 1.0f;
628  float power = __powf(fabsf(a), b);
629  return b&1 ? sign * power : power;
630 }
631 
632 
633 
static __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
Compute the full-lattice coordinates from the input face index. This is used by the staggered halo up...
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labelled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
static int R[4]
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int *const x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
QudaGaugeParam param
Definition: pack_test.cpp:17
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
IndexType
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ void faceIndexFromCoords(int &face_idx, I *const x, int face_dim, const Param &param)
Compute the face index from the lattice coordinates.
int X[4]
Definition: covdev_test.cpp:70
static __device__ bool inBoundary(const int depth, const int coord[], const T X[])
Compute whether the provided coordinate is within the halo region boundary of a given dimension...
int V
Definition: test_util.cpp:27
static int dims[4]
Definition: face_gauge.cpp:41
__device__ float __fast_pow(float a, int b)
__shared__ float s[]
static __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...