QUDA  0.9.0
dslash_index.cuh
Go to the documentation of this file.
1 
/**
   @brief Compute the full-lattice checkerboard index from an index
   into a ghost face.  Works for arbitrary lattice dimensions provided
   at least one face dimension is even (required for the parity
   reconstruction performed below).

   @tparam nDim number of space-time dimensions (4, or 5 for domain wall)
   @tparam type domain-wall preconditioning type (QUDA_4D_PC or QUDA_5D_PC)
   @tparam dim dimension of the face
   @tparam nLayers depth of the face
   @tparam face_num which of the two faces (0 = low side, 1 = high side)
   @param[in] face_idx checkerboard index into the face
   @param[in] param kernel parameter struct
   @return checkerboard index into the full local volume
 */
template <int nDim, QudaDWFPCType type, int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndex(int face_idx, const Param &param)
{
  // intrinsic parity of the face depends on offset of first element
  int face_parity = (param.parity + face_num * (param.dc.X[dim] - nLayers)) & 1;

  // reconstruct full face index from index into the checkerboard
  face_idx *= 2;

  if (!(param.dc.face_X[dim] & 1)) { // face_X even

    // int s = face_idx / face_XYZT;
    // int t = (face_idx / face_XYZ) % face_T;
    // int z = (face_idx / face_XY) % face_Z;
    // int y = (face_idx / face_X) % face_Y;
    // face_idx += (face_parity + s + t + z + y) & 1;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = face_idx / param.dc.face_X[dim];
    int aux2 = aux1 / param.dc.face_Y[dim];
    int aux3 = aux2 / param.dc.face_Z[dim];
    int aux4 = (nDim == 5 ? aux3 / param.dc.face_T[dim] : 0);

    int y = aux1 - aux2 * param.dc.face_Y[dim];
    int z = aux2 - aux3 * param.dc.face_Z[dim];
    int t = aux3 - aux4 * param.dc.face_T[dim];
    int s = aux4;

    if (type == QUDA_5D_PC) face_idx += (face_parity + s + t + z + y) & 1;
    else if (type == QUDA_4D_PC) face_idx += (face_parity + t + z + y) & 1;

  } else if (!(param.dc.face_Y[dim] & 1)) { // face_Y even

    int s = face_idx / param.dc.face_XYZT[dim];
    int t = (nDim == 5 ? (face_idx / param.dc.face_XYZ[dim]) % param.dc.face_T[dim] : (face_idx / param.dc.face_XYZ[dim]) );
    int z = (face_idx / param.dc.face_XY[dim]) % param.dc.face_Z[dim];
    if (type == QUDA_5D_PC) face_idx += (face_parity + s + t + z) & 1;
    else if (type == QUDA_4D_PC) face_idx += (face_parity + t + z) & 1;

  } else if (!(param.dc.face_Z[dim] & 1)) { // face_Z even

    int s = face_idx / param.dc.face_XYZT[dim];
    int t = (nDim == 5 ? (face_idx / param.dc.face_XYZ[dim]) % param.dc.face_T[dim] : (face_idx / param.dc.face_XYZ[dim]) );
    if (type == QUDA_5D_PC) face_idx += (face_parity + s + t) & 1;
    else if (type == QUDA_4D_PC) face_idx += (face_parity + t) & 1;

  } else if (!(param.dc.face_T[dim] & 1) && nDim == 5) { // face_T even
    // BUG FIX: this previously tested "!(param.dc.face_T[dim])", i.e.
    // face_T == 0, which can never hold for a valid face, so the all-odd
    // fall-through below was taken and the fifth-dimension coordinate s was
    // dropped from the parity.  The intent — matching the branches above and
    // the EVEN_T case of coordsFromIndex — is an evenness (parity) test.

    int s = face_idx / param.dc.face_XYZT[dim];
    if (type == QUDA_5D_PC) face_idx += (face_parity + s) & 1;
    else if (type == QUDA_4D_PC) face_idx += face_parity;

  } else { // all face dimensions odd: offset parity alternates site by site

    face_idx += face_parity;

  }

  // compute index into the full local volume by inserting the "gap" of
  // interior sites between the two faces of dimension dim
  int gap = param.dc.X[dim] - nLayers;
  int idx = face_idx;
  int aux;
  switch (dim) {
  case 0:
    aux = face_idx / param.dc.face_X[dim];
    idx += (aux + face_num) * gap;
    break;
  case 1:
    aux = face_idx / param.dc.face_XY[dim];
    idx += (aux + face_num) * gap * param.dc.face_X[dim];
    break;
  case 2:
    aux = face_idx / param.dc.face_XYZ[dim];
    idx += (aux + face_num) * gap * param.dc.face_XY[dim];
    break;
  case 3:
    aux = (nDim == 5 ? face_idx / param.dc.face_XYZT[dim] : 0);
    idx += (aux + face_num) * gap * param.dc.face_XYZ[dim];
    break;
  }

  // return index into the checkerboard
  return idx >> 1;
}
96 
97 
/**
   @brief Compute the global extended-lattice checkerboard index from
   an index into a ghost face of an extended (border-padded) field.
   Works for arbitrary lattice dimensions provided at least one face
   dimension is even.

   @tparam dim dimension of the face
   @tparam nLayers depth of the face
   @tparam face_num which of the two faces (0 = low side, 1 = high side)
   @param[in] face_idx checkerboard index into the face
   @param[in] param kernel parameter struct (supplies X[] and the border R[])
   @return checkerboard index into the full extended local volume
 */
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
{
  const auto *X = param.dc.X; // local lattice dimensions
  const auto *R = param.R;    // extended-border depth in each dimension

  // face dimensions equal the lattice dimensions, except in dim where
  // the extent is the face depth nLayers
  int face_X = X[0], face_Y = X[1], face_Z = X[2]; // face_T = X[3]
  switch (dim) {
  case 0:
    face_X = nLayers;
    break;
  case 1:
    face_Y = nLayers;
    break;
  case 2:
    face_Z = nLayers;
    break;
  case 3:
    // face_T = nLayers;
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on offset of first element

  int face_parity = (param.parity + face_num *(X[dim] - nLayers)) & 1;
  // reconstruct full face index from index into the checkerboard

  face_idx *= 2;

  if (!(face_X & 1)) { // face_X even
    // int t = face_idx / face_XYZ;
    // int z = (face_idx / face_XY) % face_Z;
    // int y = (face_idx / face_X) % face_Y;
    // face_idx += (face_parity + t + z + y) & 1;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = face_idx / face_X;
    int aux2 = aux1 / face_Y;
    int y = aux1 - aux2 * face_Y;
    int t = aux2 / face_Z;
    int z = aux2 - t * face_Z;
    face_idx += (face_parity + t + z + y) & 1;
  } else if (!(face_Y & 1)) { // face_Y even
    int t = face_idx / face_XYZ;
    int z = (face_idx / face_XY) % face_Z;
    face_idx += (face_parity + t + z) & 1;
  } else if (!(face_Z & 1)) { // face_Z even
    int t = face_idx / face_XYZ;
    face_idx += (face_parity + t) & 1;
  } else { // all face dimensions odd
    face_idx += face_parity;
  }

  // compute index into the full local volume: insert the interior "gap"
  // between the two faces, shifted by the border depth R[dim] (the sign
  // of the R shift depends on which face we are on)

  int idx = face_idx;
  int aux;

  int gap = X[dim] - nLayers;
  switch (dim) {
  case 0:
    aux = face_idx / face_X;
    idx += (aux + face_num)*gap + (1 - 2*face_num)*R[0];
    break;
  case 1:
    aux = face_idx / face_XY;
    idx += ((aux + face_num)*gap + (1 - 2*face_num)*R[1])*face_X;
    break;
  case 2:
    aux = face_idx / face_XYZ;
    idx += ((aux + face_num)*gap + (1 - 2*face_num)*R[2])* face_XY;
    break;
  case 3:
    idx += (face_num*gap + (1 - 2*face_num)*R[3])*face_XYZ;
    break;
  }

  // return index into the checkerboard

  return idx >> 1;
}
191 
/**
   @brief Compute the global checkerboard index from a face index for
   the staggered halo update kernels.  Works for arbitrary lattice
   dimensions; assumes the fastest-varying face dimension dims[0] is
   even (only that parity-reconstruction branch is implemented here).

   @tparam dim dimension of the face
   @tparam nLayers depth of the face
   @tparam face_num which of the two faces (0 = low side, 1 = high side)
   @param[in] face_idx_in checkerboard index into the face
   @param[in] param kernel parameter struct
   @return checkerboard index into the full local volume
 */
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
{
  const auto *X = param.dc.X; // grid dimension
  const auto *dims = param.dc.dims[dim]; // dimensions of the face
  const auto &V4 = param.dc.volume_4d; // 4-d volume

  // intrinsic parity of the face depends on offset of first element
  int face_parity = (param.parity + face_num *(X[dim] - nLayers)) & 1;

  // reconstruct full face index from index into the checkerboard
  face_idx_in *= 2;

  // first compute src index, then find 4-d index from remainder
  // (s is the fifth-dimension coordinate, slowest varying)
  int s = face_idx_in / param.dc.face_XYZT[dim];
  int face_idx = face_idx_in - s*param.dc.face_XYZT[dim];

  /*y,z,t here are face indexes in new order*/
  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  // compute index into the full local volume; the (idx/V4)*(...-V4)
  // terms fold indices that stepped past the 4-d volume back into it
  // (NOTE(review): inferred from the arithmetic — idx/V4 is 0 or 1 here)
  int gap = X[dim] - nLayers;
  int idx = face_idx;
  int aux;
  switch (dim) {
  case 0:
    aux = face_idx;
    idx += face_num*gap + aux*(X[0]-1);
    idx += (idx/V4)*(1-V4);
    break;
  case 1:
    aux = face_idx / param.dc.face_X[dim];
    idx += face_num * gap * param.dc.face_X[dim] + aux*(X[1]-1)*param.dc.face_X[dim];
    idx += (idx/V4)*(X[0]-V4);
    break;
  case 2:
    aux = face_idx / param.dc.face_XY[dim];
    idx += face_num * gap * param.dc.face_XY[dim] +aux*(X[2]-1)*param.dc.face_XY[dim];
    idx += (idx/V4)*((X[1]*X[0])-V4);
    break;
  case 3:
    idx += face_num * gap * param.dc.face_XYZ[dim];
    break;
  }

  // return index into the checkerboard, restoring the s coordinate
  return (idx + s*V4) >> 1;
}
259 
260 
/**
   @brief Compute the global extended-lattice checkerboard index from
   a face index for the staggered halo update kernels on an extended
   (border-padded) field.  Assumes the fastest-varying face dimension
   dims[0] is even.

   @tparam dim dimension of the face
   @tparam nLayers depth of the face
   @tparam face_num which of the two faces (0 = low side, 1 = high side)
   @param[in] face_idx checkerboard index into the face
   @param[in] param kernel parameter struct (supplies X[] and the border R[])
   @return checkerboard index into the full extended local volume
 */
template <int dim, int nLayers, int face_num, typename Param>
static inline __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
{
  const auto *X = param.dc.X;
  const auto *R = param.R;

  // dimensions of the face: the three lattice extents orthogonal to dim,
  // ordered fastest to slowest
  int dims[3];
  int V = X[0]*X[1]*X[2]*X[3];
  int face_X = X[0], face_Y = X[1], face_Z = X[2]; // face_T = X[3];
  switch (dim) {
  case 0:
    face_X = nLayers;
    dims[0]=X[1]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 1:
    face_Y = nLayers;
    dims[0]=X[0]; dims[1]=X[2]; dims[2]=X[3];
    break;
  case 2:
    face_Z = nLayers;
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[3];
    break;
  case 3:
    // face_T = nLayers;
    // BUG FIX: dims[2] was X[3] (copied from the dim==2 case); for the T
    // face the orthogonal extents are X[0], X[1], X[2].
    dims[0]=X[0]; dims[1]=X[1]; dims[2]=X[2];
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on offset of first element
  int face_parity = (param.parity + face_num *(X[dim] - nLayers)) & 1;

  // reconstruct full face index from index into the checkerboard
  face_idx *= 2;
  /*y,z,t here are face indexes in new order*/
  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  int idx = face_idx;
  int aux;

  // interior gap between the two faces, excluding the border regions
  int gap = X[dim] - nLayers - 2*R[dim];
  switch (dim) {
  case 0:
    aux = face_idx;
    idx += face_num*gap + aux*(X[0]-1);
    idx += (idx/V)*(1-V);
    idx += R[0];
    break;
  case 1:
    aux = face_idx / face_X;
    idx += face_num * gap * face_X + aux*(X[1]-1)*face_X;
    idx += idx/V*(X[0]-V);
    idx += R[1]*X[0];
    break;
  case 2:
    aux = face_idx / face_XY;
    idx += face_num * gap * face_XY +aux*(X[2]-1)*face_XY;
    idx += idx/V*(face_XY-V);
    idx += R[2]*face_XY;
    break;
  case 3:
    idx += ((face_num*gap) + R[3])*face_XYZ;
    break;
  }

  // return index into the checkerboard

  return idx >> 1;
}
351 
352 
/**
   @brief Compute the full-lattice coordinates from the input face
   index.  This is used by the staggered halo update kernels.

   Within each face the checkerboarded fastest dimension varies first
   (x1h or x0h below), then the two remaining orthogonal dimensions,
   with the face-dimension coordinate slowest.

   @tparam dim which exterior kernel (face dimension) we are working on
   @tparam nLayers depth of the face
   @tparam Dir face selector: with Dir == 1 the forward face is
   selected; with Dir == 2 the first nLayers slices map to the back
   face and the rest to the forward face (NOTE(review): inferred from
   the index arithmetic below — confirm against the callers)
   @param[out] x the computed full-lattice coordinates
   @param[in] idx checkerboard index into the face
   @param[in] param kernel parameter struct
 */
template<KernelType dim, int nLayers, int Dir, typename Param>
static inline __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
{
  const auto *X = param.dc.X;   // local lattice dimensions
  const auto *Xh = param.dc.Xh; // halved (checkerboard) dimensions

  int za, x1h, x0h, zb;
  switch(dim) {
  case EXTERIOR_KERNEL_X:
    za = idx/Xh[1];
    x1h = idx - za*Xh[1];
    zb = za / X[2];
    x[2] = za - zb*X[2];
    x[0] = zb/X[3];
    x[3] = zb - x[0]*X[3];
    if(Dir == 2){
      // fused face: shift forward-face slices past the interior gap
      x[0] += ((x[0] >= nLayers) ? (X[0] - 2*nLayers) : 0);
    }else if(Dir == 1){
      x[0] += (X[0] - nLayers); // forward face only
    }
    // reconstruct the checkerboarded coordinate from the site parity
    x[1] = 2*x1h + ((x[0] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Y:
    za = idx/Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[2];
    x[2] = za - zb*X[2];
    x[1] = zb/X[3];
    x[3] = zb - x[1]*X[3];
    if(Dir == 2){
      x[1] += ((x[1] >= nLayers) ? (X[1] - 2*nLayers) : 0);
    }else if(Dir == 1){
      x[1] += (X[1] - nLayers);
    }
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_Z:
    za = idx/Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[1];
    x[1] = za - zb*X[1];
    x[2] = zb / X[3];
    x[3] = zb - x[2]*X[3];
    if(Dir == 2){
      x[2] += ((x[2] >= nLayers) ? (X[2] - 2*nLayers) : 0);
    }else if(Dir == 1){
      x[2] += (X[2] - nLayers);
    }
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  case EXTERIOR_KERNEL_T:
    za = idx/Xh[0];
    x0h = idx - za*Xh[0];
    zb = za / X[1];
    x[1] = za - zb*X[1];
    x[3] = zb / X[2];
    x[2] = zb - x[3]*X[2];
    if(Dir == 2){
      x[3] += ((x[3] >= nLayers) ? (X[3] - 2*nLayers) : 0);
    }else if(Dir == 1){
      x[3] += (X[3] - nLayers);
    }
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + param.parity) & 1);
    break;
  }
  return;
}
428 
429 
/**
   @brief Compute the full-lattice coordinates and indices from an
   index into a ghost face.  This is used by the Wilson-like halo
   update kernels.  Works for arbitrary lattice dimensions provided at
   least one face dimension is even.

   @tparam nDim number of space-time dimensions (4, or 5 for domain wall)
   @tparam type domain-wall preconditioning type (QUDA_4D_PC or QUDA_5D_PC)
   @tparam dim_ dimension of the face (INTERIOR_KERNEL is mapped to 0)
   @tparam nLayers depth of the face
   @param[out] idx full lexicographic index into the local volume
   @param[out] cb_idx checkerboard index into the local volume
   @param[out] x the computed full-lattice coordinates
   @param[in] face_idx checkerboard index into the face
   @param[in] face_num which of the two faces (0 = low side, 1 = high side)
   @param[in] param kernel parameter struct
 */
template <int nDim, QudaDWFPCType type, int dim_, int nLayers, typename Int, typename Param>
static inline __device__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int * const x, int face_idx,
                                                  const int &face_num, const Param &param)
{
  constexpr int dim = (dim_ == INTERIOR_KERNEL) ? 0 : dim_; // silence compiler warning

  const auto *X = param.dc.X;
  const auto &face_X = param.dc.face_X[dim];
  const auto &face_Y = param.dc.face_Y[dim];
  const auto &face_Z = param.dc.face_Z[dim];
  const auto &face_T = param.dc.face_T[dim];
  const auto &face_XYZT = param.dc.face_XYZT[dim];
  const auto &face_XYZ = param.dc.face_XYZ[dim];
  const auto &face_XY = param.dc.face_XY[dim];

  // intrinsic parity of the face depends on offset of first element
  int face_parity = (param.parity + face_num * (param.dc.X[dim] - nLayers)) & 1;

  // compute coordinates from (checkerboard) face index
  face_idx *= 2;

  if (!(face_X & 1)) { // face_X even
    // s = face_idx / face_XYZT;
    // t = (face_idx / face_XYZ) % face_T;
    // z = (face_idx / face_XY) % face_Z;
    // y = (face_idx / face_X) % face_Y;
    // face_idx += (face_parity + s + t + z + y) & 1;
    // x = face_idx % face_X;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = face_idx / face_X;
    int aux2 = aux1 / face_Y;
    int aux3 = aux2 / face_Z;
    int aux4 = (nDim == 5 ? aux3 / face_T : 0);

    x[0] = face_idx - aux1 * face_X;
    x[1] = aux1 - aux2 * face_Y;
    x[2] = aux2 - aux3 * face_Z;
    x[3] = aux3 - aux4 * face_T;
    x[4] = aux4;

    if (type == QUDA_5D_PC) x[0] += (face_parity + x[4] + x[3] + x[2] + x[1]) & 1;
    else x[0] += (face_parity + x[3] + x[2] + x[1]) & 1;

  } else if (!(face_Y & 1)) { // face_Y even

    x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
    x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx/face_XYZ) );
    x[2] = (face_idx / face_XY) % face_Z;

    if (type == QUDA_5D_PC) face_idx += (face_parity + x[4] + x[3] + x[2]) & 1;
    else face_idx += (face_parity + x[3] + x[2]) & 1;

    // x and y are extracted only after the parity correction, since the
    // correction shifts the index within the (odd-X, even-Y) plane
    x[1] = (face_idx / face_X) % face_Y;
    x[0] = face_idx % face_X;

  } else if (!(face_Z & 1)) { // face_Z even

    x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
    x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ) );

    if (type == QUDA_5D_PC) face_idx += (face_parity + x[4] + x[3]) & 1;
    else if (type == QUDA_4D_PC) face_idx += (face_parity + x[3]) & 1;

    x[2] = (face_idx / face_XY) % face_Z;
    x[1] = (face_idx / face_X) % face_Y;
    x[0] = face_idx % face_X;

  } else { // only the fifth dimension can absorb the parity correction

    x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
    face_idx += face_parity;
    x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ) );
    x[2] = (face_idx / face_XY) % face_Z;
    x[1] = (face_idx / face_X) % face_Y;
    x[0] = face_idx % face_X;

  }

  // need to convert to global coords, not face coords
  x[dim] += face_num * (X[dim]-nLayers);

  // compute index into the full local volume
  idx = X[0]*(X[1]*(X[2]*(X[3]*x[4] + x[3]) + x[2]) + x[1]) + x[0];

  // compute index into the checkerboard
  cb_idx = idx >> 1;
}
528 
529 
// Identifies the first (fastest-varying) even lattice dimension; this
// selects which parity-reconstruction branch coordsFromIndex uses.
enum IndexType {
  EVEN_X = 0, // X[0] is even
  EVEN_Y = 1, // X[0] odd, X[1] even
  EVEN_Z = 2, // X[0], X[1] odd, X[2] even
  EVEN_T = 3  // X[0], X[1], X[2] odd, X[3] even
};
536 
/**
   @brief Compute the site coordinates and the full lexicographic
   index from an index into the checkerboard (used by the interior
   Dslash kernels).

   @tparam nDim number of space-time dimensions (4, or 5 for domain wall)
   @tparam pc_type domain-wall preconditioning type (QUDA_4D_PC or QUDA_5D_PC)
   @tparam idxType which is the first even lattice dimension (see IndexType)
   @param[out] idx full lexicographic index into the local volume
   @param[out] x the computed coordinates (x[4] is the fifth-dimension
   coordinate, 0 for nDim == 4)
   @param[in,out] cb_idx checkerboard index into the local volume
   @param[in] param kernel parameter struct
 */
template <int nDim, QudaDWFPCType pc_type, IndexType idxType, typename T, typename Param>
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
{

  // The full field index is
  // idx = x + y*X + z*X*Y + t*X*Y*Z
  // The parity of lattice site (x,y,z,t)
  // is defined to be (x+y+z+t) & 1
  // 0 => even parity
  // 1 => odd parity
  // cb_idx runs over the half volume
  // cb_idx = iidx/2 = (x + y*X + z*X*Y + t*X*Y*Z)/2
  //
  // We need to obtain idx from cb_idx + parity.
  //
  // 1) First, consider the case where X is even.
  // Then, y*X + z*X*Y + t*X*Y*Z is even and
  // 2*cb_idx = 2*(x/2) + y*X + z*X*Y + t*X*Y*Z
  // Since, 2*(x/2) is even, if y+z+t is even
  // (2*(x/2),y,z,t) is an even parity site.
  // Similarly, if y+z+t is odd
  // (2*(x/2),y,z,t) is an odd parity site.
  //
  // Note that (1+y+z+t)&1 = 1 for y+z+t even
  // and (1+y+z+t)&1 = 0 for y+z+t odd
  // Therefore,
  // (2*/(x/2) + (1+y+z+t)&1, y, z, t) is odd.
  //
  // 2) Consider the case where X is odd but Y is even.
  // Calculate 2*cb_idx
  // t = 2*cb_idx/XYZ
  // z = (2*cb_idx/XY) % Z
  //
  // Now, we need to compute (x,y) for different parities.
  // To select a site with even parity, consider (z+t).
  // If (z+t) is even, this implies that (x+y) must also
  // be even in order that (x+y+z+t) is even.
  // Therefore, x + y*X is even.
  // Thus, 2*cb_idx = idx
  // and y = (2*cb_idx/X) % Y
  // and x = (2*cb_idx) % X;
  //
  // On the other hand, if (z+t) is odd, (x+y) must be
  // also be odd in order to get overall even parity.
  // Then x + y*X is odd (since X is odd and either x or y is odd)
  // and 2*cb_idx = 2*(idx/2) = idx-1 = x + y*X -1 + z*X*Y + t*X*Y*Z
  // => idx = 2*cb_idx + 1
  // and y = ((2*cb_idx + 1)/X) % Y
  // and x = (2*cb_idx + 1) % X
  //
  // To select a site with odd parity if (z+t) is even,
  // (x+y) must be odd, which, following the discussion above, implies that
  // y = ((2*cb_idx + 1)/X) % Y
  // x = (2*cb_idx + 1) % X
  // Finally, if (z+t) is odd (x+y) must be even to get overall odd parity,
  // and
  // y = ((2*cb_idx)/X) % Y
  // x = (2*cb_idx) % X
  //
  // The code below covers these cases
  // as well as the cases where X, Y are odd and Z is even,
  // and X,Y,Z are all odd

  const auto *X = param.dc.X;

  int XYZT = param.dc.Vh << 1; // X[3]*X[2]*X[1]*X[0]
  int XYZ = param.dc.X3X2X1; // X[2]*X[1]*X[0]
  int XY = param.dc.X2X1; // X[1]*X[0]

  idx = 2*cb_idx;
  if (idxType == EVEN_X ) { // X even
    // t = idx / XYZ;
    // z = (idx / XY) % Z;
    // y = (idx / X) % Y;
    // idx += (parity + t + z + y) & 1;
    // x = idx % X;
    // equivalent to the above, but with fewer divisions/mods:
#if DSLASH_TUNE_TILE // tiled indexing - experimental and disabled for now
    // NOTE(review): this disabled branch looks broken as written — it
    // references t, z, y, s which are not declared in this scope, indexes
    // grid[4..7] with what appears to be a 4-element array, and "x += oddbit"
    // advances the coordinate pointer rather than updating x[0].  Confirm
    // and repair before ever enabling DSLASH_TUNE_TILE.
    const auto *block = param.block;
    const auto *grid = param.grid;

    int aux[9];
    aux[0] = idx;
    for (int i=0; i<4; i++) aux[i+1] = aux[i] / block[i];
    for (int i=4; i<8; i++) aux[i+1] = aux[i] / grid[i];

    for (int i=0; i<4; i++) x[i] = aux[i] - aux[i+1] * block[i];
    for (int i=0; i<4; i++) x[i] += block[i]*(aux[i+4] - aux[i+5] * grid[i]);
    x[4] = (nDim == 5) ? aux[8] : 0;

    int oddbit = (pc_type == QUDA_4D_PC ? (param.parity + t + z + y) : (param.parity + s + t + z + y)) & 1;
    x += oddbit;

    // update cb_idx for the swizzled coordinate
    cb_idx = (nDim == 5 ? (((x[4]*X[3]+x[3])*X[2]+x[2])*X[1]+x[1])*X[0]+x[0] : ((x[3]*X[2]+x[2])*X[1]+x[1])*X[0]+x[0]) >> 1;
    idx = 2*cb_idx + oddbit;
#else
    int aux[5];
    aux[0] = idx;
    for (int i=0; i<4; i++) aux[i+1] = aux[i] / X[i];

    x[0] = aux[0] - aux[1] * X[0];
    x[1] = aux[1] - aux[2] * X[1];
    x[2] = aux[2] - aux[3] * X[2];
    x[3] = aux[3] - (nDim == 5 ? aux[4] * X[3] : 0);
    x[4] = (nDim == 5) ? aux[4] : 0;

    // 5-d preconditioning includes the fifth-dimension coordinate in the parity
    int oddbit = (pc_type == QUDA_4D_PC ? (param.parity + x[3] + x[2] + x[1]) : (param.parity + x[4] + x[3] + x[2] + x[1])) & 1;

    x[0] += oddbit;
    idx += oddbit;
#endif
  } else if (idxType == EVEN_Y ) { // Y even
    x[4] = idx / XYZT;
    x[3] = (idx / XYZ) % X[3];
    x[2] = (idx / XY) % X[2];
    idx += (param.parity + x[3] + x[2]) & 1;
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else if (idxType == EVEN_Z ) { // Z even
    x[4] = idx / XYZT;
    x[3] = (idx / XYZ) % X[3];
    idx += (param.parity + x[3]) & 1;
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } else {
    x[4] = idx / XYZT;
    idx += (param.parity + x[4]) & 1;
    x[3] = idx / XYZ;
    x[2] = (idx / XY) % X[2];
    x[1] = (idx / X[0]) % X[1];
    x[0] = idx % X[0];
  } // else we do not support all odd local dimensions except fifth dimension
}
684 
685 
/**
   @brief Compute coordinates from an index into the checkerboard,
   using a 3-d thread-block decomposition: the x thread index covers
   the (checkerboarded) x-t plane while the y and z thread indices map
   directly onto the y and z coordinates.  Only even X[0] is supported.

   @tparam idxType must be EVEN_X; any other value is a no-op
   @param[out] idx full lexicographic index into the local volume
   @param[out] x the computed coordinates
   @param[out] cb_idx checkerboard index into the local volume
   @param[in] param kernel parameter struct (supplies the dimensions param.x)
 */
template <IndexType idxType, typename Int, typename Param>
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int * const x, int &cb_idx, const Param &param) {
  if (idxType != EVEN_X) return; // non-even X is not (yet) supported

  const auto *X = param.x;

  // the x thread index enumerates checkerboarded (x,t) pairs
  const int xt_cb = blockIdx.x*blockDim.x + threadIdx.x;
  const int xt = xt_cb + xt_cb;

  x[3] = xt / X[0];
  x[0] = xt - x[3]*X[0];
  x[1] = blockIdx.y*blockDim.y + threadIdx.y;
  x[2] = blockIdx.z*blockDim.z + threadIdx.z;

  // restore the checkerboarded x coordinate from the site parity
  x[0] += (param.parity + x[3] + x[2] + x[1]) &1;

  idx = ((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0];
  cb_idx = idx >> 1;
}
715 
716 
/**
   @brief Compute whether the provided coordinate lies within the halo
   boundary region (of the given depth) of dimension dim, on either
   the low or the high side.

   @tparam dim dimension we are querying
   @param[in] depth width of the boundary region
   @param[in] coord site coordinates
   @param[in] X lattice dimensions
   @return true if coord[dim] is within depth sites of either end
 */
template <int dim, typename T>
static inline __device__ bool inBoundary(const int depth, const int coord[], const T X[]){
  const bool low_side  = coord[dim] < depth;
  const bool high_side = coord[dim] >= X[dim] - depth;
  return low_side || high_side;
}
730 
731 
/**
   @brief Compute whether this thread should be active for updating
   the given offsetDim halo.  Used by the fused halo-region update
   kernels: every thread has a prescribed dimension (threadDim), and a
   thread with dimension d may also apply offsets in dimensions below
   d.  At edges and vertices the thread belonging to the highest
   partitioned dimension takes responsibility, so lower-dimension
   threads cede whenever the site sits in the boundary of a higher
   partitioned dimension.

   @param[in] threadDim dimension this thread is prescribed to update
   @param[in] offsetDim dimension of the offset being applied
   @param[in] offset the (signed) offset being applied
   @param[in] y site coordinates
   @param[in] partitioned which dimensions are partitioned
   @param[in] X lattice dimensions
   @return whether this thread performs the update
 */
template <typename T>
static inline __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
{
  // a thread can only handle offsets in its own dimension or below,
  // and only in dimensions that are actually partitioned
  if (!partitioned[offsetDim]) return false;
  if (threadDim < offsetDim) return false;

  const int width = (offset > 0) ? offset : -offset;

  // cede to the higher-dimension thread whenever this site lies in the
  // halo boundary of any partitioned dimension above threadDim
  switch (threadDim) {
  case 2: // threadDim = Z: defer to T
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    break;

  case 1: // threadDim = Y: defer to T, Z
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    break;

  case 0: // threadDim = X: defer to T, Z, Y
    if (partitioned[3] && inBoundary<3>(width, y, X)) return false;
    if (partitioned[2] && inBoundary<2>(width, y, X)) return false;
    if (partitioned[1] && inBoundary<1>(width, y, X)) return false;
    break;

  default: // threadDim = T: highest dimension, never defers
    break;
  }

  return true;
}
792 
793 
/**
   @brief Determine which face a given thread is computing, and
   rescale face_idx so that it is relative to that face's thread
   range (with the fifth-dimension coordinate folded back in).

   @tparam nDim number of space-time dimensions (4, or 5 for domain wall)
   @param[out] face_idx face index relative to the selected dimension
   @param[in] tid raw thread index
   @param[in] param kernel parameter struct (supplies the thread-range maps)
   @return the dimension this thread updates
 */
template <int nDim=4, typename Param>
__device__ inline int dimFromFaceIndex(int &face_idx, const int tid, const Param &param){

  // s - the coordinate in the fifth dimension - is the slowest-changing coordinate
  const int s = (nDim == 5 ? tid/param.threads : 0);
  face_idx = tid - s*param.threads; // face_idx = face_idx % param.threads

  // locate which dimension's thread range contains this index
  int dim;
  if      (face_idx < param.threadDimMapUpper[0]) dim = 0;
  else if (face_idx < param.threadDimMapUpper[1]) dim = 1;
  else if (face_idx < param.threadDimMapUpper[2]) dim = 2;
  else                                            dim = 3;

  // rebase onto this dimension's range and restore the s coordinate
  const int lower = (dim == 0) ? 0 : param.threadDimMapLower[dim];
  face_idx -= lower;
  face_idx += s * (param.threadDimMapUpper[dim] - lower);

  return dim;
}
832 
/**
   @brief Convenience overload where the thread index and the face
   index are one and the same on entry.
 */
template <int nDim=4, typename Param>
__device__ inline int dimFromFaceIndex(int &face_idx, const Param &param)
{
  return dimFromFaceIndex<nDim>(face_idx, face_idx, param);
}
835 
/**
   @brief Compute the checkerboard face index from the full lattice
   coordinates.

   @tparam nDim number of space-time dimensions (4 or 5)
   @tparam nLayers depth of the face
   @param[out] face_idx the computed checkerboard face index
   @param[in] x the full lattice coordinates
   @param[in] face_dim dimension of the face
   @param[in] param kernel parameter struct (supplies the dimensions)
 */
template<int nDim, int nLayers, typename I, typename Param>
static inline __device__ void faceIndexFromCoords(int &face_idx, I * const x, int face_dim, const Param &param)
{
  // local copies of the lattice extents and the site coordinates
  int dims[4] = {param.dc.X[0], param.dc.X[1], param.dc.X[2], param.dc.X[3]};
  int c[5] = {x[0], x[1], x[2], x[3], (nDim==5 ? x[4] : 0)};

  // map the face-dimension coordinate onto the [0, nLayers) slab:
  // high-face sites are shifted down past the interior gap
  if (c[face_dim] >= nLayers) c[face_dim] -= dims[face_dim] - nLayers;
  dims[face_dim] = nLayers;

  // lexicographic index within the face, then checkerboard it
  if (nDim == 5) {
    face_idx = (((((dims[3]*c[4] + c[3])*dims[2] + c[2])*dims[1] + c[1])*dims[0] + c[0]) >> 1);
  } else if (nDim == 4) {
    face_idx = ((((dims[2]*c[3] + c[2])*dims[1] + c[1])*dims[0] + c[0]) >> 1);
  }
}
859 
860 
//#define SWIZZLE

/**
   @brief Swizzler for reordering the (x) thread-block indices - use
   in conjunction with swizzle-factor autotuning to find the optimum
   swizzle factor.  When enabled, the block indices in the portion of
   the grid divisible by the swizzle factor are transposed.
 */
template <typename T>
__device__ inline int block_idx(const T &swizzle) {
#ifdef SWIZZLE
  // the portion of the grid that is exactly divisible by the number of SMs
  const int gridp = gridDim.x - gridDim.x % swizzle;

  // blocks in the remainder portion keep their natural index
  if (blockIdx.x >= gridp) return blockIdx.x;

  // transpose the (row, col) coordinates of this block
  const int row = blockIdx.x % swizzle;
  const int col = blockIdx.x / swizzle;
  return row * (gridp / swizzle) + col;
#else
  return blockIdx.x;
#endif
}
902 
903 
904 
/*
   @brief Fast power function that works for negative "a" argument
   @param a argument we want to raise to some power
   @param b power that we want to raise a to
   @return pow(a,b)
*/
__device__ inline float __fast_pow(float a, int b) {
  // |a|^b via the fast intrinsic, then restore the sign: the result is
  // negative only when the base is negative and the exponent is odd
  const float power = __powf(fabsf(a), b);
  const bool negative = signbit(a) && (b & 1);
  return negative ? -power : power;
}
916 
917 
918 
static __device__ void coordsFromFaceIndexStaggered(int x[], int idx, const Param &param)
Compute the full-lattice coordinates from the input face index. This is used by the staggered halo up...
dim3 dim3 blockDim
size_t const void size_t size_t width
float fabsf(float)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
static int R[4]
static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int *const x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
size_t size_t offset
static __device__ int indexFromFaceIndexExtended(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
__device__ int dimFromFaceIndex(int &face_idx, const int tid, const Param &param)
Determines which face a given thread is computing. Also rescale face_idx so that is relative to a giv...
static __device__ int indexFromFaceIndexStaggered(int face_idx_in, const Param &param)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
IndexType
static __device__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, const Param &param)
Compute the full-lattice coordinates from the input face index. This is used by the Wilson-like halo ...
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __device__ void faceIndexFromCoords(int &face_idx, I *const x, int face_dim, const Param &param)
Compute the face index from the lattice coordinates.
int V
Definition: test_util.cpp:28
const auto * Xh
int X[4]
Definition: quda.h:29
static __device__ bool inBoundary(const int depth, const int coord[], const T X[])
Compute whether the provided coordinate is within the halo region boundary of a given dimension...
__device__ float __fast_pow(float a, int b)
static __device__ int indexFromFaceIndex(int face_idx, const Param &param)
Compute global checkerboard index from face index. The following indexing routines work for arbitrary...
if(err !=cudaSuccess)
int face_idx
const int face_num
static __device__ int indexFromFaceIndexExtendedStaggered(int face_idx, const Param &param)
Compute global extended checkerboard index from face index. The following indexing routines work for ...
#define a
__device__ int block_idx(const T &swizzle)
Swizzler for reordering the (x) thread block indices - use in conjunction with swizzle-factor autotun...