QUDA 1.0.0: index_helper.cuh
#pragma once

namespace quda {
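  /**
     @brief Compute the checkerboard 1-d index of the site reached by
     shifting x by dx, with periodic wrapping in each dimension.
     @param[in] x Local site coordinates
     @param[in] dx Shift applied in each dimension (may be negative)
     @param[in] X Local lattice dimensions
     @return Checkerboard index of the shifted site

     Illustrative usage, assuming a hypothetical 4^4 local volume:
     @code
     int X[4] = {4, 4, 4, 4};
     int x[4] = {1, 2, 3, 0};
     int dx[4] = {-1, 0, 0, 0};          // hop one site in the negative x direction
     int nbr = linkIndexShift(x, dx, X); // checkerboard index of site (0,2,3,0)
     @endcode
  */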
  template <typename I, typename J, typename K>
  static __device__ __host__ inline int linkIndexShift(const I x[], const J dx[], const K X[4]) {
    int y[4];
#pragma unroll
    for ( int i = 0; i < 4; i++ ) y[i] = (x[i] + dx[i] + X[i]) % X[i];
    int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
    return idx;
  }

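  /**
     @brief Overload of linkIndexShift that also returns the wrapped
     coordinates of the shifted site in y.
     @param[out] y Coordinates of the shifted site
     @param[in] x Local site coordinates
     @param[in] dx Shift applied in each dimension (may be negative)
     @param[in] X Local lattice dimensions
     @return Checkerboard index of the shifted site
  */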
  template <typename I, typename J, typename K>
  static __device__ __host__ inline int linkIndexShift(I y[], const I x[], const J dx[], const K X[4]) {
#pragma unroll
    for ( int i = 0; i < 4; i++ ) y[i] = (x[i] + dx[i] + X[i]) % X[i];
    int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
    return idx;
  }

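  /**
     @brief Compute the checkerboard 1-d index for the site x.
     @param[in] x Local site coordinates
     @param[in] X Local lattice dimensions
     @return Checkerboard index of the site
  */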
  template <typename I>
  static __device__ __host__ inline int linkIndex(const int x[], const I X[4]) {
    int idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    return idx;
  }

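  /**
     @brief Overload of linkIndex that also copies the site
     coordinates into y.
     @param[out] y Copy of the site coordinates
     @param[in] x Local site coordinates
     @param[in] X Local lattice dimensions
     @return Checkerboard index of the site
  */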
  template <typename I>
  static __device__ __host__ inline int linkIndex(int y[], const int x[], const I X[4]) {
    int idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    y[0] = x[0]; y[1] = x[1]; y[2] = x[2]; y[3] = x[3];
    return idx;
  }

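  /**
     @brief Compute the checkerboard 1-d index of the site displaced
     by n (a compile-time constant, possibly negative) in the mu
     direction, with periodic wrapping.
     @param[in] x Local site coordinates
     @param[in] X Local lattice dimensions
     @param[in] mu Direction of the displacement
     @return Checkerboard index of the displaced site
  */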
  template <typename I, int n>
  static __device__ __host__ inline int linkIndexDn(const int x[], const I X[4], const int mu)
  {
    int y[4];
#pragma unroll
    for ( int i = 0; i < 4; i++ ) y[i] = x[i];
    y[mu] = (y[mu] + n + X[mu]) % X[mu];
    int idx = (((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0]) >> 1;
    return idx;
  }

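  /**
     @brief Compute the checkerboard 1-d index of the site at x - 1
     in the mu direction (linkIndexDn with n = -1).
  */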
  template <typename I> static __device__ __host__ inline int linkIndexM1(const int x[], const I X[4], const int mu)
  {
    return linkIndexDn<I, -1>(x, X, mu);
  }

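  /**
     @brief Compute the checkerboard 1-d index of the site at x - 3
     in the mu direction (linkIndexDn with n = -3).
  */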
  template <typename I> static __device__ __host__ inline int linkIndexM3(const int x[], const I X[4], const int mu)
  {
    return linkIndexDn<I, -3>(x, X, mu);
  }

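  /**
     @brief Compute the full-lattice (non-checkerboard) 1-d index of
     the site at x + 1 in the mu direction, with periodic wrapping.
  */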
  template <typename I>
  static __device__ __host__ inline int linkNormalIndexP1(const int x[], const I X[4], const int mu) {
    int y[4];
#pragma unroll
    for ( int i = 0; i < 4; i++ ) y[i] = x[i];
    y[mu] = (y[mu] + 1 + X[mu]) % X[mu];
    int idx = ((y[3] * X[2] + y[2]) * X[1] + y[1]) * X[0] + y[0];
    return idx;
  }

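  /**
     @brief Compute the checkerboard 1-d index of the site at x + 1
     in the mu direction (linkIndexDn with n = +1).
  */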
  template <typename I>
  static __device__ __host__ inline int linkIndexP1(const int x[], const I X[4], const int mu) {
    return linkIndexDn<I, 1>(x, X, mu);
  }

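  /**
     @brief Compute the checkerboard 1-d index of the site at x + 3
     in the mu direction (linkIndexDn with n = +3).
  */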
  template <typename I> static __device__ __host__ inline int linkIndexP3(const int x[], const I X[4], const int mu)
  {
    return linkIndexDn<I, 3>(x, X, mu);
  }

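  /**
     @brief Compute the checkerboard 1-d index for the nearest
     neighbor of site x in direction dir (+1 or -1) along dimension
     mu, with periodic wrapping at the boundary.
     @param[in] x Local site coordinates
     @param[in] mu Dimension of the hop
     @param[in] dir Direction of the hop (+1 or -1)
     @param[in] arg Kernel argument struct providing the dimensions X
     and precomputed dimension products (X2X1, X3X2X1, etc.)
     @return Checkerboard index of the neighboring site
  */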
  template <int nDim = 4, typename Arg>
  static __device__ __host__ inline int getNeighborIndexCB(const int x[], int mu, int dir, const Arg &arg)
  {
    int idx = nDim == 4 ? ((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0] :
      (((x[4] * arg.X[3] + x[3]) * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0];
    switch (dir) {
    case +1: // positive direction
      switch (mu) {
      case 0: return (x[0] == arg.X[0] - 1 ? idx - (arg.X[0] - 1) : idx + 1) >> 1;
      case 1: return (x[1] == arg.X[1] - 1 ? idx - arg.X2X1mX1 : idx + arg.X[0]) >> 1;
      case 2: return (x[2] == arg.X[2] - 1 ? idx - arg.X3X2X1mX2X1 : idx + arg.X2X1) >> 1;
      case 3: return (x[3] == arg.X[3] - 1 ? idx - arg.X4X3X2X1mX3X2X1 : idx + arg.X3X2X1) >> 1;
      case 4: return (x[4] == arg.X[4] - 1 ? idx - arg.X5X4X3X2X1mX4X3X2X1 : idx + arg.X4X3X2X1) >> 1;
      }
    case -1: // negative direction
      switch (mu) {
      case 0: return (x[0] == 0 ? idx + (arg.X[0] - 1) : idx - 1) >> 1;
      case 1: return (x[1] == 0 ? idx + arg.X2X1mX1 : idx - arg.X[0]) >> 1;
      case 2: return (x[2] == 0 ? idx + arg.X3X2X1mX2X1 : idx - arg.X2X1) >> 1;
      case 3: return (x[3] == 0 ? idx + arg.X4X3X2X1mX3X2X1 : idx - arg.X3X2X1) >> 1;
      case 4: return (x[4] == 0 ? idx + arg.X5X4X3X2X1mX4X3X2X1 : idx - arg.X4X3X2X1) >> 1;
      }
    }
    return 0; // should never reach here
  }

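  /**
     @brief Compute the full-lattice coordinates from a checkerboard
     index and parity.
     @param[out] x Computed coordinates
     @param[in] cb_index Checkerboard site index
     @param[in] X Local lattice dimensions
     @param[in] X0h Half of the x-dimension, X[0] / 2
     @param[in] parity Site parity (0 or 1)

     Illustrative round trip, assuming a hypothetical 4^4 local volume:
     @code
     int X[4] = {4, 4, 4, 4};
     int x[4];
     getCoordsCB(x, 7, X, X[0] / 2, 1); // coordinates of odd-parity site 7
     int cb = linkIndex(x, X);          // recovers cb == 7
     @endcode
  */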
  template <typename I, typename J>
  static __device__ __host__ inline void getCoordsCB(int x[], int cb_index, const I X[], J X0h, int parity)
  {
    // x[3] = cb_index / (X[2]*X[1]*X[0]/2);
    // x[2] = (cb_index / (X[1]*X[0]/2)) % X[2];
    // x[1] = (cb_index / (X[0]/2)) % X[1];
    // x[0] = 2 * (cb_index % (X[0]/2)) + ((x[3]+x[2]+x[1]+parity) & 1);

    int za = (cb_index / X0h);
    int zb = (za / X[1]);
    x[1] = (za - zb * X[1]);
    x[3] = (zb / X[2]);
    x[2] = (zb - x[3] * X[2]);
    int x1odd = (x[1] + x[2] + x[3] + parity) & 1;
    x[0] = (2 * cb_index + x1odd - za * X[0]);
    return;
  }

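  /**
     @brief Compute the full-lattice coordinates from a checkerboard
     index and parity (convenience wrapper that infers X0h = X[0] / 2).
  */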
  template <typename I> static __device__ __host__ inline void getCoords(int x[], int cb_index, const I X[], int parity)
  {
    getCoordsCB(x, cb_index, X, X[0] >> 1, parity);
  }

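  /**
     @brief Compute the coordinates on an extended (bordered) lattice
     from a checkerboard index and parity, shifting each coordinate
     by the border radius R[d].
     @param[out] x Computed coordinates
     @param[in] cb_index Checkerboard site index
     @param[in] X Local lattice dimensions
     @param[in] parity Site parity (0 or 1)
     @param[in] R Border radius in each dimension
  */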
  template <typename I, typename J>
  static __device__ __host__ inline void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[]) {
    // x[3] = cb_index / (X[2]*X[1]*X[0]/2);
    // x[2] = (cb_index / (X[1]*X[0]/2)) % X[2];
    // x[1] = (cb_index / (X[0]/2)) % X[1];
    // x[0] = 2 * (cb_index % (X[0]/2)) + ((x[3]+x[2]+x[1]+parity) & 1);

    int za = (cb_index / (X[0] >> 1));
    int zb = (za / X[1]);
    x[1] = (za - zb * X[1]);
    x[3] = (zb / X[2]);
    x[2] = (zb - x[3] * X[2]);
    int x1odd = (x[1] + x[2] + x[3] + parity) & 1;
    x[0] = (2 * cb_index + x1odd - za * X[0]);
#pragma unroll
    for (int d = 0; d < 4; d++) x[d] += R[d];
    return;
  }

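  /**
     @brief Compute the five-dimensional coordinates from a
     checkerboard index and parity. For QUDA_5D_PC the fifth
     dimension x[4] contributes to the site parity; for QUDA_4D_PC it
     does not.
     @param[out] x Computed coordinates
     @param[in] cb_index Checkerboard site index
     @param[in] X Local lattice dimensions
     @param[in] X0h Half of the x-dimension, X[0] / 2
     @param[in] parity Site parity (0 or 1)
     @param[in] pc_type Preconditioning type (QUDA_4D_PC or QUDA_5D_PC)
  */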
  template <typename I, typename J>
  static __device__ __host__ inline void getCoords5CB(
      int x[5], int cb_index, const I X[5], J X0h, int parity, QudaPCType pc_type)
  {
    // x[4] = cb_index / (X[3]*X[2]*X[1]*X[0]/2);
    // x[3] = (cb_index / (X[2]*X[1]*X[0]/2)) % X[3];
    // x[2] = (cb_index / (X[1]*X[0]/2)) % X[2];
    // x[1] = (cb_index / (X[0]/2)) % X[1];
    // x[0] = 2 * (cb_index % (X[0]/2)) + ((x[3]+x[2]+x[1]+parity) & 1);

    int za = (cb_index / X0h);
    int zb = (za / X[1]);
    x[1] = za - zb * X[1];
    int zc = zb / X[2];
    x[2] = zb - zc * X[2];
    x[4] = (zc / X[3]);
    x[3] = zc - x[4] * X[3];
    int x1odd = (x[1] + x[2] + x[3] + (pc_type == QUDA_5D_PC ? x[4] : 0) + parity) & 1;
    x[0] = (2 * cb_index + x1odd) - za * X[0];
    return;
  }

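  /**
     @brief Compute the five-dimensional coordinates from a
     checkerboard index and parity (convenience wrapper that infers
     X0h = X[0] / 2).
  */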
  template <typename I>
  static __device__ __host__ inline void getCoords5(int x[5], int cb_index, const I X[5], int parity, QudaPCType pc_type)
  {
    getCoords5CB(x, cb_index, X, X[0] >> 1, parity, pc_type);
  }

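  /**
     @brief Compute the full-lattice 1-d index from a checkerboard
     index and parity.
     @param[in] cb_index Checkerboard site index
     @param[in] X Local lattice dimensions
     @param[in] parity Site parity (0 or 1)
     @return Full-lattice index, i.e., 2 * cb_index + x1odd

     Illustrative usage, assuming a hypothetical 4^4 local volume:
     @code
     int X[4] = {4, 4, 4, 4};
     int full = getIndexFull(7, X, 1); // full-lattice index of odd-parity site 7
     @endcode
  */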
  template <typename I>
  static __device__ __host__ inline int getIndexFull(int cb_index, const I X[4], int parity) {
    int za = (cb_index / (X[0] / 2));
    int zb = (za / X[1]);
    int x1 = za - zb * X[1];
    int x3 = (zb / X[2]);
    int x2 = zb - x3 * X[2];
    int x1odd = (x1 + x2 + x3 + parity) & 1;
    return 2 * cb_index + x1odd;
  }

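  /**
     @brief Compute the checkerboard index into the ghost (halo)
     buffer for the face of depth nFace in dimension dim, on the
     dir = 0 (back) or dir = 1 (front) side of the local lattice.
     Handles both 4-d and 5-d fields via the nDim template parameter.
     @param[in] x_ Local site coordinates
     @param[in] X_ Local lattice dimensions
     @param[in] dim Dimension of the face
     @param[in] nFace Depth of the face
     @return Checkerboard ghost index
  */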
  template <int dir, int nDim = 4, typename I>
  __device__ __host__ inline int ghostFaceIndex(const int x_[], const I X_[], int dim, int nFace)
  {
    static_assert((nDim == 4 || nDim == 5), "Number of dimensions must be 4 or 5");
    int index = 0;
    const int x[] = {x_[0], x_[1], x_[2], x_[3], nDim == 5 ? x_[4] : 0};
    const int X[] = {(int)X_[0], (int)X_[1], (int)X_[2], (int)X_[3], nDim == 5 ? (int)X_[4] : 1};

    switch (dim) {
    case 0:
      switch (dir) {
      case 0:
        index = (x[0] * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1]) + x[2] * X[1] + x[1]) >> 1;
        break;
      case 1:
        index = ((x[0] - X[0] + nFace) * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1]) + x[2] * X[1] + x[1]) >> 1;
        break;
      }
      break;
    case 1:
      switch (dir) {
      case 0:
        index = (x[1] * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0] + x[2] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[1] - X[1] + nFace) * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0] + x[2] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    case 2:
      switch (dir) {
      case 0:
        index = (x[2] * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0] + x[1] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[2] - X[2] + nFace) * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0] + x[1] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    case 3:
      switch (dir) {
      case 0:
        index = (x[3] * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0] + x[1] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[3] - X[3] + nFace) * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0] + x[1] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    }
    return index;
  }

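  /**
     @brief Variant of ghostFaceIndex used by the staggered halo
     kernels: identical in structure, except that on the dir = 0 side
     the face coordinate is offset by nFace - 1.
  */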
  template <int dir, int nDim = 4, typename I>
  __device__ __host__ inline int ghostFaceIndexStaggered(const int x_[], const I X_[], int dim, int nFace)
  {
    static_assert((nDim == 4 || nDim == 5), "Number of dimensions must be 4 or 5");
    int index = 0;
    const int x[] = {x_[0], x_[1], x_[2], x_[3], nDim == 5 ? x_[4] : 0};
    const int X[] = {(int)X_[0], (int)X_[1], (int)X_[2], (int)X_[3], nDim == 5 ? (int)X_[4] : 1};

    switch (dim) {
    case 0:
      switch (dir) {
      case 0:
        index = ((x[0] + nFace - 1) * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1])
                 + x[2] * X[1] + x[1]) >> 1;
        break;
      case 1:
        index = ((x[0] - X[0] + nFace) * X[4] * X[3] * X[2] * X[1] + x[4] * X[3] * X[2] * X[1] + x[3] * (X[2] * X[1])
                 + x[2] * X[1] + x[1]) >> 1;
        break;
      }
      break;
    case 1:
      switch (dir) {
      case 0:
        index = ((x[1] + nFace - 1) * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0]
                 + x[2] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[1] - X[1] + nFace) * X[4] * X[3] * X[2] * X[0] + x[4] * X[3] * X[2] * X[0] + x[3] * X[2] * X[0]
                 + x[2] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    case 2:
      switch (dir) {
      case 0:
        index = ((x[2] + nFace - 1) * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0]
                 + x[1] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[2] - X[2] + nFace) * X[4] * X[3] * X[1] * X[0] + x[4] * X[3] * X[1] * X[0] + x[3] * X[1] * X[0]
                 + x[1] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    case 3:
      switch (dir) {
      case 0:
        index = ((x[3] + nFace - 1) * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0]
                 + x[1] * X[0] + x[0]) >> 1;
        break;
      case 1:
        index = ((x[3] - X[3] + nFace) * X[4] * X[2] * X[1] * X[0] + x[4] * X[2] * X[1] * X[0] + x[2] * X[1] * X[0]
                 + x[1] * X[0] + x[0]) >> 1;
        break;
      }
      break;
    }
    return index;
  }

  enum KernelType {
    INTERIOR_KERNEL = 5,
    EXTERIOR_KERNEL_ALL = 6,
    EXTERIOR_KERNEL_X = 0,
    EXTERIOR_KERNEL_Y = 1,
    EXTERIOR_KERNEL_Z = 2,
    EXTERIOR_KERNEL_T = 3,
    KERNEL_POLICY = 7
  };

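  /**
     @brief Compute the full-lattice coordinates from the input face
     index. This is used by the Wilson-like halo update kernels, and
     handles 4-d or 5-d fields with 4-d or 5-d preconditioning.
     @param[out] idx Full-lattice linear index
     @param[out] cb_idx Checkerboard index
     @param[out] x Full-lattice coordinates
     @param[in] face_idx Checkerboard index within the face
     @param[in] face_num Which face (0 = back, 1 = front)
     @param[in] parity Site parity
     @param[in] arg Kernel argument struct
  */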
  template <int nDim, QudaPCType type, int dim_, int nLayers, typename Int, typename Arg>
  inline __device__ __host__ void coordsFromFaceIndex(
      int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, int parity, const Arg &arg)
  {
    constexpr int dim = (dim_ == INTERIOR_KERNEL || dim_ == EXTERIOR_KERNEL_ALL) ? 0 : dim_; // silence compiler warning

    const auto *X = arg.dc.X;
    const auto &face_X = arg.dc.face_X[dim];
    const auto &face_Y = arg.dc.face_Y[dim];
    const auto &face_Z = arg.dc.face_Z[dim];
    const auto &face_T = arg.dc.face_T[dim];
    const auto &face_XYZT = arg.dc.face_XYZT[dim];
    const auto &face_XYZ = arg.dc.face_XYZ[dim];
    const auto &face_XY = arg.dc.face_XY[dim];

    // intrinsic parity of the face depends on offset of first element
    int face_parity = (parity + face_num * (arg.dc.X[dim] - nLayers)) & 1;

    // compute coordinates from (checkerboard) face index
    face_idx *= 2;

    if (!(face_X & 1)) { // face_X even
      // s = face_idx / face_XYZT;
      // t = (face_idx / face_XYZ) % face_T;
      // z = (face_idx / face_XY) % face_Z;
      // y = (face_idx / face_X) % face_Y;
      // face_idx += (face_parity + s + t + z + y) & 1;
      // x = face_idx % face_X;
      // equivalent to the above, but with fewer divisions/mods:
      int aux1 = face_idx / face_X;
      int aux2 = aux1 / face_Y;
      int aux3 = aux2 / face_Z;
      int aux4 = (nDim == 5 ? aux3 / face_T : 0);

      x[0] = face_idx - aux1 * face_X;
      x[1] = aux1 - aux2 * face_Y;
      x[2] = aux2 - aux3 * face_Z;
      x[3] = aux3 - aux4 * face_T;
      x[4] = aux4;

      if (type == QUDA_5D_PC)
        x[0] += (face_parity + x[4] + x[3] + x[2] + x[1]) & 1;
      else
        x[0] += (face_parity + x[3] + x[2] + x[1]) & 1;

    } else if (!(face_Y & 1)) { // face_Y even

      x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
      x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));
      x[2] = (face_idx / face_XY) % face_Z;

      if (type == QUDA_5D_PC)
        face_idx += (face_parity + x[4] + x[3] + x[2]) & 1;
      else
        face_idx += (face_parity + x[3] + x[2]) & 1;

      x[1] = (face_idx / face_X) % face_Y;
      x[0] = face_idx % face_X;

    } else if (!(face_Z & 1)) { // face_Z even

      x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
      x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));

      if (type == QUDA_5D_PC)
        face_idx += (face_parity + x[4] + x[3]) & 1;
      else if (type == QUDA_4D_PC)
        face_idx += (face_parity + x[3]) & 1;

      x[2] = (face_idx / face_XY) % face_Z;
      x[1] = (face_idx / face_X) % face_Y;
      x[0] = face_idx % face_X;

    } else {

      x[4] = (nDim == 5 ? face_idx / face_XYZT : 0);
      face_idx += face_parity;
      x[3] = (nDim == 5 ? (face_idx / face_XYZ) % face_T : (face_idx / face_XYZ));
      x[2] = (face_idx / face_XY) % face_Z;
      x[1] = (face_idx / face_X) % face_Y;
      x[0] = face_idx % face_X;
    }

    // need to convert to global coords, not face coords
    x[dim] += face_num * (X[dim] - nLayers);

    // compute index into the full local volume
    idx = X[0] * (X[1] * (X[2] * (X[3] * x[4] + x[3]) + x[2]) + x[1]) + x[0];

    // compute index into the checkerboard
    cb_idx = idx >> 1;
  }

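  /**
     @brief Overload of coordsFromFaceIndex that uses the parity
     defined in the kernel argument struct.
  */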
  template <int nDim, QudaPCType type, int dim_, int nLayers, typename Int, typename Arg>
  inline __device__ __host__ void coordsFromFaceIndex(
      int &idx, int &cb_idx, Int *const x, int face_idx, const int &face_num, const Arg &arg)
  {
    coordsFromFaceIndex<nDim, type, dim_, nLayers>(idx, cb_idx, x, face_idx, face_num, arg.parity, arg);
  }

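  /**
     @brief Compute the checkerboard lattice index from the input
     face index. This is used by the Wilson-like halo update kernels,
     and handles 4-d or 5-d preconditioning.
     @param[in] face_idx Checkerboard index within the face
     @param[in] parity Site parity
     @param[in] arg Kernel argument struct
     @return Checkerboard index into the local volume
  */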
  template <int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg>
  inline __device__ __host__ int indexFromFaceIndex(int face_idx, int parity, const Arg &arg)
  {
    // intrinsic parity of the face depends on offset of first element
    int face_parity = (parity + face_num * (arg.dc.X[dim] - nLayers)) & 1;

    // reconstruct full face index from index into the checkerboard
    face_idx *= 2;

    if (!(arg.dc.face_X[dim] & 1)) { // face_X even
      // int s = face_idx / face_XYZT;
      // int t = (face_idx / face_XYZ) % face_T;
      // int z = (face_idx / face_XY) % face_Z;
      // int y = (face_idx / face_X) % face_Y;
      // face_idx += (face_parity + s + t + z + y) & 1;
      // equivalent to the above, but with fewer divisions/mods:
      int aux1 = face_idx / arg.dc.face_X[dim];
      int aux2 = aux1 / arg.dc.face_Y[dim];
      int aux3 = aux2 / arg.dc.face_Z[dim];
      int aux4 = (nDim == 5 ? aux3 / arg.dc.face_T[dim] : 0);

      int y = aux1 - aux2 * arg.dc.face_Y[dim];
      int z = aux2 - aux3 * arg.dc.face_Z[dim];
      int t = aux3 - aux4 * arg.dc.face_T[dim];
      int s = aux4;

      if (type == QUDA_5D_PC)
        face_idx += (face_parity + s + t + z + y) & 1;
      else if (type == QUDA_4D_PC)
        face_idx += (face_parity + t + z + y) & 1;

    } else if (!(arg.dc.face_Y[dim] & 1)) { // face_Y even

      int s = face_idx / arg.dc.face_XYZT[dim];
      int t = (nDim == 5 ? (face_idx / arg.dc.face_XYZ[dim]) % arg.dc.face_T[dim] : (face_idx / arg.dc.face_XYZ[dim]));
      int z = (face_idx / arg.dc.face_XY[dim]) % arg.dc.face_Z[dim];
      if (type == QUDA_5D_PC)
        face_idx += (face_parity + s + t + z) & 1;
      else if (type == QUDA_4D_PC)
        face_idx += (face_parity + t + z) & 1;

    } else if (!(arg.dc.face_Z[dim] & 1)) { // face_Z even

      int s = face_idx / arg.dc.face_XYZT[dim];
      int t = (nDim == 5 ? (face_idx / arg.dc.face_XYZ[dim]) % arg.dc.face_T[dim] : (face_idx / arg.dc.face_XYZ[dim]));
      if (type == QUDA_5D_PC)
        face_idx += (face_parity + s + t) & 1;
      else if (type == QUDA_4D_PC)
        face_idx += (face_parity + t) & 1;

    } else if (!(arg.dc.face_T[dim] & 1) && nDim == 5) { // face_T even

      int s = face_idx / arg.dc.face_XYZT[dim];
      if (type == QUDA_5D_PC)
        face_idx += (face_parity + s) & 1;
      else if (type == QUDA_4D_PC)
        face_idx += face_parity;

    } else {
      face_idx += face_parity;
    }

    // compute index into the full local volume
    int gap = arg.dc.X[dim] - nLayers;
    int idx = face_idx;
    int aux;
    switch (dim) {
    case 0:
      aux = face_idx / arg.dc.face_X[dim];
      idx += (aux + face_num) * gap;
      break;
    case 1:
      aux = face_idx / arg.dc.face_XY[dim];
      idx += (aux + face_num) * gap * arg.dc.face_X[dim];
      break;
    case 2:
      aux = face_idx / arg.dc.face_XYZ[dim];
      idx += (aux + face_num) * gap * arg.dc.face_XY[dim];
      break;
    case 3:
      aux = (nDim == 5 ? face_idx / arg.dc.face_XYZT[dim] : 0);
      idx += (aux + face_num) * gap * arg.dc.face_XYZ[dim];
      break;
    }

    // return index into the checkerboard
    return idx >> 1;
  }

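  /**
     @brief Overload of indexFromFaceIndex that uses the parity
     defined in the kernel argument struct.
  */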
  template <int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg>
  inline __device__ __host__ int indexFromFaceIndex(int face_idx, const Arg &arg)
  {
    return indexFromFaceIndex<nDim, type, dim, nLayers, face_num>(face_idx, arg.parity, arg);
  }

  /**
     @brief Compute the global checkerboard index from the face
     index, as used by the staggered halo update kernels, e.g.,
     @code
     int idx = indexFromFaceIndex<4,QUDA_4D_PC,dim,nFace,0>(ghost_idx, parity, arg);
     @endcode
  */
  template <int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg>
  static inline __device__ int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg)
  {
    const auto *X = arg.dc.X;            // grid dimension
    const auto *dims = arg.dc.dims[dim]; // dimensions of the face
    const auto &V4 = arg.dc.volume_4d;   // 4-d volume

    // intrinsic parity of the face depends on offset of first element
    int face_parity = (parity + face_num * (X[dim] - nLayers)) & 1;

    // reconstruct full face index from index into the checkerboard
    face_idx_in *= 2;

    // first compute src index, then find 4-d index from remainder
    int s = face_idx_in / arg.dc.face_XYZT[dim];
    int face_idx = face_idx_in - s * arg.dc.face_XYZT[dim];

    // y, z, t here are face indices in the new order
    int aux1 = face_idx / dims[0];
    int aux2 = aux1 / dims[1];
    int y = aux1 - aux2 * dims[1];
    int t = aux2 / dims[2];
    int z = aux2 - t * dims[2];
    face_idx += (face_parity + t + z + y) & 1;

    // compute index into the full local volume
    int gap = X[dim] - nLayers;
    int idx = face_idx;
    int aux;
    switch (dim) {
    case 0:
      aux = face_idx;
      idx += face_num * gap + aux * (X[0] - 1);
      idx += (idx / V4) * (1 - V4);
      break;
    case 1:
      aux = face_idx / arg.dc.face_X[dim];
      idx += face_num * gap * arg.dc.face_X[dim] + aux * (X[1] - 1) * arg.dc.face_X[dim];
      idx += (idx / V4) * (X[0] - V4);
      break;
    case 2:
      aux = face_idx / arg.dc.face_XY[dim];
      idx += face_num * gap * arg.dc.face_XY[dim] + aux * (X[2] - 1) * arg.dc.face_XY[dim];
      idx += (idx / V4) * ((X[1] * X[0]) - V4);
      break;
    case 3: idx += face_num * gap * arg.dc.face_XYZ[dim]; break;
    }

    // return index into the checkerboard
    return (idx + s * V4) >> 1;
  }

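  /**
     @brief Determines which face a given thread is computing. Also
     rescales face_idx so that it is relative to a given dimension,
     using the threadDimMapLower/threadDimMapUpper partition of the
     thread index space held in arg.
     @return The dimension (0-3) this thread updates
  */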
  template <int nDim = 4, typename Arg>
  __host__ __device__ inline int dimFromFaceIndex(int &face_idx, int tid, const Arg &arg)
  {
    // s - the coordinate in the fifth dimension - is the slowest-changing coordinate
    const int s = (nDim == 5 ? tid / arg.threads : 0);

    face_idx = tid - s * arg.threads; // face_idx = face_idx % arg.threads

    if (face_idx < arg.threadDimMapUpper[0]) {
      face_idx += s * arg.threadDimMapUpper[0];
      return 0;
    } else if (face_idx < arg.threadDimMapUpper[1]) {
      face_idx -= arg.threadDimMapLower[1];
      face_idx += s * (arg.threadDimMapUpper[1] - arg.threadDimMapLower[1]);
      return 1;
    } else if (face_idx < arg.threadDimMapUpper[2]) {
      face_idx -= arg.threadDimMapLower[2];
      face_idx += s * (arg.threadDimMapUpper[2] - arg.threadDimMapLower[2]);
      return 2;
    } else {
      face_idx -= arg.threadDimMapLower[3];
      face_idx += s * (arg.threadDimMapUpper[3] - arg.threadDimMapLower[3]);
      return 3;
    }
  }

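  /**
     @brief Overload of dimFromFaceIndex where the input thread index
     also serves as the face index to be rescaled.
  */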
  template <int nDim = 4, typename Arg> __host__ __device__ inline int dimFromFaceIndex(int &face_idx, const Arg &arg)
  {
    return dimFromFaceIndex<nDim>(face_idx, face_idx, arg);
  }

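  /**
     @brief Swizzler for reordering the (x) thread block indices -
     use in conjunction with swizzle-factor autotuning. When SWIZZLE
     is defined, the portion of the grid that is exactly divisible by
     the swizzle factor is transposed, spreading consecutive block
     indices across the device.
  */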
  //#define SWIZZLE
  template <typename T> __device__ inline int block_idx(const T &swizzle)
  {
#ifdef SWIZZLE
    // the portion of the grid that is exactly divisible by the number of SMs
    const int gridp = gridDim.x - gridDim.x % swizzle;

    int block_idx = blockIdx.x;
    if (blockIdx.x < gridp) {
      // this is the portion of the block that we are going to transpose
      const int i = blockIdx.x % swizzle;
      const int j = blockIdx.x / swizzle;

      // transpose the coordinates
      block_idx = i * (gridp / swizzle) + j;
    }
    return block_idx;
#else
    return blockIdx.x;
#endif
  }

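  /**
     @brief Compute the staggered phase factor at unit shift from the
     current lattice coordinates. The routine optimizes out the shift
     where possible, hence is only valid where we need to consider
     the boundary condition.
  */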
  template <typename Arg>
  __device__ __host__ inline auto StaggeredPhase(const int coords[], int dim, int dir, const Arg &arg) ->
      typename Arg::real
  {
    using real = typename Arg::real;
    constexpr auto phase = Arg::phase;
    static_assert(
        phase == QUDA_STAGGERED_PHASE_MILC || phase == QUDA_STAGGERED_PHASE_TIFR, "Unsupported staggered phase");
    real sign;

    const auto *X = arg.dim;
    if (phase == QUDA_STAGGERED_PHASE_MILC) {
      switch (dim) {
      case 0: sign = (coords[3]) % 2 == 0 ? static_cast<real>(1.0) : static_cast<real>(-1.0); break;
      case 1: sign = (coords[3] + coords[0]) % 2 == 0 ? static_cast<real>(1.0) : static_cast<real>(-1.0); break;
      case 2:
        sign = (coords[3] + coords[1] + coords[0]) % 2 == 0 ? static_cast<real>(1.0) : static_cast<real>(-1.0);
        break;
      case 3:
        sign = (coords[3] + dir >= X[3] && arg.is_last_time_slice) || (coords[3] + dir < 0 && arg.is_first_time_slice) ?
            arg.tboundary : static_cast<real>(1.0);
        break;
      default: sign = static_cast<real>(1.0);
      }
    } else if (phase == QUDA_STAGGERED_PHASE_TIFR) {
      switch (dim) {
      case 0: sign = (coords[3] + coords[2] + coords[1]) % 2 == 0 ? -1 : 1; break;
      case 1: sign = ((coords[3] + coords[2]) % 2 == 1) ? -1 : 1; break;
      case 2: sign = (coords[3] % 2 == 0) ? -1 : 1; break;
      case 3:
        sign = (coords[3] + dir >= X[3] && arg.is_last_time_slice) || (coords[3] + dir < 0 && arg.is_first_time_slice) ?
            arg.tboundary : static_cast<real>(1.0);
        break;
      default: sign = static_cast<real>(1.0);
      }
    }
    return sign;
  }

} // namespace quda