QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
pack_face_def.h
Go to the documentation of this file.
1 // The following indexing routines work for arbitrary (including odd) lattice dimensions.
2 // compute an index into the local volume from an index into the face (used by the face packing routines)
3 
4 template <int dim, int nLayers, int face_num>
5 static inline __device__ int indexFromFaceIndex(int face_idx, const int &face_volume, const int &parity)
6 {
7  // dimensions of the face (FIXME: optimize using constant cache)
8 
9  int face_X = X1, face_Y = X2, face_Z = X3; // face_T = X4;
10  switch (dim) {
11  case 0:
12  face_X = nLayers;
13  break;
14  case 1:
15  face_Y = nLayers;
16  break;
17  case 2:
18  face_Z = nLayers;
19  break;
20  case 3:
21  // face_T = nLayers;
22  break;
23  }
24  int face_XYZ = face_X * face_Y * face_Z;
25  int face_XY = face_X * face_Y;
26 
27  // intrinsic parity of the face depends on offset of first element
28 
29  int face_parity;
30  switch (dim) {
31  case 0:
32  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
33  break;
34  case 1:
35  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
36  break;
37  case 2:
38  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
39  break;
40  case 3:
41  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
42  break;
43  }
44 
45  // reconstruct full face index from index into the checkerboard
46 
47  face_idx *= 2;
48 
49  if (!(face_X & 1)) { // face_X even
50  // int t = face_idx / face_XYZ;
51  // int z = (face_idx / face_XY) % face_Z;
52  // int y = (face_idx / face_X) % face_Y;
53  // face_idx += (face_parity + t + z + y) & 1;
54  // equivalent to the above, but with fewer divisions/mods:
55  int aux1 = face_idx / face_X;
56  int aux2 = aux1 / face_Y;
57  int y = aux1 - aux2 * face_Y;
58  int t = aux2 / face_Z;
59  int z = aux2 - t * face_Z;
60  face_idx += (face_parity + t + z + y) & 1;
61  } else if (!(face_Y & 1)) { // face_Y even
62  int t = face_idx / face_XYZ;
63  int z = (face_idx / face_XY) % face_Z;
64  face_idx += (face_parity + t + z) & 1;
65  } else if (!(face_Z & 1)) { // face_Z even
66  int t = face_idx / face_XYZ;
67  face_idx += (face_parity + t) & 1;
68  } else {
69  face_idx += face_parity;
70  }
71 
72  // compute index into the full local volume
73 
74  int idx = face_idx;
75  int gap, aux;
76 
77  switch (dim) {
78  case 0:
79  gap = X1 - nLayers;
80  aux = face_idx / face_X;
81  idx += (aux + face_num) * gap;
82  break;
83  case 1:
84  gap = X2 - nLayers;
85  aux = face_idx / face_XY;
86  idx += (aux + face_num) * gap * face_X;
87  break;
88  case 2:
89  gap = X3 - nLayers;
90  aux = face_idx / face_XYZ;
91  idx += (aux + face_num) * gap * face_XY;
92  break;
93  case 3:
94  gap = X4 - nLayers;
95  idx += face_num * gap * face_XYZ;
96  break;
97  }
98 
99  // return index into the checkerboard
100 
101  return idx >> 1;
102 }
103 
104 // compute an index into the local volume from an index into the face (used by the face packing routines)
105 // G.Shi: the spinor order in ghost region is different between wilson and asqtad, thus different index
106 // computing routine.
107 template <int dim, int nLayers, int face_num>
108 static inline __device__ int indexFromFaceIndexAsqtad(int face_idx, const int &face_volume,
109  const int &parity)
110 {
111  // dimensions of the face (FIXME: optimize using constant cache)
112  int dims[3];
113  int V = 2*Vh;
114  int face_X = X1, face_Y = X2, face_Z = X3; // face_T = X4;
115  switch (dim) {
116  case 0:
117  face_X = nLayers;
118  dims[0]=X2; dims[1]=X3; dims[2]=X4;
119  break;
120  case 1:
121  face_Y = nLayers;
122  dims[0]=X1;dims[1]=X3; dims[2]=X4;
123  break;
124  case 2:
125  face_Z = nLayers;
126  dims[0]=X1;dims[1]=X2; dims[2]=X4;
127  break;
128  case 3:
129  // face_T = nLayers;
130  dims[0]=X1;dims[1]=X2; dims[2]=X4;
131  break;
132  }
133  int face_XYZ = face_X * face_Y * face_Z;
134  int face_XY = face_X * face_Y;
135 
136  // intrinsic parity of the face depends on offset of first element
137 
138  int face_parity;
139  switch (dim) {
140  case 0:
141  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
142  break;
143  case 1:
144  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
145  break;
146  case 2:
147  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
148  break;
149  case 3:
150  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
151  break;
152  }
153 
154 
155  // reconstruct full face index from index into the checkerboard
156 
157  face_idx *= 2;
158  /*y,z,t here are face indexes in new order*/
159  int aux1 = face_idx / dims[0];
160  int aux2 = aux1 / dims[1];
161  int y = aux1 - aux2 * dims[1];
162  int t = aux2 / dims[2];
163  int z = aux2 - t * dims[2];
164  face_idx += (face_parity + t + z + y) & 1;
165 
166  int idx = face_idx;
167  int gap, aux;
168 
169  switch (dim) {
170  case 0:
171  gap = X1 - nLayers;
172  aux = face_idx;
173  idx += face_num*gap + aux*(X1-1);
174  idx += idx/V*(1-V);
175  break;
176  case 1:
177  gap = X2 - nLayers;
178  aux = face_idx / face_X;
179  idx += face_num * gap * face_X + aux*(X2-1)*face_X;
180  idx += idx/V*(X1-V);
181  break;
182  case 2:
183  gap = X3 - nLayers;
184  aux = face_idx / face_XY;
185  idx += face_num * gap * face_XY +aux*(X3-1)*face_XY;
186  idx += idx/V*(X2X1-V);
187  break;
188  case 3:
189  gap = X4 - nLayers;
190  idx += face_num * gap * face_XYZ;
191  break;
192  }
193 
194  // return index into the checkerboard
195 
196  return idx >> 1;
197 }
198 
199 
200 // compute full coordinates from an index into the face (used by the exterior Dslash kernels)
201 template <int nLayers, typename Int>
202 static inline __device__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int &X, Int &Y, Int &Z, Int &T, int face_idx,
203  const int &face_volume, const int &dim, const int &face_num, const int &parity)
204 {
205  // dimensions of the face (FIXME: optimize using constant cache)
206 
207  int face_X = X1, face_Y = X2, face_Z = X3;
208  int face_parity;
209  switch (dim) {
210  case 0:
211  face_X = nLayers;
212  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
213  break;
214  case 1:
215  face_Y = nLayers;
216  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
217  break;
218  case 2:
219  face_Z = nLayers;
220  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
221  break;
222  case 3:
223  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
224  break;
225  }
226  int face_XYZ = face_X * face_Y * face_Z;
227  int face_XY = face_X * face_Y;
228 
229  // compute coordinates from (checkerboard) face index
230 
231  face_idx *= 2;
232 
233  int x, y, z, t;
234 
235  if (!(face_X & 1)) { // face_X even
236  // t = face_idx / face_XYZ;
237  // z = (face_idx / face_XY) % face_Z;
238  // y = (face_idx / face_X) % face_Y;
239  // face_idx += (face_parity + t + z + y) & 1;
240  // x = face_idx % face_X;
241  // equivalent to the above, but with fewer divisions/mods:
242  int aux1 = face_idx / face_X;
243  x = face_idx - aux1 * face_X;
244  int aux2 = aux1 / face_Y;
245  y = aux1 - aux2 * face_Y;
246  t = aux2 / face_Z;
247  z = aux2 - t * face_Z;
248  x += (face_parity + t + z + y) & 1;
249  // face_idx += (face_parity + t + z + y) & 1;
250  } else if (!(face_Y & 1)) { // face_Y even
251  t = face_idx / face_XYZ;
252  z = (face_idx / face_XY) % face_Z;
253  face_idx += (face_parity + t + z) & 1;
254  y = (face_idx / face_X) % face_Y;
255  x = face_idx % face_X;
256  } else if (!(face_Z & 1)) { // face_Z even
257  t = face_idx / face_XYZ;
258  face_idx += (face_parity + t) & 1;
259  z = (face_idx / face_XY) % face_Z;
260  y = (face_idx / face_X) % face_Y;
261  x = face_idx % face_X;
262  } else {
263  face_idx += face_parity;
264  t = face_idx / face_XYZ;
265  z = (face_idx / face_XY) % face_Z;
266  y = (face_idx / face_X) % face_Y;
267  x = face_idx % face_X;
268  }
269 
270  //printf("Local sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);
271 
272  // need to convert to global coords, not face coords
273  switch(dim) {
274  case 0:
275  x += face_num * (X1-nLayers);
276  break;
277  case 1:
278  y += face_num * (X2-nLayers);
279  break;
280  case 2:
281  z += face_num * (X3-nLayers);
282  break;
283  case 3:
284  t += face_num * (X4-nLayers);
285  break;
286  }
287 
288  // compute index into the full local volume
289 
290  idx = X1*(X2*(X3*t + z) + y) + x;
291 
292  // compute index into the checkerboard
293 
294  cb_idx = idx >> 1;
295 
296  X = x;
297  Y = y;
298  Z = z;
299  T = t;
300 
301  //printf("Global sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);
302 }
303 
304 enum IndexType {
305  EVEN_X = 0,
306  EVEN_Y = 1,
307  EVEN_Z = 2,
308  EVEN_T = 3
309 };
310 
311 // compute coordinates from index into the checkerboard (used by the interior Dslash kernels)
312 template <IndexType idxType, typename Int>
313 static __device__ __forceinline__ void coordsFromIndex(int &idx, Int &X, Int &Y, Int &Z, Int &T,
314  const int &cb_idx, const int &parity)
315 {
316  int &LX = X1;
317  int &LY = X2;
318  int &LZ = X3;
319  int &XYZ = X3X2X1;
320  int &XY = X2X1;
321 
322  idx = 2*cb_idx;
323 
324  int x, y, z, t;
325 
326  // The full field index is
327  // idx = x + y*X + z*X*Y + t*X*Y*Z
328  // The parity of lattice site (x,y,z,t)
329  // is defined to be (x+y+z+t) & 1
330  // 0 => even parity
331  // 1 => odd parity
332  // cb_idx runs over the half volume
333  // cb_idx = iidx/2 = (x + y*X + z*X*Y + t*X*Y*Z)/2
334  //
335  // We need to obtain idx from cb_idx + parity.
336  //
337  // 1) First, consider the case where X is even.
338  // Then, y*X + z*X*Y + t*X*Y*Z is even and
339  // 2*cb_idx = 2*(x/2) + y*X + z*X*Y + t*X*Y*Z
340  // Since, 2*(x/2) is even, if y+z+t is even
341  // (2*(x/2),y,z,t) is an even parity site.
342  // Similarly, if y+z+t is odd
343  // (2*(x/2),y,z,t) is an odd parity site.
344  //
345  // Note that (1+y+z+t)&1 = 1 for y+z+t even
346  // and (1+y+z+t)&1 = 0 for y+z+t odd
347  // Therefore,
348  // (2*/(x/2) + (1+y+z+t)&1, y, z, t) is odd.
349  //
350  // 2) Consider the case where X is odd but Y is even.
351  // Calculate 2*cb_idx
352  // t = 2*cb_idx/XYZ
353  // z = (2*cb_idx/XY) % Z
354  //
355  // Now, we need to compute (x,y) for different parities.
356  // To select a site with even parity, consider (z+t).
357  // If (z+t) is even, this implies that (x+y) must also
358  // be even in order that (x+y+z+t) is even.
359  // Therefore, x + y*X is even.
360  // Thus, 2*cb_idx = idx
361  // and y = (2*cb_idx/X) % Y
362  // and x = (2*cb_idx) % X;
363  //
364  // On the other hand, if (z+t) is odd, (x+y) must be
365  // also be odd in order to get overall even parity.
366  // Then x + y*X is odd (since X is odd and either x or y is odd)
367  // and 2*cb_idx = 2*(idx/2) = idx-1 = x + y*X -1 + z*X*Y + t*X*Y*Z
368  // => idx = 2*cb_idx + 1
369  // and y = ((2*cb_idx + 1)/X) % Y
370  // and x = (2*cb_idx + 1) % X
371  //
372  // To select a site with odd parity if (z+t) is even,
373  // (x+y) must be odd, which, following the discussion above, implies that
374  // y = ((2*cb_idx + 1)/X) % Y
375  // x = (2*cb_idx + 1) % X
376  // Finally, if (z+t) is odd (x+y) must be even to get overall odd parity,
377  // and
378  // y = ((2*cb_idx)/X) % Y
379  // x = (2*cb_idx) % X
380  //
381  // The code below covers these cases
382  // as well as the cases where X, Y are odd and Z is even,
383  // and X,Y,Z are all odd
384 
385  if (idxType == EVEN_X ) { // X even
386  // t = idx / XYZ;
387  // z = (idx / XY) % Z;
388  // y = (idx / X) % Y;
389  // idx += (parity + t + z + y) & 1;
390  // x = idx % X;
391  // equivalent to the above, but with fewer divisions/mods:
392  int aux1 = idx / LX;
393  x = idx - aux1 * LX;
394  int aux2 = aux1 / LY;
395  y = aux1 - aux2 * LY;
396  t = aux2 / LZ;
397  z = aux2 - t * LZ;
398  aux1 = (parity + t + z + y) & 1;
399  x += aux1;
400  idx += aux1;
401  } else if (idxType == EVEN_Y ) { // Y even
402  t = idx / XYZ;
403  z = (idx / XY) % LZ;
404  idx += (parity + t + z) & 1;
405  y = (idx / LX) % LY;
406  x = idx % LX;
407  } else if (idxType == EVEN_Z ) { // Z even
408  t = idx / XYZ;
409  idx += (parity + t) & 1;
410  z = (idx / XY) % LZ;
411  y = (idx / LX) % LY;
412  x = idx % LX;
413  } else {
414  idx += parity;
415  t = idx / XYZ;
416  z = (idx / XY) % LZ;
417  y = (idx / LX) % LY;
418  x = idx % LX;
419  }
420 
421  X = x;
422  Y = y;
423  Z = z;
424  T = t;
425 }
426 
427 // compute coordinates from index into the checkerboard (used by the interior Dslash kernels)
428 // This is the variant used byt the shared memory wilson dslash
429 template <IndexType idxType, typename Int>
430 static __device__ __forceinline__ void coordsFromIndex3D(int &idx, Int &X, Int &Y, Int &Z, Int &T,
431  int &cb_idx, const int &parity)
432 {
433  int &LX = X1;
434  int &LY = X2;
435  int &LZ = X3;
436 
437  int x, y, z, t;
438 
439  if (idxType == EVEN_X) { // X even
440  int xt = blockIdx.x*blockDim.x + threadIdx.x;
441  int aux = xt+xt;
442  t = aux / LX;
443  x = aux - t*LX;
444  y = blockIdx.y*blockDim.y + threadIdx.y;
445  z = blockIdx.z*blockDim.z + threadIdx.z;
446  x += (parity + t + z + y) &1;
447  idx = ((t*LZ + z)*LY + y)*LX + x;
448  cb_idx = idx >> 1;
449  } else {
450  // Non-even X is not (yet) supported.
451  return;
452  }
453 
454  X = x;
455  Y = y;
456  Z = z;
457  T = t;
458 }
459 
460 //Used in DW kernels only:
461 
462 template <int dim, int nLayers, int face_num>
463 static inline __device__ int indexFromDWFaceIndex(int face_idx, const int &face_volume,
464  const int &parity)
465 {
466  // dimensions of the face (FIXME: optimize using constant cache)
467 
468  //A.S.: Also used for computing offsets in physical lattice
469  //A.S.: note that in the case of DW fermions one is dealing with 4d faces
470 
471  // intrinsic parity of the face depends on offset of first element, used for MPI DW as well
472  int face_X = X1, face_Y = X2, face_Z = X3, face_T = X4;
473  int face_parity;
474 
475  switch (dim) {
476  case 0:
477  face_X = nLayers;
478  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
479  break;
480  case 1:
481  face_Y = nLayers;
482  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
483  break;
484  case 2:
485  face_Z = nLayers;
486  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
487  break;
488  case 3:
489  face_T = nLayers;
490  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
491  break;
492  }
493 
494  int face_XYZT = face_X * face_Y * face_Z * face_T;
495  int face_XYZ = face_X * face_Y * face_Z;
496  int face_XY = face_X * face_Y;
497 
498  // reconstruct full face index from index into the checkerboard
499 
500  face_idx *= 2;
501 
502  if (!(face_X & 1)) { // face_X even
503  // int s = face_idx / face_XYZT;
504  // int t = (face_idx / face_XYZ) % face_T;
505  // int z = (face_idx / face_XY) % face_Z;
506  // int y = (face_idx / face_X) % face_Y;
507  // face_idx += (face_parity + s + t + z + y) & 1;
508  // equivalent to the above, but with fewer divisions/mods:
509  int aux1 = face_idx / face_X;
510  int aux2 = aux1 / face_Y;
511  int aux3 = aux2 / face_Z;
512  int y = aux1 - aux2 * face_Y;
513  int z = aux2 - aux3 * face_Z;
514  int s = aux3 / face_T;
515  int t = aux3 - s * face_T;
516  face_idx += (face_parity + s + t + z + y) & 1;
517  } else if (!(face_Y & 1)) { // face_Y even
518  int s = face_idx / face_XYZT;
519  int t = (face_idx / face_XYZ) % face_T;
520  int z = (face_idx / face_XY) % face_Z;
521  face_idx += (face_parity + s + t + z) & 1;
522  } else if (!(face_Z & 1)) { // face_Z even
523  int s = face_idx / face_XYZT;
524  int t = (face_idx / face_XYZ) % face_T;
525  face_idx += (face_parity + s + t) & 1;
526  } else if(!(face_T)){
527  int s = face_idx / face_XYZT;
528  face_idx += (face_parity + s) & 1;
529  }else{
530  face_idx += face_parity;
531  }
532 
533  // compute index into the full local volume
534 
535  int idx = face_idx;
536  int gap, aux;
537 
538  switch (dim) {
539  case 0:
540  gap = X1 - nLayers;
541  aux = face_idx / face_X;
542  idx += (aux + face_num) * gap;
543  break;
544  case 1:
545  gap = X2 - nLayers;
546  aux = face_idx / face_XY;
547  idx += (aux + face_num) * gap * face_X;
548  break;
549  case 2:
550  gap = X3 - nLayers;
551  aux = face_idx / face_XYZ;
552  idx += (aux + face_num) * gap * face_XY;
553  break;
554  case 3:
555  gap = X4 - nLayers;
556  aux = face_idx / face_XYZT;
557  idx += (aux + face_num) * gap * face_XYZ;
558  break;
559  }
560 
561  // return index into the checkerboard
562 
563  return idx >> 1;
564 }
565 
566 
567 // compute full coordinates from an index into the face (used by the exterior Dslash kernels)
568 template <int nLayers, typename Int>
569 static inline __device__ void coordsFromDWFaceIndex(int &cb_idx, Int &X, Int &Y, Int &Z, Int &T, Int &S, int face_idx,
570  const int &face_volume, const int &dim, const int &face_num, const int &parity)
571 {
572  // dimensions of the face (FIXME: optimize using constant cache)
573 
574  int face_X = X1, face_Y = X2, face_Z = X3, face_T = X4;
575  int face_parity;
576  switch (dim) {
577  case 0:
578  face_X = nLayers;
579  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
580  break;
581  case 1:
582  face_Y = nLayers;
583  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
584  break;
585  case 2:
586  face_Z = nLayers;
587  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
588  break;
589  case 3:
590  face_T = nLayers;
591  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
592  break;
593  }
594  int face_XYZT = face_X * face_Y * face_Z * face_T;
595  int face_XYZ = face_X * face_Y * face_Z;
596  int face_XY = face_X * face_Y;
597 
598  // compute coordinates from (checkerboard) face index
599 
600  face_idx *= 2;
601 
602  int x, y, z, t, s;
603 
604  if (!(face_X & 1)) { // face_X even
605  // s = face_idx / face_XYZT;
606  // t = (face_idx / face_XYZ) % face_T;
607  // z = (face_idx / face_XY) % face_Z;
608  // y = (face_idx / face_X) % face_Y;
609  // face_idx += (face_parity + s + t + z + y) & 1;
610  // x = face_idx % face_X;
611  // equivalent to the above, but with fewer divisions/mods:
612  int aux1 = face_idx / face_X;
613  x = face_idx - aux1 * face_X;
614  int aux2 = aux1 / face_Y;
615  y = aux1 - aux2 * face_Y;
616  int aux3 = aux2 / face_Z;
617  z = aux2 - aux3 * face_Z;
618  s = aux3 / face_T;
619  t = aux3 - s * face_T;
620  x += (face_parity + s + t + z + y) & 1;
621  // face_idx += (face_parity + t + z + y) & 1;
622  } else if (!(face_Y & 1)) { // face_Y even
623  s = face_idx / face_XYZT;
624  t = (face_idx / face_XYZ) % face_T;
625  z = (face_idx / face_XY) % face_Z;
626  face_idx += (face_parity + s + t + z) & 1;
627  y = (face_idx / face_X) % face_Y;
628  x = face_idx % face_X;
629  } else if (!(face_Z & 1)) { // face_Z even
630  s = face_idx / face_XYZT;
631  t = (face_idx / face_XYZ) % face_T;
632  face_idx += (face_parity + s + t) & 1;
633  z = (face_idx / face_XY) % face_Z;
634  y = (face_idx / face_X) % face_Y;
635  x = face_idx % face_X;
636  } else {
637  s = face_idx / face_XYZT;
638  face_idx += face_parity;
639  t = (face_idx / face_XYZ) % face_T;
640  z = (face_idx / face_XY) % face_Z;
641  y = (face_idx / face_X) % face_Y;
642  x = face_idx % face_X;
643  }
644 
645  //printf("Local sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);
646 
647  // need to convert to global coords, not face coords
648  switch(dim) {
649  case 0:
650  x += face_num * (X1-nLayers);
651  break;
652  case 1:
653  y += face_num * (X2-nLayers);
654  break;
655  case 2:
656  z += face_num * (X3-nLayers);
657  break;
658  case 3:
659  t += face_num * (X4-nLayers);
660  break;
661  }
662 
663  // compute index into the checkerboard
664 
665  cb_idx = (X1*(X2*(X3*(X4*s + t) + z) + y) + x) >> 1;
666 
667  X = x;
668  Y = y;
669  Z = z;
670  T = t;
671  S = s;
672  //printf("Global sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);
673 }
674 
675 
677 template <int dim, int nLayers, int face_num>
678 static inline __device__ int indexFromNdegTMFaceIndex(int face_idx, const int &face_volume,
679  const int &parity)
680 {
681  // dimensions of the face (FIXME: optimize using constant cache)
682 
683  int face_X = X1, face_Y = X2, face_Z = X3, face_T = X4;
684  int face_parity;
685 
686  switch (dim) {
687  case 0:
688  face_X = nLayers;
689  face_parity = (parity + face_num * (X1 - nLayers)) & 1;
690  break;
691  case 1:
692  face_Y = nLayers;
693  face_parity = (parity + face_num * (X2 - nLayers)) & 1;
694  break;
695  case 2:
696  face_Z = nLayers;
697  face_parity = (parity + face_num * (X3 - nLayers)) & 1;
698  break;
699  case 3:
700  face_T = nLayers;
701  face_parity = (parity + face_num * (X4 - nLayers)) & 1;
702  break;
703  }
704 
705  int face_XYZT = face_X * face_Y * face_Z * face_T;
706  int face_XYZ = face_X * face_Y * face_Z;
707  int face_XY = face_X * face_Y;
708 
709  // reconstruct full face index from index into the checkerboard
710 
711  face_idx *= 2;
712 
713  if (!(face_X & 1)) { // face_X even
714  // int t = (face_idx / face_XYZ) % face_T;
715  // int z = (face_idx / face_XY) % face_Z;
716  // int y = (face_idx / face_X) % face_Y;
717  // face_idx += (face_parity + t + z + y) & 1;//the same parity for both flavors
718  // equivalent to the above, but with fewer divisions/mods:
719  int aux1 = face_idx / face_X;
720  int aux2 = aux1 / face_Y;
721  int aux3 = aux2 / face_Z;
722  int y = aux1 - aux2 * face_Y;
723  int z = aux2 - aux3 * face_Z;
724  int Nf = aux3 / face_T;
725  int t = aux3 - Nf * face_T;
726  face_idx += (face_parity + t + z + y) & 1;
727  } else if (!(face_Y & 1)) { // face_Y even
728  int t = (face_idx / face_XYZ) % face_T;
729  int z = (face_idx / face_XY) % face_Z;
730  face_idx += (face_parity + t + z) & 1;
731  } else if (!(face_Z & 1)) { // face_Z even
732  int t = (face_idx / face_XYZ) % face_T;
733  face_idx += (face_parity + t) & 1;
734  } else if(!(face_T)){
735  face_idx += face_parity & 1;
736  }else{
737  face_idx += face_parity;
738  }
739 
740  // compute index into the full local volume
741 
742  int idx = face_idx;
743  int gap, aux;
744 
745  switch (dim) {
746  case 0:
747  gap = X1 - nLayers;
748  aux = face_idx / face_X;
749  idx += (aux + face_num) * gap;
750  break;
751  case 1:
752  gap = X2 - nLayers;
753  aux = face_idx / face_XY;
754  idx += (aux + face_num) * gap * face_X;
755  break;
756  case 2:
757  gap = X3 - nLayers;
758  aux = face_idx / face_XYZ;
759  idx += (aux + face_num) * gap * face_XY;
760  break;
761  case 3:
762  gap = X4 - nLayers;
763  aux = face_idx / face_XYZT;
764  idx += (aux + face_num) * gap * face_XYZ;
765  break;
766  }
767 
768  // return index into the checkerboard
769 
770  return idx >> 1;
771 }
772 
773 
774 // routines for packing the ghost zones (multi-GPU only)
775 
776 #ifdef MULTI_GPU
777 
778 template <typename FloatN>
779 struct PackParam {
780 
781  FloatN *out[2*QUDA_MAX_DIM];
782  float *outNorm[2*QUDA_MAX_DIM];
783 
784  FloatN *in;
785  float *inNorm;
786 
787  int threads; // total number of threads
788 
789  // offsets which determine thread mapping to dimension
790  int threadDimMapLower[QUDA_MAX_DIM]; // lowest thread which maps to dim
791  int threadDimMapUpper[QUDA_MAX_DIM]; // greatest thread + 1 which maps to dim
792 
793  int parity;
794 #ifdef USE_TEXTURE_OBJECTS
795  cudaTextureObject_t inTex;
796  cudaTextureObject_t inTexNorm;
797 #endif
798 
799  int stride;
800 };
801 
806 template <typename Param>
807 __device__ inline int dimFromFaceIndex (int &face_idx, const Param param) {
808  if (face_idx < param.threadDimMapUpper[0]) {
809  return 0;
810  } else if (face_idx < param.threadDimMapUpper[1]) {
811  face_idx -= param.threadDimMapLower[1];
812  return 1;
813  } else if (face_idx < param.threadDimMapUpper[2]) {
814  face_idx -= param.threadDimMapLower[2];
815  return 2;
816  } else { // this is only called if we use T kernel packing
817  face_idx -= param.threadDimMapLower[3];
818  return 3;
819  }
820 }
821 
822 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
823 
824 // double precision
825 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
826 #define READ_SPINOR READ_SPINOR_DOUBLE
827 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
828 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
829 #define SPINORTEX in
830 #else
831 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
832 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
833 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
834 #ifdef USE_TEXTURE_OBJECTS
835 #define SPINORTEX param.inTex
836 #else
837 #define SPINORTEX spinorTexDouble
838 #endif
839 #endif
840 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
841 #define SPINOR_DOUBLE
842 template <int dim, int dagger, int face_num>
843 static inline __device__ void packFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
844  const float *inNorm, const int &idx,
845  const int &face_idx, const int &face_volume,
846  PackParam<double2> &param)
847 {
848 #if (__COMPUTE_CAPABILITY__ >= 130)
849  if (dagger) {
851  } else {
852 #include "wilson_pack_face_core.h"
853  }
854 #endif // (__COMPUTE_CAPABILITY__ >= 130)
855 }
856 #undef READ_SPINOR
857 #undef READ_SPINOR_UP
858 #undef READ_SPINOR_DOWN
859 #undef SPINORTEX
860 #undef WRITE_HALF_SPINOR
861 #undef SPINOR_DOUBLE
862 
863 
864 // single precision
865 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
866 #define READ_SPINOR READ_SPINOR_SINGLE
867 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
868 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
869 #define SPINORTEX in
870 #else
871 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
872 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
873 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
874 #ifdef USE_TEXTURE_OBJECTS
875 #define SPINORTEX param.inTex
876 #else
877 #define SPINORTEX spinorTexSingle
878 #endif
879 #endif
880 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
881 template <int dim, int dagger, int face_num>
882 static inline __device__ void packFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
883  const int &idx, const int &face_idx,
884  const int &face_volume,
885  const PackParam<float4> &param)
886 {
887  if (dagger) {
889  } else {
890 #include "wilson_pack_face_core.h"
891  }
892 }
893 #undef READ_SPINOR
894 #undef READ_SPINOR_UP
895 #undef READ_SPINOR_DOWN
896 #undef SPINORTEX
897 #undef WRITE_HALF_SPINOR
898 
899 
900 // half precision
901 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
902 #define READ_SPINOR READ_SPINOR_HALF
903 #define READ_SPINOR_UP READ_SPINOR_HALF_UP
904 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
905 #define SPINORTEX in
906 #else
907 #define READ_SPINOR READ_SPINOR_HALF_TEX
908 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
909 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
910 #ifdef USE_TEXTURE_OBJECTS
911 #define SPINORTEX param.inTex
912 #else
913 #define SPINORTEX spinorTexHalf
914 #endif
915 #endif
916 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
917 template <int dim, int dagger, int face_num>
918 static inline __device__ void packFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
919  const int &idx, const int &face_idx,
920  const int &face_volume,
921  const PackParam<short4> &param)
922 {
923  if (dagger) {
925  } else {
926 #include "wilson_pack_face_core.h"
927  }
928 }
929 #undef READ_SPINOR
930 #undef READ_SPINOR_UP
931 #undef READ_SPINOR_DOWN
932 #undef SPINORTEX
933 #undef WRITE_HALF_SPINOR
934 
935 template <int dagger, typename FloatN>
936 __global__ void packFaceWilsonKernel(PackParam<FloatN> param)
937 {
938  const int nFace = 1; // 1 face for Wilson
939 
940  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
941  if (face_idx >= param.threads) return;
942 
943  // determine which dimension we are packing
944  const int dim = dimFromFaceIndex(face_idx, param);
945 
946  // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
947  const int face_num = (face_idx >= nFace*ghostFace[dim]) ? 1 : 0;
948  face_idx -= face_num*nFace*ghostFace[dim];
949 
950  // compute where the output is located
951  // compute an index into the local volume from the index into the face
952  // read spinor, spin-project, and write half spinor to face
953  if (dim == 0) {
954  if (face_num == 0) {
955  const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,ghostFace[0],param.parity);
956  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
957  param.inNorm,idx, face_idx, ghostFace[0], param);
958  } else {
959  const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,ghostFace[0],param.parity);
960  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
961  param.inNorm,idx, face_idx, ghostFace[0], param);
962  }
963  } else if (dim == 1) {
964  if (face_num == 0) {
965  const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,ghostFace[1],param.parity);
966  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
967  param.inNorm,idx, face_idx, ghostFace[1], param);
968  } else {
969  const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,ghostFace[1],param.parity);
970  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
971  param.inNorm,idx, face_idx, ghostFace[1], param);
972  }
973  } else if (dim == 2) {
974  if (face_num == 0) {
975  const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,ghostFace[2],param.parity);
976  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
977  param.inNorm,idx, face_idx, ghostFace[2], param);
978  } else {
979  const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,ghostFace[2],param.parity);
980  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
981  param.inNorm,idx, face_idx, ghostFace[2], param);
982  }
983  } else {
984  if (face_num == 0) {
985  const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,ghostFace[3],param.parity);
986  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
987  param.inNorm,idx, face_idx, ghostFace[3], param);
988  } else {
989  const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,ghostFace[3],param.parity);
990  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
991  param.inNorm,idx, face_idx, ghostFace[3], param);
992  }
993  }
994 
995 }
996 
997 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC
998 
999 
1000 
1001 template <typename FloatN>
1002 class PackFace : public Tunable {
1003 
1004  protected:
1005  FloatN *faces;
1006  const cudaColorSpinorField *in;
1007  const int dagger;
1008  const int parity;
1009  const int nFace;
1010 
1011  // compute how many threads we need in total for the face packing
1012  unsigned int threads() const {
1013  unsigned int threads = 0;
1014  for (int i=0; i<4; i++) {
1015  if (!dslashParam.commDim[i]) continue;
1016  if (i==3 && !kernelPackT) continue;
1017  threads += 2*nFace*in->GhostFace()[i]; // 2 for forwards and backwards faces
1018  }
1019  return threads;
1020  }
1021 
1022  virtual int inputPerSite() const = 0;
1023  virtual int outputPerSite() const = 0;
1024 
1025  // prepare the param struct with kernel arguments
1026  PackParam<FloatN> prepareParam() {
1027  PackParam<FloatN> param;
1028  param.in = (FloatN*)in->V();
1029  param.inNorm = (float*)in->Norm();
1030  param.parity = parity;
1031 #ifdef USE_TEXTURE_OBJECTS
1032  param.inTex = in->Tex();
1033  param.inTexNorm = in->TexNorm();
1034 #endif
1035 
1036  param.threads = threads();
1037  param.stride = in->Stride();
1038 
1039  int prev = -1; // previous dimension that was partitioned
1040  for (int i=0; i<4; i++) {
1041  param.threadDimMapLower[i] = 0;
1042  param.threadDimMapUpper[i] = 0;
1043  if (!dslashParam.commDim[i]) continue;
1044  param.threadDimMapLower[i] = (prev>=0 ? param.threadDimMapUpper[prev] : 0);
1045  param.threadDimMapUpper[i] = param.threadDimMapLower[i] + 2*nFace*in->GhostFace()[i];
1046 
1047  size_t faceBytes = nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x);
1048 
1049  if (typeid(FloatN) == typeid(short4) || typeid(FloatN) == typeid(short2)) {
1050  faceBytes += nFace*in->GhostFace()[i]*sizeof(float);
1051  param.out[2*i] = (FloatN*)((char*)faces +
1052  (outputPerSite()*sizeof(faces->x) + sizeof(float))*param.threadDimMapLower[i]);
1053  param.outNorm[2*i] = (float*)((char*)param.out[2*i] +
1054  nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x));
1055  } else {
1056  param.out[2*i] = (FloatN*)((char*)faces+outputPerSite()*sizeof(faces->x)*param.threadDimMapLower[i]);
1057  }
1058 
1059  param.out[2*i+1] = (FloatN*)((char*)param.out[2*i] + faceBytes);
1060  param.outNorm[2*i+1] = (float*)((char*)param.outNorm[2*i] + faceBytes);
1061 
1062  prev=i;
1063 
1064  //printf("%d: map=%d %d out=%llu %llu outNorm=%llu %llu bytes=%d\n",
1065  // i,param.threadDimMapLower[i], param.threadDimMapUpper[i],
1066  // param.out[2*i], param.out[2*i+1], param.outNorm[2*i], param.outNorm[2*i+1], faceBytes);
1067  }
1068 
1069  return param;
1070  }
1071 
1072  int sharedBytesPerThread() const { return 0; }
1073  int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
1074 
1075  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
1076  bool advanceBlockDim(TuneParam &param) const {
1077  bool advance = Tunable::advanceBlockDim(param);
1078  if (advance) param.grid = dim3( (threads()+param.block.x-1) / param.block.x, 1, 1);
1079  return advance;
1080  }
1081 
1082  public:
1083  PackFace(FloatN *faces, const cudaColorSpinorField *in,
1084  const int dagger, const int parity, const int nFace)
1085  : faces(faces), in(in), dagger(dagger), parity(parity), nFace(nFace) { }
1086  virtual ~PackFace() { }
1087 
1088  virtual int tuningIter() const { return 100; }
1089 
1090  virtual TuneKey tuneKey() const {
1091  std::stringstream vol, aux;
1092  vol << in->X()[0] << "x";
1093  vol << in->X()[1] << "x";
1094  vol << in->X()[2] << "x";
1095  vol << in->X()[3];
1096  aux << "threads=" <<threads() << ",stride=" << in->Stride() << ",prec=" << sizeof(((FloatN*)0)->x);
1097  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1098  }
1099 
1100  virtual void apply(const cudaStream_t &stream) = 0;
1101 
1102  virtual void initTuneParam(TuneParam &param) const
1103  {
1104  Tunable::initTuneParam(param);
1105  param.grid = dim3( (threads()+param.block.x-1) / param.block.x, 1, 1);
1106  }
1107 
1109  virtual void defaultTuneParam(TuneParam &param) const
1110  {
1111  Tunable::defaultTuneParam(param);
1112  param.grid = dim3( (threads()+param.block.x-1) / param.block.x, 1, 1);
1113  }
1114 
1115  long long bytes() const {
1116  size_t faceBytes = (inputPerSite() + outputPerSite())*this->threads()*sizeof(((FloatN*)0)->x);
1117  if (sizeof(((FloatN*)0)->x) == QUDA_HALF_PRECISION)
1118  faceBytes += 2*this->threads()*sizeof(float); // 2 is from input and output
1119  return faceBytes;
1120  }
1121 };
1122 
1123 template <typename FloatN>
1124 class PackFaceWilson : public PackFace<FloatN> {
1125 
1126  private:
1127 
1128  int inputPerSite() const { return 24; } // input is full spinor
1129  int outputPerSite() const { return 12; } // output is spin projected
1130 
1131  public:
1132  PackFaceWilson(FloatN *faces, const cudaColorSpinorField *in,
1133  const int dagger, const int parity)
1134  : PackFace<FloatN>(faces, in, dagger, parity, 1) { }
1135  virtual ~PackFaceWilson() { }
1136 
1137  void apply(const cudaStream_t &stream) {
1138  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1139 
1140 #ifdef GPU_WILSON_DIRAC
1141  PackParam<FloatN> param = this->prepareParam();
1142  if (this->dagger) {
1143  packFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1144  } else {
1145  packFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1146  }
1147 #else
1148  errorQuda("Wilson face packing kernel is not built");
1149 #endif
1150  }
1151 
1152  long long flops() const { return outputPerSite()*this->threads(); }
1153 };
1154 
1155 void packFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1156  const int parity, const cudaStream_t &stream) {
1157 
1158  switch(in.Precision()) {
1159  case QUDA_DOUBLE_PRECISION:
1160  {
1161  PackFaceWilson<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1162  pack.apply(stream);
1163  }
1164  break;
1165  case QUDA_SINGLE_PRECISION:
1166  {
1167  PackFaceWilson<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1168  pack.apply(stream);
1169  }
1170  break;
1171  case QUDA_HALF_PRECISION:
1172  {
1173  PackFaceWilson<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1174  pack.apply(stream);
1175  }
1176  break;
1177  }
1178 }
1179 
1180 #ifdef GPU_STAGGERED_DIRAC
1181 
1182 #ifdef USE_TEXTURE_OBJECTS
1183 #define SPINORTEXDOUBLE param.inTex
1184 #define SPINORTEXSINGLE param.inTex
1185 #define SPINORTEXHALF param.inTex
1186 #define SPINORTEXHALFNORM param.inTexNorm
1187 #else
1188 #define SPINORTEXDOUBLE spinorTexDouble
1189 #define SPINORTEXSINGLE spinorTexSingle2
1190 #define SPINORTEXHALF spinorTexHalf2
1191 #define SPINORTEXHALFNORM spinorTexHalf2Norm
1192 #endif
1193 
1194 #if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
1195 template <typename Float2>
1196 __device__ void packFaceAsqtadCore(Float2 *out, float *outNorm, const int out_idx,
1197  const int out_stride, const Float2 *in, const float *inNorm,
1198  const int in_idx, const PackParam<double2> &param) {
1199  out[out_idx + 0*out_stride] = in[in_idx + 0*param.stride];
1200  out[out_idx + 1*out_stride] = in[in_idx + 1*param.stride];
1201  out[out_idx + 2*out_stride] = in[in_idx + 2*param.stride];
1202 }
1203 template<>
1204 __device__ void packFaceAsqtadCore(short2 *out, float *outNorm, const int out_idx,
1205  const int out_stride, const short2 *in, const float *inNorm,
1206  const int in_idx, const PackParam<double2> &param) {
1207  out[out_idx + 0*out_stride] = in[in_idx + 0*param.stride];
1208  out[out_idx + 1*out_stride] = in[in_idx + 1*param.stride];
1209  out[out_idx + 2*out_stride] = in[in_idx + 2*param.stride];
1210  outNorm[out_idx] = inNorm[in_idx];
1211 }
1212 #else
1213 __device__ void packFaceAsqtadCore(double2 *out, float *outNorm, const int out_idx,
1214  const int out_stride, const double2 *in, const float *inNorm,
1215  const int in_idx, const PackParam<double2> &param) {
1216  out[out_idx + 0*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 0*param.stride);
1217  out[out_idx + 1*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 1*param.stride);
1218  out[out_idx + 2*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 2*param.stride);
1219 }
1220 __device__ void packFaceAsqtadCore(float2 *out, float *outNorm, const int out_idx,
1221  const int out_stride, const float2 *in,
1222  const float *inNorm, const int in_idx,
1223  const PackParam<float2> &param) {
1224  out[out_idx + 0*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*param.stride);
1225  out[out_idx + 1*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*param.stride);
1226  out[out_idx + 2*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*param.stride);
1227 }
1228 
1229 // this is rather dumb: undoing the texture load because cudaNormalizedReadMode is used
1230 // should really bind to an appropriate texture instead of reusing
1231 static inline __device__ short2 float22short2(float c, float2 a) {
1232  return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
1233 }
1234 
1235 __device__ void packFaceAsqtadCore(short2 *out, float *outNorm, const int out_idx,
1236  const int out_stride, const short2 *in,
1237  const float *inNorm, const int in_idx,
1238  const PackParam<short2> &param) {
1239  out[out_idx + 0*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+0*param.stride));
1240  out[out_idx + 1*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+1*param.stride));
1241  out[out_idx + 2*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+2*param.stride));
1242  outNorm[out_idx] = TEX1DFETCH(float, SPINORTEXHALFNORM, in_idx);
1243 }
1244 #endif
1245 
1246 template <typename FloatN>
1247 __global__ void packFaceAsqtadKernel(PackParam<FloatN> param)
1248 {
1249  const int nFace = 3; //3 faces for asqtad
1250 
1251  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1252  if (face_idx >= param.threads) return;
1253 
1254  // determine which dimension we are packing
1255  const int dim = dimFromFaceIndex(face_idx, param);
1256 
1257  // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
1258  const int face_num = (face_idx >= nFace*ghostFace[dim]) ? 1 : 0;
1259  face_idx -= face_num*nFace*ghostFace[dim];
1260 
1261  // compute where the output is located
1262  // compute an index into the local volume from the index into the face
1263  // read spinor, spin-project, and write half spinor to face
1264  if (dim == 0) {
1265  if (face_num == 0) {
1266  const int idx = indexFromFaceIndexAsqtad<0,nFace,0>(face_idx,ghostFace[0],param.parity);
1267  packFaceAsqtadCore(param.out[0], param.outNorm[0], face_idx,
1268  nFace*ghostFace[0], param.in, param.inNorm, idx, param);
1269  } else {
1270  const int idx = indexFromFaceIndexAsqtad<0,nFace,1>(face_idx,ghostFace[0],param.parity);
1271  packFaceAsqtadCore(param.out[1], param.outNorm[1], face_idx,
1272  nFace*ghostFace[0], param.in, param.inNorm, idx, param);
1273  }
1274  } else if (dim == 1) {
1275  if (face_num == 0) {
1276  const int idx = indexFromFaceIndexAsqtad<1,nFace,0>(face_idx,ghostFace[1],param.parity);
1277  packFaceAsqtadCore(param.out[2], param.outNorm[2], face_idx,
1278  nFace*ghostFace[1], param.in, param.inNorm, idx, param);
1279  } else {
1280  const int idx = indexFromFaceIndexAsqtad<1,nFace,1>(face_idx,ghostFace[1],param.parity);
1281  packFaceAsqtadCore(param.out[3], param.outNorm[3], face_idx,
1282  nFace*ghostFace[1], param.in, param.inNorm, idx, param);
1283  }
1284  } else if (dim == 2) {
1285  if (face_num == 0) {
1286  const int idx = indexFromFaceIndexAsqtad<2,nFace,0>(face_idx,ghostFace[2],param.parity);
1287  packFaceAsqtadCore(param.out[4], param.outNorm[4], face_idx,
1288  nFace*ghostFace[2], param.in, param.inNorm, idx, param);
1289  } else {
1290  const int idx = indexFromFaceIndexAsqtad<2,nFace,1>(face_idx,ghostFace[2],param.parity);
1291  packFaceAsqtadCore(param.out[5], param.outNorm[5], face_idx,
1292  nFace*ghostFace[2], param.in, param.inNorm, idx, param);
1293  }
1294  } else {
1295  if (face_num == 0) {
1296  const int idx = indexFromFaceIndexAsqtad<3,nFace,0>(face_idx,ghostFace[3],param.parity);
1297  packFaceAsqtadCore(param.out[6], param.outNorm[6], face_idx,
1298  nFace*ghostFace[3], param.in, param.inNorm,idx, param);
1299  } else {
1300  const int idx = indexFromFaceIndexAsqtad<3,nFace,1>(face_idx,ghostFace[3],param.parity);
1301  packFaceAsqtadCore(param.out[7], param.outNorm[7], face_idx,
1302  nFace*ghostFace[3], param.in, param.inNorm, idx, param);
1303  }
1304  }
1305 
1306 }
1307 
1308 #undef SPINORTEXDOUBLE
1309 #undef SPINORTEXSINGLE
1310 #undef SPINORTEXHALF
1311 
1312 #endif // GPU_STAGGERED_DIRAC
1313 
1314 
1315 template <typename FloatN>
1316 class PackFaceAsqtad : public PackFace<FloatN> {
1317 
1318  private:
1319 
1320  int inputPerSite() const { return 6; } // input is full spinor
1321  int outputPerSite() const { return 6; } // output is full spinor
1322 
1323  public:
1324  PackFaceAsqtad(FloatN *faces, const cudaColorSpinorField *in,
1325  const int dagger, const int parity)
1326  : PackFace<FloatN>(faces, in, dagger, parity, 3) { }
1327  virtual ~PackFaceAsqtad() { }
1328 
1329  void apply(const cudaStream_t &stream) {
1330  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1331 
1332 #ifdef GPU_STAGGERED_DIRAC
1333  PackParam<FloatN> param = this->prepareParam();
1334  packFaceAsqtadKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1335 #else
1336  errorQuda("Asqtad face packing kernel is not built");
1337 #endif
1338  }
1339 
1340  long long flops() const { return 0; }
1341 };
1342 
1343 void packFaceAsqtad(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1344  const int parity, const cudaStream_t &stream) {
1345 
1346  switch(in.Precision()) {
1347  case QUDA_DOUBLE_PRECISION:
1348  {
1349  PackFaceAsqtad<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1350  pack.apply(stream);
1351  }
1352  break;
1353  case QUDA_SINGLE_PRECISION:
1354  {
1355  PackFaceAsqtad<float2> pack((float2*)ghost_buf, &in, dagger, parity);
1356  pack.apply(stream);
1357  }
1358  break;
1359  case QUDA_HALF_PRECISION:
1360  {
1361  PackFaceAsqtad<short2> pack((short2*)ghost_buf, &in, dagger, parity);
1362  pack.apply(stream);
1363  }
1364  break;
1365  }
1366 
1367 }
1368 
1369 #ifdef GPU_DOMAIN_WALL_DIRAC
1370 template <int dagger, typename FloatN>
1371 __global__ void packFaceDWKernel(PackParam<FloatN> param)
1372 {
1373  const int nFace = 1; // 1 face for dwf
1374 
1375  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1376  if (face_idx >= param.threads) return;
1377 
1378  // determine which dimension we are packing
1379  const int dim = dimFromFaceIndex(face_idx, param);
1380 
1381  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
1382  // FIXME these ghostFace constants do not incude the Ls dimension
1383  const int face_num = (face_idx >= nFace*Ls*ghostFace[dim]) ? 1 : 0;
1384  face_idx -= face_num*nFace*Ls*ghostFace[dim];
1385 
1386  // compute where the output is located
1387  // compute an index into the local volume from the index into the face
1388  // read spinor, spin-project, and write half spinor to face
1389  if (dim == 0) {
1390  if (face_num == 0) {
1391  const int idx = indexFromDWFaceIndex<0,nFace,0>(face_idx,Ls*ghostFace[0],param.parity);
1392  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1393  param.inNorm, idx, face_idx, Ls*ghostFace[0], param);
1394  } else {
1395  const int idx = indexFromDWFaceIndex<0,nFace,1>(face_idx,Ls*ghostFace[0],param.parity);
1396  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1397  param.inNorm, idx, face_idx, Ls*ghostFace[0], param);
1398  }
1399  } else if (dim == 1) {
1400  if (face_num == 0) {
1401  const int idx = indexFromDWFaceIndex<1,nFace,0>(face_idx,Ls*ghostFace[1],param.parity);
1402  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1403  param.inNorm, idx, face_idx, Ls*ghostFace[1], param);
1404  } else {
1405  const int idx = indexFromDWFaceIndex<1,nFace,1>(face_idx,Ls*ghostFace[1],param.parity);
1406  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1407  param.inNorm, idx, face_idx, Ls*ghostFace[1], param);
1408  }
1409  } else if (dim == 2) {
1410  if (face_num == 0) {
1411  const int idx = indexFromDWFaceIndex<2,nFace,0>(face_idx,Ls*ghostFace[2],param.parity);
1412  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1413  param.inNorm, idx, face_idx, Ls*ghostFace[2], param);
1414  } else {
1415  const int idx = indexFromDWFaceIndex<2,nFace,1>(face_idx,Ls*ghostFace[2],param.parity);
1416  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1417  param.inNorm, idx, face_idx, Ls*ghostFace[2], param);
1418  }
1419  } else {
1420  if (face_num == 0) {
1421  const int idx = indexFromDWFaceIndex<3,nFace,0>(face_idx,Ls*ghostFace[3],param.parity);
1422  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1423  param.inNorm, idx, face_idx, Ls*ghostFace[3], param);
1424  } else {
1425  const int idx = indexFromDWFaceIndex<3,nFace,1>(face_idx,Ls*ghostFace[3],param.parity);
1426  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1427  param.inNorm, idx, face_idx, Ls*ghostFace[3], param);
1428  }
1429  }
1430 }
1431 #endif
1432 
1433 template <typename FloatN>
1434 class PackFaceDW : public PackFace<FloatN> {
1435 
1436  private:
1437 
1438  int inputPerSite() const { return 24; } // input is full spinor
1439  int outputPerSite() const { return 12; } // output is spin projected
1440 
1441  public:
1442  PackFaceDW(FloatN *faces, const cudaColorSpinorField *in,
1443  const int dagger, const int parity)
1444  : PackFace<FloatN>(faces, in, dagger, parity, 1) { }
1445  virtual ~PackFaceDW() { }
1446 
1447  void apply(const cudaStream_t &stream) {
1448  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1449 
1450 #ifdef GPU_DOMAIN_WALL_DIRAC
1451  PackParam<FloatN> param = this->prepareParam();
1452  if (this->dagger) {
1453  packFaceDWKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1454  } else {
1455  packFaceDWKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1456  }
1457 #else
1458  errorQuda("DW face packing kernel is not built");
1459 #endif
1460  }
1461 
1462  long long flops() const { return outputPerSite()*this->threads(); }
1463 };
1464 
1465 void packFaceDW(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1466  const int parity, const cudaStream_t &stream) {
1467 
1468  switch(in.Precision()) {
1469  case QUDA_DOUBLE_PRECISION:
1470  {
1471  PackFaceDW<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1472  pack.apply(stream);
1473  }
1474  break;
1475  case QUDA_SINGLE_PRECISION:
1476  {
1477  PackFaceDW<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1478  pack.apply(stream);
1479  }
1480  break;
1481  case QUDA_HALF_PRECISION:
1482  {
1483  PackFaceDW<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1484  pack.apply(stream);
1485  }
1486  break;
1487  }
1488 }
1489 
1490 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1491 template <int dagger, typename FloatN>
1492 __global__ void packFaceNdegTMKernel(PackParam<FloatN> param)
1493 {
1494  const int nFace = 1; // 1 face for Wilson
1495  const int Nf = 2;
1496 
1497  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1498  if (face_idx >= param.threads) return;
1499 
1500  // determine which dimension we are packing
1501  const int dim = dimFromFaceIndex(face_idx, param);
1502 
1503  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
1504  // FIXME these ghostFace constants do not include the Nf dimension
1505  const int face_num = (face_idx >= nFace*Nf*ghostFace[dim]) ? 1 : 0;
1506  face_idx -= face_num*nFace*Nf*ghostFace[dim];
1507 
1508  // compute where the output is located
1509  // compute an index into the local volume from the index into the face
1510  // read spinor, spin-project, and write half spinor to face
1511  if (dim == 0) {
1512  if (face_num == 0) {
1513  const int idx = indexFromNdegTMFaceIndex<0,nFace,0>(face_idx,Nf*ghostFace[0],param.parity);
1514  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1515  param.inNorm, idx, face_idx, Nf*ghostFace[0], param);
1516  } else {
1517  const int idx = indexFromNdegTMFaceIndex<0,nFace,1>(face_idx,Nf*ghostFace[0],param.parity);
1518  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1519  param.inNorm, idx, face_idx, Nf*ghostFace[0], param);
1520  }
1521  } else if (dim == 1) {
1522  if (face_num == 0) {
1523  const int idx = indexFromNdegTMFaceIndex<1,nFace,0>(face_idx,Nf*ghostFace[1],param.parity);
1524  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1525  param.inNorm, idx, face_idx, Nf*ghostFace[1], param);
1526  } else {
1527  const int idx = indexFromNdegTMFaceIndex<1,nFace,1>(face_idx,Nf*ghostFace[1],param.parity);
1528  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1529  param.inNorm, idx, face_idx, Nf*ghostFace[1], param);
1530  }
1531  } else if (dim == 2) {
1532  if (face_num == 0) {
1533  const int idx = indexFromNdegTMFaceIndex<2,nFace,0>(face_idx,Nf*ghostFace[2],param.parity);
1534  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1535  param.inNorm, idx, face_idx, Nf*ghostFace[2], param);
1536  } else {
1537  const int idx = indexFromNdegTMFaceIndex<2,nFace,1>(face_idx,Nf*ghostFace[2],param.parity);
1538  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1539  param.inNorm, idx, face_idx, Nf*ghostFace[2], param);
1540  }
1541  } else {
1542  if (face_num == 0) {
1543  const int idx = indexFromNdegTMFaceIndex<3,nFace,0>(face_idx,Nf*ghostFace[3],param.parity);
1544  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1545  param.inNorm, idx, face_idx, Nf*ghostFace[3], param);
1546  } else {
1547  const int idx = indexFromNdegTMFaceIndex<3,nFace,1>(face_idx,Nf*ghostFace[3],param.parity);
1548  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1549  param.inNorm, idx, face_idx, Nf*ghostFace[3], param);
1550  }
1551  }
1552 }
1553 #endif
1554 
1555 template <typename FloatN>
1556 class PackFaceNdegTM : public PackFace<FloatN> {
1557 
1558  private:
1559 
1560  int inputPerSite() const { return 24; } // input is full spinor
1561  int outputPerSite() const { return 12; } // output is spin projected
1562 
1563  public:
1564  PackFaceNdegTM(FloatN *faces, const cudaColorSpinorField *in,
1565  const int dagger, const int parity)
1566  : PackFace<FloatN>(faces, in, dagger, parity, 1) { }
1567  virtual ~PackFaceNdegTM() { }
1568 
1569  void apply(const cudaStream_t &stream) {
1570  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1571 
1572 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1573  PackParam<FloatN> param = this->prepareParam();
1574  if (this->dagger) {
1575  packFaceNdegTMKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1576  } else {
1577  packFaceNdegTMKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1578  }
1579 #else
1580  errorQuda("Non-degenerate twisted mass face packing kernel is not built");
1581 #endif
1582  }
1583 
1584  long long flops() const { return outputPerSite()*this->threads(); }
1585 
1586 };
1587 
1588 void packFaceNdegTM(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1589  const int parity, const cudaStream_t &stream) {
1590 
1591  switch(in.Precision()) {
1592  case QUDA_DOUBLE_PRECISION:
1593  {
1594  PackFaceNdegTM<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1595  pack.apply(stream);
1596  }
1597  break;
1598  case QUDA_SINGLE_PRECISION:
1599  {
1600  PackFaceNdegTM<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1601  pack.apply(stream);
1602  }
1603  break;
1604  case QUDA_HALF_PRECISION:
1605  {
1606  PackFaceNdegTM<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1607  pack.apply(stream);
1608  }
1609  break;
1610  }
1611 }
1612 
1613 void packFace(void *ghost_buf, cudaColorSpinorField &in, const int dagger, const int parity, const cudaStream_t &stream)
1614 {
1615  int nDimPack = 0;
1616  for (int dim=0; dim<4; dim++) {
1617  if(!dslashParam.commDim[dim]) continue;
1618  if (dim != 3 || getKernelPackT()) nDimPack++;
1619  }
1620  if (!nDimPack) return; // if zero then we have nothing to pack
1621 
1622  // Need to update this logic for other multi-src dslash packing
1623  if (in.Nspin() == 1) {
1624  packFaceAsqtad(ghost_buf, in, dagger, parity, stream);
1625  } else if (in.Ndim() == 5) {
1626  if(in.TwistFlavor() == QUDA_TWIST_INVALID) {
1627  packFaceDW(ghost_buf, in, dagger, parity, stream);
1628  } else {
1629  packFaceNdegTM(ghost_buf, in, dagger, parity, stream);
1630  }
1631  } else {
1632  packFaceWilson(ghost_buf, in, dagger, parity, stream);
1633  }
1634 }
1635 
1636 #endif // MULTI_GPU
1637