4 template <
int dim,
int nLayers,
int face_num>
5 static inline __device__
int indexFromFaceIndex(
int face_idx,
const int &face_volume,
const int &
parity)
9 int face_X =
X1, face_Y =
X2, face_Z =
X3;
24 int face_XYZ = face_X * face_Y * face_Z;
25 int face_XY = face_X * face_Y;
32 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
35 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
38 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
41 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
55 int aux1 = face_idx / face_X;
56 int aux2 = aux1 / face_Y;
57 int y = aux1 - aux2 * face_Y;
58 int t = aux2 / face_Z;
59 int z = aux2 - t * face_Z;
60 face_idx += (face_parity + t + z + y) & 1;
61 }
else if (!(face_Y & 1)) {
62 int t = face_idx / face_XYZ;
63 int z = (face_idx / face_XY) % face_Z;
64 face_idx += (face_parity + t + z) & 1;
65 }
else if (!(face_Z & 1)) {
66 int t = face_idx / face_XYZ;
67 face_idx += (face_parity + t) & 1;
69 face_idx += face_parity;
80 aux = face_idx / face_X;
81 idx += (aux + face_num) * gap;
85 aux = face_idx / face_XY;
86 idx += (aux + face_num) * gap * face_X;
90 aux = face_idx / face_XYZ;
91 idx += (aux + face_num) * gap * face_XY;
95 idx += face_num * gap * face_XYZ;
107 template <
int dim,
int nLayers,
int face_num>
108 static inline __device__
int indexFromFaceIndexAsqtad(
int face_idx,
const int &face_volume,
114 int face_X =
X1, face_Y =
X2, face_Z =
X3;
118 dims[0]=
X2; dims[1]=
X3; dims[2]=
X4;
122 dims[0]=
X1;dims[1]=
X3; dims[2]=
X4;
126 dims[0]=
X1;dims[1]=
X2; dims[2]=
X4;
130 dims[0]=
X1;dims[1]=
X2; dims[2]=
X4;
133 int face_XYZ = face_X * face_Y * face_Z;
134 int face_XY = face_X * face_Y;
141 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
144 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
147 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
150 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
159 int aux1 = face_idx / dims[0];
160 int aux2 = aux1 / dims[1];
161 int y = aux1 - aux2 * dims[1];
162 int t = aux2 / dims[2];
163 int z = aux2 - t * dims[2];
164 face_idx += (face_parity + t + z + y) & 1;
173 idx += face_num*gap + aux*(
X1-1);
178 aux = face_idx / face_X;
179 idx += face_num * gap * face_X + aux*(
X2-1)*face_X;
184 aux = face_idx / face_XY;
185 idx += face_num * gap * face_XY +aux*(
X3-1)*face_XY;
186 idx += idx/V*(
X2X1-
V);
190 idx += face_num * gap * face_XYZ;
201 template <
int nLayers,
typename Int>
202 static inline __device__
void coordsFromFaceIndex(
int &idx,
int &cb_idx, Int &
X, Int &Y, Int &
Z, Int &T,
int face_idx,
203 const int &face_volume,
const int &dim,
const int &face_num,
const int &parity)
207 int face_X =
X1, face_Y =
X2, face_Z =
X3;
212 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
216 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
220 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
223 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
226 int face_XYZ = face_X * face_Y * face_Z;
227 int face_XY = face_X * face_Y;
242 int aux1 = face_idx / face_X;
243 x = face_idx - aux1 * face_X;
244 int aux2 = aux1 / face_Y;
245 y = aux1 - aux2 * face_Y;
247 z = aux2 - t * face_Z;
248 x += (face_parity + t + z + y) & 1;
250 }
else if (!(face_Y & 1)) {
251 t = face_idx / face_XYZ;
252 z = (face_idx / face_XY) % face_Z;
253 face_idx += (face_parity + t + z) & 1;
254 y = (face_idx / face_X) % face_Y;
255 x = face_idx % face_X;
256 }
else if (!(face_Z & 1)) {
257 t = face_idx / face_XYZ;
258 face_idx += (face_parity + t) & 1;
259 z = (face_idx / face_XY) % face_Z;
260 y = (face_idx / face_X) % face_Y;
261 x = face_idx % face_X;
263 face_idx += face_parity;
264 t = face_idx / face_XYZ;
265 z = (face_idx / face_XY) % face_Z;
266 y = (face_idx / face_X) % face_Y;
267 x = face_idx % face_X;
275 x += face_num * (
X1-nLayers);
278 y += face_num * (
X2-nLayers);
281 z += face_num * (
X3-nLayers);
284 t += face_num * (
X4-nLayers);
290 idx =
X1*(
X2*(
X3*t + z) + y) +
x;
312 template <IndexType
idxType,
typename Int>
313 static __device__ __forceinline__
void coordsFromIndex(
int &idx, Int &X, Int &Y, Int &Z, Int &T,
314 const int &cb_idx,
const int &parity)
394 int aux2 = aux1 / LY;
395 y = aux1 - aux2 * LY;
398 aux1 = (parity + t + z + y) & 1;
401 }
else if (idxType ==
EVEN_Y ) {
404 idx += (parity + t + z) & 1;
407 }
else if (idxType ==
EVEN_Z ) {
409 idx += (parity + t) & 1;
429 template <IndexType
idxType,
typename Int>
430 static __device__ __forceinline__
void coordsFromIndex3D(
int &idx, Int &X, Int &Y, Int &Z, Int &T,
431 int &cb_idx,
const int &parity)
440 int xt = blockIdx.x*blockDim.x + threadIdx.x;
444 y = blockIdx.y*blockDim.y + threadIdx.y;
445 z = blockIdx.z*blockDim.z + threadIdx.z;
446 x += (parity + t + z + y) &1;
447 idx = ((t*LZ + z)*LY + y)*LX +
x;
462 template <
int dim,
int nLayers,
int face_num>
463 static inline __device__
int indexFromDWFaceIndex(
int face_idx,
const int &face_volume,
472 int face_X =
X1, face_Y =
X2, face_Z =
X3, face_T =
X4;
478 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
482 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
486 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
490 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
494 int face_XYZT = face_X * face_Y * face_Z * face_T;
495 int face_XYZ = face_X * face_Y * face_Z;
496 int face_XY = face_X * face_Y;
509 int aux1 = face_idx / face_X;
510 int aux2 = aux1 / face_Y;
511 int aux3 = aux2 / face_Z;
512 int y = aux1 - aux2 * face_Y;
513 int z = aux2 - aux3 * face_Z;
514 int s = aux3 / face_T;
515 int t = aux3 - s * face_T;
516 face_idx += (face_parity + s + t + z + y) & 1;
517 }
else if (!(face_Y & 1)) {
518 int s = face_idx / face_XYZT;
519 int t = (face_idx / face_XYZ) % face_T;
520 int z = (face_idx / face_XY) % face_Z;
521 face_idx += (face_parity + s + t + z) & 1;
522 }
else if (!(face_Z & 1)) {
523 int s = face_idx / face_XYZT;
524 int t = (face_idx / face_XYZ) % face_T;
525 face_idx += (face_parity + s + t) & 1;
526 }
else if(!(face_T)){
527 int s = face_idx / face_XYZT;
528 face_idx += (face_parity +
s) & 1;
530 face_idx += face_parity;
541 aux = face_idx / face_X;
542 idx += (aux + face_num) * gap;
546 aux = face_idx / face_XY;
547 idx += (aux + face_num) * gap * face_X;
551 aux = face_idx / face_XYZ;
552 idx += (aux + face_num) * gap * face_XY;
556 aux = face_idx / face_XYZT;
557 idx += (aux + face_num) * gap * face_XYZ;
568 template <
int nLayers,
typename Int>
569 static inline __device__
void coordsFromDWFaceIndex(
int &cb_idx, Int &X, Int &Y, Int &Z, Int &T, Int &S,
int face_idx,
570 const int &face_volume,
const int &dim,
const int &face_num,
const int &parity)
574 int face_X =
X1, face_Y =
X2, face_Z =
X3, face_T =
X4;
579 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
583 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
587 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
591 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
594 int face_XYZT = face_X * face_Y * face_Z * face_T;
595 int face_XYZ = face_X * face_Y * face_Z;
596 int face_XY = face_X * face_Y;
612 int aux1 = face_idx / face_X;
613 x = face_idx - aux1 * face_X;
614 int aux2 = aux1 / face_Y;
615 y = aux1 - aux2 * face_Y;
616 int aux3 = aux2 / face_Z;
617 z = aux2 - aux3 * face_Z;
619 t = aux3 - s * face_T;
620 x += (face_parity + s + t + z + y) & 1;
622 }
else if (!(face_Y & 1)) {
623 s = face_idx / face_XYZT;
624 t = (face_idx / face_XYZ) % face_T;
625 z = (face_idx / face_XY) % face_Z;
626 face_idx += (face_parity + s + t + z) & 1;
627 y = (face_idx / face_X) % face_Y;
628 x = face_idx % face_X;
629 }
else if (!(face_Z & 1)) {
630 s = face_idx / face_XYZT;
631 t = (face_idx / face_XYZ) % face_T;
632 face_idx += (face_parity + s + t) & 1;
633 z = (face_idx / face_XY) % face_Z;
634 y = (face_idx / face_X) % face_Y;
635 x = face_idx % face_X;
637 s = face_idx / face_XYZT;
638 face_idx += face_parity;
639 t = (face_idx / face_XYZ) % face_T;
640 z = (face_idx / face_XY) % face_Z;
641 y = (face_idx / face_X) % face_Y;
642 x = face_idx % face_X;
650 x += face_num * (
X1-nLayers);
653 y += face_num * (
X2-nLayers);
656 z += face_num * (
X3-nLayers);
659 t += face_num * (
X4-nLayers);
665 cb_idx = (
X1*(
X2*(
X3*(
X4*s + t) + z) + y) + x) >> 1;
677 template <
int dim,
int nLayers,
int face_num>
678 static inline __device__
int indexFromNdegTMFaceIndex(
int face_idx,
const int &face_volume,
683 int face_X =
X1, face_Y =
X2, face_Z =
X3, face_T =
X4;
689 face_parity = (parity + face_num * (
X1 - nLayers)) & 1;
693 face_parity = (parity + face_num * (
X2 - nLayers)) & 1;
697 face_parity = (parity + face_num * (
X3 - nLayers)) & 1;
701 face_parity = (parity + face_num * (
X4 - nLayers)) & 1;
705 int face_XYZT = face_X * face_Y * face_Z * face_T;
706 int face_XYZ = face_X * face_Y * face_Z;
707 int face_XY = face_X * face_Y;
719 int aux1 = face_idx / face_X;
720 int aux2 = aux1 / face_Y;
721 int aux3 = aux2 / face_Z;
722 int y = aux1 - aux2 * face_Y;
723 int z = aux2 - aux3 * face_Z;
724 int Nf = aux3 / face_T;
725 int t = aux3 - Nf * face_T;
726 face_idx += (face_parity + t + z + y) & 1;
727 }
else if (!(face_Y & 1)) {
728 int t = (face_idx / face_XYZ) % face_T;
729 int z = (face_idx / face_XY) % face_Z;
730 face_idx += (face_parity + t + z) & 1;
731 }
else if (!(face_Z & 1)) {
732 int t = (face_idx / face_XYZ) % face_T;
733 face_idx += (face_parity + t) & 1;
734 }
else if(!(face_T)){
735 face_idx += face_parity & 1;
737 face_idx += face_parity;
748 aux = face_idx / face_X;
749 idx += (aux + face_num) * gap;
753 aux = face_idx / face_XY;
754 idx += (aux + face_num) * gap * face_X;
758 aux = face_idx / face_XYZ;
759 idx += (aux + face_num) * gap * face_XY;
763 aux = face_idx / face_XYZT;
764 idx += (aux + face_num) * gap * face_XYZ;
778 template <
typename FloatN>
794 #ifdef USE_TEXTURE_OBJECTS
795 cudaTextureObject_t inTex;
796 cudaTextureObject_t inTexNorm;
806 template <
typename Param>
807 __device__
inline int dimFromFaceIndex (
int &face_idx,
const Param
param) {
808 if (face_idx < param.threadDimMapUpper[0]) {
810 }
else if (face_idx < param.threadDimMapUpper[1]) {
811 face_idx -= param.threadDimMapLower[1];
813 }
else if (face_idx < param.threadDimMapUpper[2]) {
814 face_idx -= param.threadDimMapLower[2];
817 face_idx -= param.threadDimMapLower[3];
822 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
825 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
826 #define READ_SPINOR READ_SPINOR_DOUBLE
827 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
828 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
831 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
832 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
833 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
834 #ifdef USE_TEXTURE_OBJECTS
835 #define SPINORTEX param.inTex
837 #define SPINORTEX spinorTexDouble
840 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
841 #define SPINOR_DOUBLE
842 template <
int dim,
int dagger,
int face_num>
843 static inline __device__
void packFaceWilsonCore(double2 *
out,
float *outNorm,
const double2 *
in,
844 const float *inNorm,
const int &idx,
845 const int &face_idx,
const int &face_volume,
846 PackParam<double2> ¶m)
848 #if (__COMPUTE_CAPABILITY__ >= 130)
854 #endif // (__COMPUTE_CAPABILITY__ >= 130)
857 #undef READ_SPINOR_UP
858 #undef READ_SPINOR_DOWN
860 #undef WRITE_HALF_SPINOR
865 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
866 #define READ_SPINOR READ_SPINOR_SINGLE
867 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
868 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
871 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
872 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
873 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
874 #ifdef USE_TEXTURE_OBJECTS
875 #define SPINORTEX param.inTex
877 #define SPINORTEX spinorTexSingle
880 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
881 template <
int dim,
int dagger,
int face_num>
882 static inline __device__
void packFaceWilsonCore(float4 *out,
float *outNorm,
const float4 *in,
const float *inNorm,
883 const int &idx,
const int &face_idx,
884 const int &face_volume,
885 const PackParam<float4> ¶m)
894 #undef READ_SPINOR_UP
895 #undef READ_SPINOR_DOWN
897 #undef WRITE_HALF_SPINOR
901 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
902 #define READ_SPINOR READ_SPINOR_HALF
903 #define READ_SPINOR_UP READ_SPINOR_HALF_UP
904 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
907 #define READ_SPINOR READ_SPINOR_HALF_TEX
908 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
909 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
910 #ifdef USE_TEXTURE_OBJECTS
911 #define SPINORTEX param.inTex
913 #define SPINORTEX spinorTexHalf
916 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
917 template <
int dim,
int dagger,
int face_num>
918 static inline __device__
void packFaceWilsonCore(short4 *out,
float *outNorm,
const short4 *in,
const float *inNorm,
919 const int &idx,
const int &face_idx,
920 const int &face_volume,
921 const PackParam<short4> ¶m)
930 #undef READ_SPINOR_UP
931 #undef READ_SPINOR_DOWN
933 #undef WRITE_HALF_SPINOR
935 template <
int dagger,
typename FloatN>
936 __global__
void packFaceWilsonKernel(PackParam<FloatN> param)
940 int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
941 if (face_idx >= param.threads)
return;
944 const int dim = dimFromFaceIndex(face_idx, param);
947 const int face_num = (face_idx >= nFace*
ghostFace[dim]) ? 1 : 0;
948 face_idx -= face_num*nFace*
ghostFace[dim];
955 const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,ghostFace[0],param.parity);
956 packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
957 param.inNorm,
idx, face_idx, ghostFace[0],
param);
959 const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,ghostFace[0],param.parity);
960 packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
961 param.inNorm,
idx, face_idx, ghostFace[0],
param);
963 }
else if (dim == 1) {
965 const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,ghostFace[1],param.parity);
966 packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
967 param.inNorm,
idx, face_idx, ghostFace[1],
param);
969 const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,ghostFace[1],param.parity);
970 packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
971 param.inNorm,
idx, face_idx, ghostFace[1],
param);
973 }
else if (dim == 2) {
975 const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,ghostFace[2],param.parity);
976 packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
977 param.inNorm,
idx, face_idx, ghostFace[2],
param);
979 const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,ghostFace[2],param.parity);
980 packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
981 param.inNorm,
idx, face_idx, ghostFace[2],
param);
985 const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,ghostFace[3],param.parity);
986 packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
987 param.inNorm,
idx, face_idx, ghostFace[3],
param);
989 const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,ghostFace[3],param.parity);
990 packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
991 param.inNorm,
idx, face_idx, ghostFace[3],
param);
997 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC
1001 template <
typename FloatN>
1002 class PackFace :
public Tunable {
1006 const cudaColorSpinorField *
in;
1012 unsigned int threads()
const {
1014 for (
int i=0; i<4; i++) {
1016 if (i==3 && !kernelPackT)
continue;
1017 threads += 2*nFace*in->GhostFace()[i];
1022 virtual int inputPerSite()
const = 0;
1023 virtual int outputPerSite()
const = 0;
1026 PackParam<FloatN> prepareParam() {
1027 PackParam<FloatN>
param;
1028 param.in = (
FloatN*)in->V();
1029 param.inNorm = (
float*)in->Norm();
1031 #ifdef USE_TEXTURE_OBJECTS
1032 param.inTex = in->Tex();
1033 param.inTexNorm = in->TexNorm();
1037 param.stride = in->Stride();
1040 for (
int i=0; i<4; i++) {
1041 param.threadDimMapLower[i] = 0;
1042 param.threadDimMapUpper[i] = 0;
1044 param.threadDimMapLower[i] = (prev>=0 ? param.threadDimMapUpper[prev] : 0);
1045 param.threadDimMapUpper[i] = param.threadDimMapLower[i] + 2*nFace*in->GhostFace()[i];
1047 size_t faceBytes = nFace*outputPerSite()*in->GhostFace()[i]*
sizeof(faces->x);
1049 if (
typeid(
FloatN) ==
typeid(short4) ||
typeid(
FloatN) ==
typeid(short2)) {
1050 faceBytes += nFace*in->GhostFace()[i]*
sizeof(float);
1051 param.out[2*i] = (
FloatN*)((
char*)faces +
1052 (outputPerSite()*
sizeof(faces->x) +
sizeof(
float))*param.threadDimMapLower[i]);
1053 param.outNorm[2*i] = (
float*)((
char*)param.out[2*i] +
1054 nFace*outputPerSite()*in->GhostFace()[i]*
sizeof(faces->x));
1056 param.out[2*i] = (
FloatN*)((
char*)faces+outputPerSite()*
sizeof(faces->x)*param.threadDimMapLower[i]);
1059 param.out[2*i+1] = (
FloatN*)((
char*)param.out[2*i] + faceBytes);
1060 param.outNorm[2*i+1] = (
float*)((
char*)param.outNorm[2*i] + faceBytes);
1072 int sharedBytesPerThread()
const {
return 0; }
1073 int sharedBytesPerBlock(
const TuneParam ¶m)
const {
return 0; }
1075 bool advanceGridDim(TuneParam ¶m)
const {
return false; }
1076 bool advanceBlockDim(TuneParam ¶m)
const {
1077 bool advance = Tunable::advanceBlockDim(param);
1078 if (advance) param.grid = dim3( (
threads()+param.block.x-1) / param.block.x, 1, 1);
1083 PackFace(
FloatN *faces,
const cudaColorSpinorField *in,
1084 const int dagger,
const int parity,
const int nFace)
1085 : faces(faces), in(in), dagger(dagger), parity(parity), nFace(nFace) { }
1086 virtual ~PackFace() { }
1088 virtual int tuningIter()
const {
return 100; }
1090 virtual TuneKey tuneKey()
const {
1091 std::stringstream vol, aux;
1092 vol << in->X()[0] <<
"x";
1093 vol << in->X()[1] <<
"x";
1094 vol << in->X()[2] <<
"x";
1096 aux <<
"threads=" <<
threads() <<
",stride=" << in->Stride() <<
",prec=" <<
sizeof(((
FloatN*)0)->x);
1097 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1100 virtual void apply(
const cudaStream_t &
stream) = 0;
1102 virtual void initTuneParam(TuneParam ¶m)
const
1104 Tunable::initTuneParam(param);
1105 param.grid = dim3( (
threads()+param.block.x-1) / param.block.x, 1, 1);
1109 virtual void defaultTuneParam(TuneParam ¶m)
const
1111 Tunable::defaultTuneParam(param);
1112 param.grid = dim3( (
threads()+param.block.x-1) / param.block.x, 1, 1);
1115 long long bytes()
const {
1116 size_t faceBytes = (inputPerSite() + outputPerSite())*this->
threads()*
sizeof(((
FloatN*)0)->x);
1118 faceBytes += 2*this->
threads()*
sizeof(float);
1123 template <
typename FloatN>
1124 class PackFaceWilson :
public PackFace<FloatN> {
1128 int inputPerSite()
const {
return 24; }
1129 int outputPerSite()
const {
return 12; }
1132 PackFaceWilson(
FloatN *faces,
const cudaColorSpinorField *in,
1133 const int dagger,
const int parity)
1134 : PackFace<
FloatN>(faces, in, dagger, parity, 1) { }
1135 virtual ~PackFaceWilson() { }
1137 void apply(
const cudaStream_t &
stream) {
1138 TuneParam tp =
tuneLaunch(*
this, dslashTuning, verbosity);
1140 #ifdef GPU_WILSON_DIRAC
1141 PackParam<FloatN> param = this->prepareParam();
1143 packFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1145 packFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1148 errorQuda(
"Wilson face packing kernel is not built");
1152 long long flops()
const {
return outputPerSite()*this->
threads(); }
1155 void packFaceWilson(
void *ghost_buf, cudaColorSpinorField &in,
const int dagger,
1156 const int parity,
const cudaStream_t &
stream) {
1158 switch(in.Precision()) {
1161 PackFaceWilson<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1167 PackFaceWilson<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1173 PackFaceWilson<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1180 #ifdef GPU_STAGGERED_DIRAC
1182 #ifdef USE_TEXTURE_OBJECTS
1183 #define SPINORTEXDOUBLE param.inTex
1184 #define SPINORTEXSINGLE param.inTex
1185 #define SPINORTEXHALF param.inTex
1186 #define SPINORTEXHALFNORM param.inTexNorm
1188 #define SPINORTEXDOUBLE spinorTexDouble
1189 #define SPINORTEXSINGLE spinorTexSingle2
1190 #define SPINORTEXHALF spinorTexHalf2
1191 #define SPINORTEXHALFNORM spinorTexHalf2Norm
1194 #if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
1195 template <
typename Float2>
1196 __device__
void packFaceAsqtadCore(Float2 *out,
float *outNorm,
const int out_idx,
1197 const int out_stride,
const Float2 *in,
const float *inNorm,
1198 const int in_idx,
const PackParam<double2> ¶m) {
1199 out[out_idx + 0*out_stride] = in[in_idx + 0*param.stride];
1200 out[out_idx + 1*out_stride] = in[in_idx + 1*param.stride];
1201 out[out_idx + 2*out_stride] = in[in_idx + 2*param.stride];
1204 __device__
void packFaceAsqtadCore(short2 *out,
float *outNorm,
const int out_idx,
1205 const int out_stride,
const short2 *in,
const float *inNorm,
1206 const int in_idx,
const PackParam<double2> ¶m) {
1207 out[out_idx + 0*out_stride] = in[in_idx + 0*param.stride];
1208 out[out_idx + 1*out_stride] = in[in_idx + 1*param.stride];
1209 out[out_idx + 2*out_stride] = in[in_idx + 2*param.stride];
1210 outNorm[out_idx] = inNorm[in_idx];
1213 __device__
void packFaceAsqtadCore(double2 *out,
float *outNorm,
const int out_idx,
1214 const int out_stride,
const double2 *in,
const float *inNorm,
1215 const int in_idx,
const PackParam<double2> ¶m) {
1216 out[out_idx + 0*out_stride] =
fetch_double2(SPINORTEXDOUBLE, in_idx + 0*param.stride);
1217 out[out_idx + 1*out_stride] =
fetch_double2(SPINORTEXDOUBLE, in_idx + 1*param.stride);
1218 out[out_idx + 2*out_stride] =
fetch_double2(SPINORTEXDOUBLE, in_idx + 2*param.stride);
1220 __device__
void packFaceAsqtadCore(float2 *out,
float *outNorm,
const int out_idx,
1221 const int out_stride,
const float2 *in,
1222 const float *inNorm,
const int in_idx,
1223 const PackParam<float2> ¶m) {
1224 out[out_idx + 0*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*param.stride);
1225 out[out_idx + 1*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*param.stride);
1226 out[out_idx + 2*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*param.stride);
1231 static inline __device__ short2 float22short2(
float c, float2 a) {
1232 return make_short2((
short)(a.x*c*
MAX_SHORT), (
short)(a.y*c*MAX_SHORT));
1235 __device__
void packFaceAsqtadCore(short2 *out,
float *outNorm,
const int out_idx,
1236 const int out_stride,
const short2 *in,
1237 const float *inNorm,
const int in_idx,
1238 const PackParam<short2> ¶m) {
1239 out[out_idx + 0*out_stride] = float22short2(1.0f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+0*param.stride));
1240 out[out_idx + 1*out_stride] = float22short2(1.0f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+1*param.stride));
1241 out[out_idx + 2*out_stride] = float22short2(1.0f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+2*param.stride));
1242 outNorm[out_idx] =
TEX1DFETCH(
float, SPINORTEXHALFNORM, in_idx);
1246 template <
typename FloatN>
1247 __global__
void packFaceAsqtadKernel(PackParam<FloatN> param)
1249 const int nFace = 3;
1251 int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1252 if (face_idx >= param.threads)
return;
1255 const int dim = dimFromFaceIndex(face_idx, param);
1258 const int face_num = (face_idx >= nFace*ghostFace[dim]) ? 1 : 0;
1259 face_idx -= face_num*nFace*ghostFace[dim];
1265 if (face_num == 0) {
1266 const int idx = indexFromFaceIndexAsqtad<0,nFace,0>(face_idx,ghostFace[0],param.parity);
1267 packFaceAsqtadCore(param.out[0], param.outNorm[0], face_idx,
1268 nFace*ghostFace[0], param.in, param.inNorm, idx, param);
1270 const int idx = indexFromFaceIndexAsqtad<0,nFace,1>(face_idx,ghostFace[0],param.parity);
1271 packFaceAsqtadCore(param.out[1], param.outNorm[1], face_idx,
1272 nFace*ghostFace[0], param.in, param.inNorm, idx, param);
1274 }
else if (dim == 1) {
1275 if (face_num == 0) {
1276 const int idx = indexFromFaceIndexAsqtad<1,nFace,0>(face_idx,ghostFace[1],param.parity);
1277 packFaceAsqtadCore(param.out[2], param.outNorm[2], face_idx,
1278 nFace*ghostFace[1], param.in, param.inNorm, idx, param);
1280 const int idx = indexFromFaceIndexAsqtad<1,nFace,1>(face_idx,ghostFace[1],param.parity);
1281 packFaceAsqtadCore(param.out[3], param.outNorm[3], face_idx,
1282 nFace*ghostFace[1], param.in, param.inNorm, idx, param);
1284 }
else if (dim == 2) {
1285 if (face_num == 0) {
1286 const int idx = indexFromFaceIndexAsqtad<2,nFace,0>(face_idx,ghostFace[2],param.parity);
1287 packFaceAsqtadCore(param.out[4], param.outNorm[4], face_idx,
1288 nFace*ghostFace[2], param.in, param.inNorm, idx, param);
1290 const int idx = indexFromFaceIndexAsqtad<2,nFace,1>(face_idx,ghostFace[2],param.parity);
1291 packFaceAsqtadCore(param.out[5], param.outNorm[5], face_idx,
1292 nFace*ghostFace[2], param.in, param.inNorm, idx, param);
1295 if (face_num == 0) {
1296 const int idx = indexFromFaceIndexAsqtad<3,nFace,0>(face_idx,ghostFace[3],param.parity);
1297 packFaceAsqtadCore(param.out[6], param.outNorm[6], face_idx,
1298 nFace*ghostFace[3], param.in, param.inNorm,idx, param);
1300 const int idx = indexFromFaceIndexAsqtad<3,nFace,1>(face_idx,ghostFace[3],param.parity);
1301 packFaceAsqtadCore(param.out[7], param.outNorm[7], face_idx,
1302 nFace*ghostFace[3], param.in, param.inNorm, idx, param);
1308 #undef SPINORTEXDOUBLE
1309 #undef SPINORTEXSINGLE
1310 #undef SPINORTEXHALF
1312 #endif // GPU_STAGGERED_DIRAC
1315 template <
typename FloatN>
1316 class PackFaceAsqtad :
public PackFace<FloatN> {
1320 int inputPerSite()
const {
return 6; }
1321 int outputPerSite()
const {
return 6; }
1324 PackFaceAsqtad(
FloatN *faces,
const cudaColorSpinorField *in,
1325 const int dagger,
const int parity)
1326 : PackFace<
FloatN>(faces, in, dagger, parity, 3) { }
1327 virtual ~PackFaceAsqtad() { }
1329 void apply(
const cudaStream_t &stream) {
1330 TuneParam tp =
tuneLaunch(*
this, dslashTuning, verbosity);
1332 #ifdef GPU_STAGGERED_DIRAC
1333 PackParam<FloatN> param = this->prepareParam();
1334 packFaceAsqtadKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1336 errorQuda(
"Asqtad face packing kernel is not built");
1340 long long flops()
const {
return 0; }
1343 void packFaceAsqtad(
void *ghost_buf, cudaColorSpinorField &in,
const int dagger,
1344 const int parity,
const cudaStream_t &stream) {
1346 switch(in.Precision()) {
1349 PackFaceAsqtad<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1355 PackFaceAsqtad<float2> pack((float2*)ghost_buf, &in, dagger, parity);
1361 PackFaceAsqtad<short2> pack((short2*)ghost_buf, &in, dagger, parity);
1369 #ifdef GPU_DOMAIN_WALL_DIRAC
1370 template <
int dagger,
typename FloatN>
1371 __global__
void packFaceDWKernel(PackParam<FloatN> param)
1373 const int nFace = 1;
1375 int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1376 if (face_idx >= param.threads)
return;
1379 const int dim = dimFromFaceIndex(face_idx, param);
1383 const int face_num = (face_idx >= nFace*
Ls*ghostFace[dim]) ? 1 : 0;
1384 face_idx -= face_num*nFace*
Ls*ghostFace[dim];
1390 if (face_num == 0) {
1391 const int idx = indexFromDWFaceIndex<0,nFace,0>(face_idx,
Ls*ghostFace[0],param.parity);
1392 packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1393 param.inNorm,
idx, face_idx,
Ls*ghostFace[0],
param);
1395 const int idx = indexFromDWFaceIndex<0,nFace,1>(face_idx,
Ls*ghostFace[0],param.parity);
1396 packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1397 param.inNorm,
idx, face_idx,
Ls*ghostFace[0],
param);
1399 }
else if (dim == 1) {
1400 if (face_num == 0) {
1401 const int idx = indexFromDWFaceIndex<1,nFace,0>(face_idx,
Ls*ghostFace[1],param.parity);
1402 packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1403 param.inNorm,
idx, face_idx,
Ls*ghostFace[1],
param);
1405 const int idx = indexFromDWFaceIndex<1,nFace,1>(face_idx,
Ls*ghostFace[1],param.parity);
1406 packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1407 param.inNorm,
idx, face_idx,
Ls*ghostFace[1],
param);
1409 }
else if (dim == 2) {
1410 if (face_num == 0) {
1411 const int idx = indexFromDWFaceIndex<2,nFace,0>(face_idx,
Ls*ghostFace[2],param.parity);
1412 packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1413 param.inNorm,
idx, face_idx,
Ls*ghostFace[2],
param);
1415 const int idx = indexFromDWFaceIndex<2,nFace,1>(face_idx,
Ls*ghostFace[2],param.parity);
1416 packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1417 param.inNorm,
idx, face_idx,
Ls*ghostFace[2],
param);
1420 if (face_num == 0) {
1421 const int idx = indexFromDWFaceIndex<3,nFace,0>(face_idx,
Ls*ghostFace[3],param.parity);
1422 packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1423 param.inNorm,
idx, face_idx,
Ls*ghostFace[3],
param);
1425 const int idx = indexFromDWFaceIndex<3,nFace,1>(face_idx,
Ls*ghostFace[3],param.parity);
1426 packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1427 param.inNorm,
idx, face_idx,
Ls*ghostFace[3],
param);
1433 template <
typename FloatN>
1434 class PackFaceDW :
public PackFace<FloatN> {
1438 int inputPerSite()
const {
return 24; }
1439 int outputPerSite()
const {
return 12; }
1442 PackFaceDW(
FloatN *faces,
const cudaColorSpinorField *in,
1443 const int dagger,
const int parity)
1444 : PackFace<
FloatN>(faces, in, dagger, parity, 1) { }
1445 virtual ~PackFaceDW() { }
1447 void apply(
const cudaStream_t &stream) {
1448 TuneParam tp =
tuneLaunch(*
this, dslashTuning, verbosity);
1450 #ifdef GPU_DOMAIN_WALL_DIRAC
1451 PackParam<FloatN> param = this->prepareParam();
1453 packFaceDWKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1455 packFaceDWKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1458 errorQuda(
"DW face packing kernel is not built");
1462 long long flops()
const {
return outputPerSite()*this->
threads(); }
1465 void packFaceDW(
void *ghost_buf, cudaColorSpinorField &in,
const int dagger,
1466 const int parity,
const cudaStream_t &stream) {
1468 switch(in.Precision()) {
1471 PackFaceDW<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1477 PackFaceDW<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1483 PackFaceDW<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1490 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1491 template <
int dagger,
typename FloatN>
1492 __global__
void packFaceNdegTMKernel(PackParam<FloatN> param)
1494 const int nFace = 1;
1497 int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1498 if (face_idx >= param.threads)
return;
1501 const int dim = dimFromFaceIndex(face_idx, param);
1505 const int face_num = (face_idx >= nFace*Nf*ghostFace[dim]) ? 1 : 0;
1506 face_idx -= face_num*nFace*Nf*ghostFace[dim];
1512 if (face_num == 0) {
1513 const int idx = indexFromNdegTMFaceIndex<0,nFace,0>(face_idx,Nf*ghostFace[0],param.parity);
1514 packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1515 param.inNorm,
idx, face_idx, Nf*ghostFace[0],
param);
1517 const int idx = indexFromNdegTMFaceIndex<0,nFace,1>(face_idx,Nf*ghostFace[0],param.parity);
1518 packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1519 param.inNorm,
idx, face_idx, Nf*ghostFace[0],
param);
1521 }
else if (dim == 1) {
1522 if (face_num == 0) {
1523 const int idx = indexFromNdegTMFaceIndex<1,nFace,0>(face_idx,Nf*ghostFace[1],param.parity);
1524 packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1525 param.inNorm,
idx, face_idx, Nf*ghostFace[1],
param);
1527 const int idx = indexFromNdegTMFaceIndex<1,nFace,1>(face_idx,Nf*ghostFace[1],param.parity);
1528 packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1529 param.inNorm,
idx, face_idx, Nf*ghostFace[1],
param);
1531 }
else if (dim == 2) {
1532 if (face_num == 0) {
1533 const int idx = indexFromNdegTMFaceIndex<2,nFace,0>(face_idx,Nf*ghostFace[2],param.parity);
1534 packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1535 param.inNorm,
idx, face_idx, Nf*ghostFace[2],
param);
1537 const int idx = indexFromNdegTMFaceIndex<2,nFace,1>(face_idx,Nf*ghostFace[2],param.parity);
1538 packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1539 param.inNorm,
idx, face_idx, Nf*ghostFace[2],
param);
1542 if (face_num == 0) {
1543 const int idx = indexFromNdegTMFaceIndex<3,nFace,0>(face_idx,Nf*ghostFace[3],param.parity);
1544 packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1545 param.inNorm,
idx, face_idx, Nf*ghostFace[3],
param);
1547 const int idx = indexFromNdegTMFaceIndex<3,nFace,1>(face_idx,Nf*ghostFace[3],param.parity);
1548 packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1549 param.inNorm,
idx, face_idx, Nf*ghostFace[3],
param);
1555 template <
typename FloatN>
1556 class PackFaceNdegTM :
public PackFace<FloatN> {
1560 int inputPerSite()
const {
return 24; }
1561 int outputPerSite()
const {
return 12; }
1564 PackFaceNdegTM(
FloatN *faces,
const cudaColorSpinorField *in,
1565 const int dagger,
const int parity)
1566 : PackFace<
FloatN>(faces, in, dagger, parity, 1) { }
1567 virtual ~PackFaceNdegTM() { }
1569 void apply(
const cudaStream_t &stream) {
1570 TuneParam tp =
tuneLaunch(*
this, dslashTuning, verbosity);
1572 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1573 PackParam<FloatN> param = this->prepareParam();
1575 packFaceNdegTMKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1577 packFaceNdegTMKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
param);
1580 errorQuda(
"Non-degenerate twisted mass face packing kernel is not built");
1584 long long flops()
const {
return outputPerSite()*this->
threads(); }
1588 void packFaceNdegTM(
void *ghost_buf, cudaColorSpinorField &in,
const int dagger,
1589 const int parity,
const cudaStream_t &stream) {
1591 switch(in.Precision()) {
1594 PackFaceNdegTM<double2> pack((double2*)ghost_buf, &in, dagger, parity);
1600 PackFaceNdegTM<float4> pack((float4*)ghost_buf, &in, dagger, parity);
1606 PackFaceNdegTM<short4> pack((short4*)ghost_buf, &in, dagger, parity);
1613 void packFace(
void *ghost_buf, cudaColorSpinorField &in,
const int dagger,
const int parity,
const cudaStream_t &stream)
1616 for (
int dim=0; dim<4; dim++) {
1620 if (!nDimPack)
return;
1623 if (in.Nspin() == 1) {
1624 packFaceAsqtad(ghost_buf, in, dagger, parity, stream);
1625 }
else if (in.Ndim() == 5) {
1627 packFaceDW(ghost_buf, in, dagger, parity, stream);
1629 packFaceNdegTM(ghost_buf, in, dagger, parity, stream);
1632 packFaceWilson(ghost_buf, in, dagger, parity, stream);