10 #ifdef GPU_WILSON_DIRAC 12 #endif // GPU_WILSON_DIRAC 57 template <
typename FloatN>
69 int threadDimMapLower[4];
70 int threadDimMapUpper[4];
73 #ifdef USE_TEXTURE_OBJECTS 74 cudaTextureObject_t inTex;
75 cudaTextureObject_t inTexNorm;
89 template<
typename FloatN>
90 std::ostream& operator<<(std::ostream& output, const PackParam<FloatN>&
param) {
91 output <<
"threads = " <<
param.threads << std::endl;
92 output <<
"threadDimMapLower = {" <<
param.threadDimMapLower[0] <<
"," <<
93 param.threadDimMapLower[1] <<
"," <<
param.threadDimMapLower[2] <<
"," <<
param.threadDimMapLower[3] <<
"}" << std::endl;
94 output <<
"threadDimMapUpper = {" <<
param.threadDimMapUpper[0] <<
"," <<
95 param.threadDimMapUpper[1] <<
"," <<
param.threadDimMapUpper[2] <<
"," <<
param.threadDimMapUpper[3] <<
"}" << std::endl;
96 output <<
"parity = " <<
param.parity << std::endl;
97 output <<
"dim = " <<
param.dim << std::endl;
98 output <<
"face_num = " <<
param.face_num << std::endl;
99 output <<
"X = {" <<
param.dc.
X[0] <<
","<<
param.dc.
X[1] <<
"," <<
param.dc.
X[2] <<
"," <<
param.dc.
X[3] <<
"," <<
param.dc.
X[4] <<
"}" << std::endl;
100 output <<
"ghostFace = {" <<
param.dc.ghostFace[0] <<
","<<
param.dc.ghostFace[1] <<
"," 101 <<
param.dc.ghostFace[2] <<
"," <<
param.dc.ghostFace[3] <<
"}" << std::endl;
102 output <<
"sp_stride = " <<
param.sp_stride << std::endl;
107 template<
typename Float>
108 struct PackExtendedParam :
public PackParam<Float>
110 PackExtendedParam(){}
111 PackExtendedParam(
const PackParam<Float>&
base) : PackParam<Float>(
base) {}
115 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 118 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX) 119 #define READ_SPINOR READ_SPINOR_DOUBLE 120 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP 121 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN 124 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX 125 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX 126 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX 127 #ifdef USE_TEXTURE_OBJECTS 128 #define SPINORTEX param.inTex 130 #define SPINORTEX spinorTexDouble 133 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2 134 #define SPINOR_DOUBLE 135 template <
int dim,
int dagger,
int face_num>
136 static inline __device__
void packFaceWilsonCore(double2 *
out,
float *outNorm,
const double2 *
in,
137 const float *inNorm,
const int &
idx,
139 PackParam<double2> &
param)
148 template <
int dim,
int dagger,
int face_num>
149 static inline __device__
void unpackFaceWilsonCore(double2 *
out,
float *outNorm,
const double2 *
in,
150 const float *inNorm,
const int &
idx,
152 PackParam<double2> &
param)
163 #undef READ_SPINOR_UP 164 #undef READ_SPINOR_DOWN 166 #undef WRITE_HALF_SPINOR 171 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR 172 #define READ_SPINOR READ_SPINOR_SINGLE 173 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP 174 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN 177 #define READ_SPINOR READ_SPINOR_SINGLE_TEX 178 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX 179 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX 180 #ifdef USE_TEXTURE_OBJECTS 181 #define SPINORTEX param.inTex 183 #define SPINORTEX spinorTexSingle 186 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4 187 template <
int dim,
int dagger,
int face_num>
188 static inline __device__
void packFaceWilsonCore(float4 *
out,
float *outNorm,
const float4 *
in,
const float *inNorm,
191 const PackParam<float4> &
param)
200 template <
int dim,
int dagger,
int face_num>
201 static inline __device__
void unpackFaceWilsonCore(float4 *
out,
float *outNorm,
const float4 *
in,
const float *inNorm,
204 const PackParam<float4> &
param)
213 #undef READ_SPINOR_UP 214 #undef READ_SPINOR_DOWN 216 #undef WRITE_HALF_SPINOR 220 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR 221 #define READ_SPINOR READ_SPINOR_HALF 222 #define READ_SPINOR_UP READ_SPINOR_HALF_UP 223 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN 226 #define READ_SPINOR READ_SPINOR_HALF_TEX 227 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX 228 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX 229 #ifdef USE_TEXTURE_OBJECTS 230 #define SPINORTEX param.inTex 232 #define SPINORTEX spinorTexHalf 235 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4 236 template <
int dim,
int dagger,
int face_num>
237 static inline __device__
void packFaceWilsonCore(short4 *
out,
float *outNorm,
const short4 *
in,
const float *inNorm,
240 const PackParam<short4> &
param)
249 template <
int dim,
int dagger,
int face_num>
250 static inline __device__
void unpackFaceWilsonCore(short4 *
out,
float *outNorm,
const short4 *
in,
const float *inNorm,
253 const PackParam<short4> &
param)
262 #undef READ_SPINOR_UP 263 #undef READ_SPINOR_DOWN 265 #undef WRITE_HALF_SPINOR 267 template <
int dagger,
typename FloatN>
268 __global__
void packFaceWilsonKernel(PackParam<FloatN>
param)
273 const int sites_per_block =
param.sites_per_block;
274 int local_tid = threadIdx.x;
275 int tid = sites_per_block * blockIdx.x + local_tid;
278 constexpr
int sites_per_block = 1;
279 constexpr
int local_tid = 0;
282 while ( local_tid < sites_per_block && tid <
param.threads ) {
297 packFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
301 packFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
304 }
else if (
dim == 1) {
310 packFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
314 packFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
317 }
else if (
dim == 2) {
323 packFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
327 packFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
336 packFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
340 packFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
356 template <
int dagger,
typename FloatN,
int nFace>
357 __global__
void packFaceExtendedWilsonKernel(PackParam<FloatN>
param)
361 const int sites_per_block =
param.sites_per_block;
362 int local_tid = threadIdx.x;
363 int tid = sites_per_block * blockIdx.x + local_tid;
366 constexpr
int sites_per_block = 1;
367 constexpr
int local_tid = 0;
370 while ( local_tid < sites_per_block && tid <
param.threads ) {
387 packFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
391 packFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
394 }
else if (
dim == 1) {
399 packFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
403 packFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
406 }
else if (
dim == 2) {
411 packFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
415 packFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
424 packFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
428 packFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
444 template <
int dagger,
typename FloatN,
int nFace>
445 __global__
void unpackFaceExtendedWilsonKernel(PackParam<FloatN>
param)
449 const int sites_per_block =
param.sites_per_block;
450 int local_tid = threadIdx.x;
451 int tid = sites_per_block * blockIdx.x + local_tid;
454 constexpr
int sites_per_block = 1;
455 constexpr
int local_tid = 0;
458 while ( local_tid < sites_per_block && tid <
param.threads ) {
476 unpackFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
480 unpackFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
483 }
else if (
dim == 1) {
489 unpackFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
493 unpackFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
496 }
else if (
dim == 2) {
502 unpackFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
506 unpackFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
515 unpackFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
519 unpackFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
534 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC 537 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC) 540 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC 543 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC) 547 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC 550 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC) 553 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX) 554 #define READ_SPINOR READ_SPINOR_DOUBLE 555 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP 556 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN 559 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX 560 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX 561 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX 562 #ifdef USE_TEXTURE_OBJECTS 563 #define SPINORTEX param.inTex 565 #define SPINORTEX spinorTexDouble 568 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2 569 #define SPINOR_DOUBLE 570 template <
int dim,
int dagger,
int face_num>
571 static inline __device__
void packTwistedFaceWilsonCore(double2 *
out,
float *outNorm,
const double2 *
in,
572 const float *inNorm,
double a,
double b,
const int &
idx,
574 PackParam<double2> &
param)
583 #undef READ_SPINOR_UP 584 #undef READ_SPINOR_DOWN 586 #undef WRITE_HALF_SPINOR 591 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR 592 #define READ_SPINOR READ_SPINOR_SINGLE 593 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP 594 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN 597 #define READ_SPINOR READ_SPINOR_SINGLE_TEX 598 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX 599 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX 600 #ifdef USE_TEXTURE_OBJECTS 601 #define SPINORTEX param.inTex 603 #define SPINORTEX spinorTexSingle 606 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4 607 template <
int dim,
int dagger,
int face_num>
608 static inline __device__
void packTwistedFaceWilsonCore(float4 *
out,
float *outNorm,
const float4 *
in,
const float *inNorm,
float a,
float b,
611 const PackParam<float4> &
param)
620 #undef READ_SPINOR_UP 621 #undef READ_SPINOR_DOWN 623 #undef WRITE_HALF_SPINOR 627 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR 628 #define READ_SPINOR READ_SPINOR_HALF 629 #define READ_SPINOR_UP READ_SPINOR_HALF_UP 630 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN 633 #define READ_SPINOR READ_SPINOR_HALF_TEX 634 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX 635 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX 636 #ifdef USE_TEXTURE_OBJECTS 637 #define SPINORTEX param.inTex 639 #define SPINORTEX spinorTexHalf 642 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4 643 template <
int dim,
int dagger,
int face_num>
644 static inline __device__
void packTwistedFaceWilsonCore(short4 *
out,
float *outNorm,
const short4 *
in,
const float *inNorm,
float a,
float b,
647 const PackParam<short4> &
param)
656 #undef READ_SPINOR_UP 657 #undef READ_SPINOR_DOWN 659 #undef WRITE_HALF_SPINOR 661 template <
int dagger,
typename FloatN,
typename Float>
662 __global__
void packTwistedFaceWilsonKernel(Float
a, Float
b, PackParam<FloatN>
param)
667 const int sites_per_block =
param.sites_per_block;
668 int local_tid = threadIdx.x;
669 int tid = sites_per_block * blockIdx.x + local_tid;
672 constexpr
int sites_per_block = 1;
673 constexpr
int local_tid = 0;
676 while ( local_tid < sites_per_block && tid <
param.threads ) {
691 packTwistedFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
695 packTwistedFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
698 }
else if (
dim == 1) {
703 packTwistedFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
707 packTwistedFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
710 }
else if (
dim == 2) {
715 packTwistedFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
719 packTwistedFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
727 packTwistedFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
731 packTwistedFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
746 #endif // GPU_TWISTED_MASS_DIRAC 748 template <
typename FloatN,
typename Float>
749 class PackFace :
public Tunable {
753 const cudaColorSpinorField *
in;
762 unsigned int threads()
const {
763 unsigned int threads = 0;
765 for (
int i=0;
i<4;
i++) {
768 threads += 2*nFace*
in->GhostFace()[
i];
772 threads = nFace*
in->GhostFace()[
dim];
779 virtual int inputPerSite()
const = 0;
780 virtual int outputPerSite()
const = 0;
783 void prepareParam(PackParam<FloatN> &
param, TuneParam &tp,
int dim=-1,
int face_num=2) {
785 param.inNorm = (
float*)
in->Norm();
790 #ifdef USE_TEXTURE_OBJECTS 792 param.inTexNorm =
in->TexNorm();
795 param.threads = threads();
796 param.sp_stride =
in->Stride();
799 for (
int i=0;
i<4;
i++) {
800 param.threadDimMapLower[
i] = 0;
801 param.threadDimMapUpper[
i] = 0;
803 param.threadDimMapLower[
i] = (prev>=0 ?
param.threadDimMapUpper[prev] : 0);
804 param.threadDimMapUpper[
i] =
param.threadDimMapLower[
i] + 2*nFace*
in->GhostFace()[
i];
806 param.out[2*
i+0] =
static_cast<FloatN*
>(faces[2*
i+0]);
807 param.out[2*
i+1] =
static_cast<FloatN*
>(faces[2*
i+1]);
809 param.outNorm[2*
i+0] =
reinterpret_cast<float*
>(
static_cast<char*
>(faces[2*
i+0]) + nFace*outputPerSite()*
in->GhostFace()[
i]*
QUDA_HALF_PRECISION);
810 param.outNorm[2*
i+1] =
reinterpret_cast<float*
>(
static_cast<char*
>(faces[2*
i+1]) + nFace*outputPerSite()*
in->GhostFace()[
i]*
QUDA_HALF_PRECISION);
815 param.dc =
in->getDslashConstant();
817 param.swizzle = tp.aux.x;
818 param.sites_per_block = (
param.threads + tp.grid.x - 1) / tp.grid.x;
821 unsigned int sharedBytesPerThread()
const {
return 0; }
822 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
825 bool tuneGridDim()
const {
return true; }
826 unsigned int maxGridSize()
const {
827 if (location &
Host) {
837 unsigned int minGridSize()
const {
838 if (location &
Host) {
849 bool tuneGridDim()
const {
return location &
Host; }
853 bool tuneAuxDim()
const {
return true; }
854 unsigned int minThreads()
const {
return threads(); }
857 strcpy(aux,
"policy_kernel,");
860 comm[0] = (
commDim[0] ?
'1' :
'0');
861 comm[1] = (
commDim[1] ?
'1' :
'0');
862 comm[2] = (
commDim[2] ?
'1' :
'0');
863 comm[3] = (
commDim[3] ?
'1' :
'0');
864 comm[4] =
'\0';
strcat(aux,
",comm=");
869 case 1:
strcat(aux,
",nFace=1,");
break;
870 case 3:
strcat(aux,
",nFace=3,");
break;
871 default:
errorQuda(
"Number of faces not supported");
876 switch ((
int)location) {
881 default:
errorQuda(
"Unknown pack target location %d\n", location);
887 PackFace(
void *faces_[],
const cudaColorSpinorField *
in,
MemoryLocation location,
894 #ifndef USE_TEXTURE_OBJECTS 895 bindSpinorTex<FloatN>(
in);
899 virtual ~PackFace() {
900 #ifndef USE_TEXTURE_OBJECTS 901 unbindSpinorTex<FloatN>(
in);
907 bool advanceAux(TuneParam &
param)
const 910 if ( location &
Remote ) {
911 if (
param.aux.x < (
int)maxGridSize()) {
926 void initTuneParam(TuneParam &
param)
const {
935 void defaultTuneParam(TuneParam &
param)
const {
940 long long flops()
const {
return outputPerSite()*this->threads(); }
942 virtual int tuningIter()
const {
return 3; }
944 virtual TuneKey tuneKey()
const {
return TuneKey(
in->VolString(),
typeid(*this).name(), aux); }
946 virtual void apply(
const cudaStream_t &
stream) = 0;
948 long long bytes()
const {
949 size_t faceBytes = (inputPerSite() + outputPerSite())*this->threads()*
sizeof(((FloatN*)0)->x);
951 faceBytes += 2*this->threads()*
sizeof(
float);
956 template <
typename FloatN,
typename Float>
957 class PackFaceWilson :
public PackFace<FloatN, Float> {
961 int inputPerSite()
const {
return 24; }
962 int outputPerSite()
const {
return 12; }
965 PackFaceWilson(
void *faces[],
const cudaColorSpinorField *
in,
MemoryLocation location,
967 : PackFace<FloatN, Float>(faces,
in, location,
dagger,
parity, 1) { }
968 virtual ~PackFaceWilson() { }
970 void apply(
const cudaStream_t &
stream) {
973 #ifdef GPU_WILSON_DIRAC 974 static PackParam<FloatN>
param;
975 this->prepareParam(
param,tp);
978 void (*
func)(PackParam<FloatN>) = this->dagger ? &(packFaceWilsonKernel<1,FloatN>) : &(packFaceWilsonKernel<0,FloatN>);
981 errorQuda(
"Wilson face packing kernel is not built");
987 void packFaceWilson(
void *ghost_buf[], cudaColorSpinorField &
in,
MemoryLocation location,
990 switch(
in.Precision()) {
1010 errorQuda(
"Precision %d not supported",
in.Precision());
1014 template <
typename FloatN,
typename Float>
1015 class PackFaceTwisted :
public PackFace<FloatN, Float> {
1019 int inputPerSite()
const {
return 24; }
1020 int outputPerSite()
const {
return 12; }
1025 PackFaceTwisted(
void *faces[],
const cudaColorSpinorField *
in,
MemoryLocation location,
1028 virtual ~PackFaceTwisted() { }
1030 void apply(
const cudaStream_t &
stream) {
1033 #ifdef GPU_TWISTED_MASS_DIRAC 1034 static PackParam<FloatN>
param;
1035 this->prepareParam(
param,tp);
1037 void (*
func)(Float,Float,PackParam<FloatN>) = this->dagger ? &(packTwistedFaceWilsonKernel<1,FloatN,Float>) : &(packTwistedFaceWilsonKernel<0,FloatN,Float>);
1038 cudaLaunchKernel( (
const void*)
func, tp.grid, tp.block,
args, tp.shared_bytes,
stream);
1040 errorQuda(
"Twisted face packing kernel is not built");
1044 long long flops()
const {
return outputPerSite()*this->threads(); }
1048 void packTwistedFaceWilson(
void *ghost_buf[], cudaColorSpinorField &
in,
MemoryLocation location,
const int dagger,
1049 const int parity,
const double a,
const double b,
const cudaStream_t &
stream) {
1051 switch(
in.Precision()) {
1060 PackFaceTwisted<float4, float>
pack(ghost_buf, &
in, location,
dagger,
parity, (
float)
a, (
float)
b);
1066 PackFaceTwisted<short4, float>
pack(ghost_buf, &
in, location,
dagger,
parity, (
float)
a, (
float)
b);
1071 errorQuda(
"Precision %d not supported",
in.Precision());
1075 #ifdef GPU_STAGGERED_DIRAC 1077 #ifdef USE_TEXTURE_OBJECTS 1078 #define SPINORTEXDOUBLE param.inTex 1079 #define SPINORTEXSINGLE param.inTex 1080 #define SPINORTEXHALF param.inTex 1081 #define SPINORTEXHALFNORM param.inTexNorm 1083 #define SPINORTEXDOUBLE spinorTexDouble 1084 #define SPINORTEXSINGLE spinorTexSingle2 1085 #define SPINORTEXHALF spinorTexHalf2 1086 #define SPINORTEXHALFNORM spinorTexHalf2Norm 1089 template <
typename Float2>
1090 __device__
void packFaceStaggeredCore(Float2 *
out,
float *outNorm,
const int out_idx,
1091 const int out_stride,
const Float2 *
in,
const float *inNorm,
1092 const int in_idx,
const int in_stride) {
1093 out[out_idx + 0*out_stride] =
in[in_idx + 0*in_stride];
1094 out[out_idx + 1*out_stride] =
in[in_idx + 1*in_stride];
1095 out[out_idx + 2*out_stride] =
in[in_idx + 2*in_stride];
1098 __device__
void packFaceStaggeredCore(short2 *
out,
float *outNorm,
const int out_idx,
1099 const int out_stride,
const short2 *
in,
const float *inNorm,
1100 const int in_idx,
const int in_stride) {
1101 out[out_idx + 0*out_stride] =
in[in_idx + 0*in_stride];
1102 out[out_idx + 1*out_stride] =
in[in_idx + 1*in_stride];
1103 out[out_idx + 2*out_stride] =
in[in_idx + 2*in_stride];
1104 outNorm[out_idx] = inNorm[in_idx];
1107 #if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX) 1108 template <
typename Float2>
1109 __device__
void packFaceStaggeredCore(Float2 *
out,
float *outNorm,
const int out_idx,
1110 const int out_stride,
const Float2 *
in,
const float *inNorm,
1111 const int in_idx,
const PackParam<double2> &
param) {
1112 out[out_idx + 0*out_stride] =
in[in_idx + 0*
param.sp_stride];
1113 out[out_idx + 1*out_stride] =
in[in_idx + 1*
param.sp_stride];
1114 out[out_idx + 2*out_stride] =
in[in_idx + 2*
param.sp_stride];
1117 __device__
void packFaceStaggeredCore(short2 *
out,
float *outNorm,
const int out_idx,
1118 const int out_stride,
const short2 *
in,
const float *inNorm,
1119 const int in_idx,
const PackParam<double2> &
param) {
1120 out[out_idx + 0*out_stride] =
in[in_idx + 0*
param.sp_stride];
1121 out[out_idx + 1*out_stride] =
in[in_idx + 1*
param.sp_stride];
1122 out[out_idx + 2*out_stride] =
in[in_idx + 2*
param.sp_stride];
1123 outNorm[out_idx] = inNorm[in_idx];
1128 __device__
void packFaceStaggeredCore(double2 *
out,
float *outNorm,
const int out_idx,
1129 const int out_stride,
const double2 *
in,
const float *inNorm,
1130 const int in_idx,
const PackParam<double2> &
param) {
1136 __device__
void packFaceStaggeredCore(float2 *
out,
float *outNorm,
const int out_idx,
1137 const int out_stride,
const float2 *
in,
1138 const float *inNorm,
const int in_idx,
1139 const PackParam<float2> &
param) {
1140 out[out_idx + 0*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*
param.sp_stride);
1141 out[out_idx + 1*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*
param.sp_stride);
1142 out[out_idx + 2*out_stride] =
TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*
param.sp_stride);
1147 static inline __device__ short2 float22short2(
float c, float2
a) {
1151 __device__
void packFaceStaggeredCore(short2 *
out,
float *outNorm,
const int out_idx,
1152 const int out_stride,
const short2 *
in,
1153 const float *inNorm,
const int in_idx,
1154 const PackParam<short2> &
param) {
1155 out[out_idx + 0*out_stride] = float22short2(1.0
f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+0*
param.sp_stride));
1156 out[out_idx + 1*out_stride] = float22short2(1.0
f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+1*
param.sp_stride));
1157 out[out_idx + 2*out_stride] = float22short2(1.0
f,
TEX1DFETCH(float2,SPINORTEXHALF,in_idx+2*
param.sp_stride));
1158 outNorm[out_idx] =
TEX1DFETCH(
float, SPINORTEXHALFNORM, in_idx);
1163 template <
typename FloatN,
int nFace>
1164 __global__
void packFaceStaggeredKernel(PackParam<FloatN>
param)
1168 const int sites_per_block =
param.sites_per_block;
1169 int local_tid = threadIdx.x;
1170 int tid = sites_per_block * blockIdx.x + local_tid;
1173 constexpr
int sites_per_block = 1;
1174 constexpr
int local_tid = 0;
1177 while ( local_tid < sites_per_block && tid <
param.threads ) {
1201 }
else if (
dim == 1) {
1213 }
else if (
dim == 2) {
1250 template <
typename FloatN,
int nFace>
1251 __global__
void packFaceExtendedStaggeredKernel(PackExtendedParam<FloatN>
param)
1255 const int sites_per_block =
param.sites_per_block;
1256 int local_tid = threadIdx.x;
1257 int tid = sites_per_block * blockIdx.x + local_tid;
1260 constexpr
int sites_per_block = 1;
1261 constexpr
int local_tid = 0;
1264 while ( local_tid < sites_per_block && tid <
param.threads ) {
1280 const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(
face_idx,
param);
1284 const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(
face_idx,
param);
1288 }
else if (
dim == 1) {
1292 const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(
face_idx,
param);
1296 const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(
face_idx,
param);
1300 }
else if (
dim == 2) {
1304 const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(
face_idx,
param);
1308 const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(
face_idx,
param);
1316 const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(
face_idx,
param);
1320 const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(
face_idx,
param);
1337 template <
typename FloatN,
int nFace>
1338 __global__
void unpackFaceExtendedStaggeredKernel(PackExtendedParam<FloatN>
param)
1342 const int sites_per_block =
param.sites_per_block;
1343 int local_tid = threadIdx.x;
1344 int tid = sites_per_block * blockIdx.x + local_tid;
1347 constexpr
int sites_per_block = 1;
1348 constexpr
int local_tid = 0;
1351 while ( local_tid < sites_per_block && tid <
param.threads ) {
1367 const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(
face_idx,
param);
1371 const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(
face_idx,
param);
1375 }
else if (
dim == 1) {
1379 const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(
face_idx,
param);
1383 const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(
face_idx,
param);
1387 }
else if (
dim == 2) {
1391 const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(
face_idx,
param);
1395 const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(
face_idx,
param);
1403 const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(
face_idx,
param);
1407 const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(
face_idx,
param);
1424 #undef SPINORTEXDOUBLE 1425 #undef SPINORTEXSINGLE 1426 #undef SPINORTEXHALF 1428 #endif // GPU_STAGGERED_DIRAC 1431 template <
typename FloatN,
typename Float>
1432 class PackFaceStaggered :
public PackFace<FloatN, Float> {
1438 int inputPerSite()
const {
return 6; }
1439 int outputPerSite()
const {
return 6; }
1443 PackFaceStaggered(
void *faces[],
const cudaColorSpinorField *
in,
MemoryLocation location,
1445 const int dim,
const int face_num,
const int*
R=NULL,
const bool unpack=
false)
1446 : PackFace<FloatN, Float>(faces,
in, location,
dagger,
parity, nFace,
dim,
face_num),
R(
R), unpack(unpack) { }
1447 virtual ~PackFaceStaggered() { }
1449 void apply(
const cudaStream_t &
stream) {
1452 #ifdef GPU_STAGGERED_DIRAC 1454 static PackParam<FloatN>
param;
1455 this->prepareParam(
param,tp,this->dim,this->face_num);
1458 void (*
func)(PackParam<FloatN>) = PackFace<FloatN,Float>::nFace==1 ? &(packFaceStaggeredKernel<FloatN,1>) : &(packFaceStaggeredKernel<FloatN,3>);
1459 cudaLaunchKernel( (
const void*)
func, tp.grid, tp.block,
args, tp.shared_bytes,
stream);
1461 PackExtendedParam<FloatN> extendedParam(
param);
1464 switch(PackFace<FloatN,Float>::nFace){
1466 packFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1470 packFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1474 packFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1478 packFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1482 errorQuda(
"Unsupported boundary width");
1486 switch(PackFace<FloatN,Float>::nFace){
1488 unpackFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1492 unpackFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1496 unpackFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1500 unpackFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(extendedParam);
1504 errorQuda(
"Unsupported boundary width");
1510 errorQuda(
"Staggered face packing kernel is not built");
1514 long long flops()
const {
return 0; }
1518 void packFaceStaggered(
void *ghost_buf[], cudaColorSpinorField &
in,
MemoryLocation location,
int nFace,
1521 switch(
in.Precision()) {
1541 errorQuda(
"Precision %d not supported",
in.Precision());
1545 void packFaceExtendedStaggered(
void *buffer[], cudaColorSpinorField &field,
MemoryLocation location,
const int nFace,
const int R[],
1548 switch(field.Precision()){
1551 PackFaceStaggered<double2,double>
pack(buffer, &field, location, nFace,
dagger,
parity,
dim,
face_num,
R, unpack);
1557 PackFaceStaggered<float2,float>
pack(buffer, &field, location, nFace,
dagger,
parity,
dim,
face_num,
R, unpack);
1563 PackFaceStaggered<short2,float>
pack(buffer, &field, location, nFace,
dagger,
parity,
dim,
face_num,
R, unpack);
1568 errorQuda(
"Precision %d not supported", field.Precision());
1572 #ifdef GPU_DOMAIN_WALL_DIRAC 1573 template <
int dagger,
typename FloatN>
1574 __global__
void packFaceDWKernel(PackParam<FloatN>
param)
1576 const int nFace = 1;
1579 const int sites_per_block =
param.sites_per_block;
1580 int local_tid = threadIdx.x;
1581 int tid = sites_per_block * blockIdx.x + local_tid;
1584 constexpr
int sites_per_block = 1;
1585 constexpr
int local_tid = 0;
1588 while ( local_tid < sites_per_block && tid <
param.threads ) {
1605 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,0,nFace,0>(
face_idx,
param);
1606 packFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
1609 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,0,nFace,1>(
face_idx,
param);
1610 packFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
1613 }
else if (
dim == 1) {
1617 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,1,nFace,0>(
face_idx,
param);
1618 packFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
1621 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,1,nFace,1>(
face_idx,
param);
1622 packFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
1625 }
else if (
dim == 2) {
1629 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,2,nFace,0>(
face_idx,
param);
1630 packFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
1633 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,2,nFace,1>(
face_idx,
param);
1634 packFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
1641 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,3,nFace,0>(
face_idx,
param);
1642 packFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
1645 const int idx = indexFromFaceIndex<5,QUDA_5D_PC,3,nFace,1>(
face_idx,
param);
1646 packFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
1662 template <
int dagger,
typename FloatN>
1663 __global__
void packFaceDW4DKernel(PackParam<FloatN>
param)
1665 const int nFace = 1;
1668 const int sites_per_block =
param.sites_per_block;
1669 int local_tid = threadIdx.x;
1670 int tid = sites_per_block * blockIdx.x + local_tid;
1673 constexpr
int sites_per_block = 1;
1674 constexpr
int local_tid = 0;
1677 while ( local_tid < sites_per_block && tid <
param.threads ) {
1694 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,0>(
face_idx,
param);
1695 packFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
1698 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,1>(
face_idx,
param);
1699 packFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
1702 }
else if (
dim == 1) {
1706 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,0>(
face_idx,
param);
1707 packFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
1710 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,1>(
face_idx,
param);
1711 packFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
1714 }
else if (
dim == 2) {
1718 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,0>(
face_idx,
param);
1719 packFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
1722 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,1>(
face_idx,
param);
1723 packFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
1730 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,0>(
face_idx,
param);
1731 packFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
1734 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,1>(
face_idx,
param);
1735 packFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
//------------------------------------------------------------------------
// Face packer for 5-d domain-wall fermions.
//
// Each site reads a full 24-real spinor and writes a 12-real
// spin-projected half spinor into the ghost buffers, one face layer deep.
//
// NOTE(review): the extracted original was mangled — stray source line
// numbers were fused into the code, the constructor's trailing parameters,
// the TuneParam/args declarations in apply(), and the #else/#endif of the
// GPU_DOMAIN_WALL_DIRAC guard were dropped.  They are reconstructed here
// from the surviving tokens (the base-class initializer names `dagger`
// and `parity`; `tp` and `args` were used but never declared); confirm
// against the upstream file.
//------------------------------------------------------------------------
template <typename FloatN, typename Float>
class PackFaceDW : public PackFace<FloatN, Float> {

private:
  int inputPerSite() const { return 24; }   // full spinor in
  int outputPerSite() const { return 12; }  // spin-projected half spinor out

public:
  PackFaceDW(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
             const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }  // nFace = 1
  virtual ~PackFaceDW() { }

  // Launch the DW packing kernel on the given stream, autotuning the
  // launch geometry.  Errors out if the kernel was not compiled in.
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_DOMAIN_WALL_DIRAC
    static PackParam<FloatN> param;  // static: passed to the launch by address below
    this->prepareParam(param, tp);
    void *args[] = { &param };
    // Select the dagger/non-dagger kernel instantiation at run time.
    void (*func)(PackParam<FloatN>) =
      this->dagger ? &(packFaceDWKernel<1, FloatN>) : &(packFaceDWKernel<0, FloatN>);
    // NOTE(review): the cudaError_t return is discarded; consider the
    // qudaLaunchKernel() wrapper declared elsewhere in this file.
    cudaLaunchKernel((const void *)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
#else
    errorQuda("DW face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};
//------------------------------------------------------------------------
// Face packer for 4-d (even-odd) preconditioned domain-wall fermions.
//
// Each site reads a full 24-real spinor and writes a 12-real
// spin-projected half spinor into the ghost buffers, one face layer deep.
//
// NOTE(review): the extracted original was mangled — stray source line
// numbers were fused into the code, the constructor's trailing parameters,
// the TuneParam/args declarations in apply(), and the #else/#endif of the
// GPU_DOMAIN_WALL_DIRAC guard were dropped.  They are reconstructed here
// from the surviving tokens (the base-class initializer names `dagger`
// and `parity`; `tp` and `args` were used but never declared); confirm
// against the upstream file.
//------------------------------------------------------------------------
template <typename FloatN, typename Float>
class PackFaceDW4D : public PackFace<FloatN, Float> {

private:
  int inputPerSite() const { return 24; }   // full spinor in
  int outputPerSite() const { return 12; }  // spin-projected half spinor out

public:
  PackFaceDW4D(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
               const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }  // nFace = 1
  virtual ~PackFaceDW4D() { }

  // Launch the 4-d preconditioned DW packing kernel on the given stream,
  // autotuning the launch geometry.  Errors out if not compiled in.
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_DOMAIN_WALL_DIRAC
    static PackParam<FloatN> param;  // static: passed to the launch by address below
    this->prepareParam(param, tp);
    void *args[] = { &param };
    // Select the dagger/non-dagger kernel instantiation at run time.
    void (*func)(PackParam<FloatN>) =
      this->dagger ? &(packFaceDW4DKernel<1, FloatN>) : &(packFaceDW4DKernel<0, FloatN>);
    // NOTE(review): the cudaError_t return is discarded; consider the
    // qudaLaunchKernel() wrapper declared elsewhere in this file.
    cudaLaunchKernel((const void *)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
#else
    errorQuda("4D preconditioned DW face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};
// NOTE(review): fragments of two host-side precision dispatchers
// (presumably the packFaceDW and packFaceDW4D wrappers — their signatures
// and the per-precision cases at original lines ~1815-1864 were dropped
// by extraction).  Each switch selects a precision-specific packer from
// in.Precision() and hard-errors on any unsupported precision.
1820 switch(
in.Precision()) {
1840 errorQuda(
"Precision %d not supported",
in.Precision());
// Second dispatcher fragment (the 4-d preconditioned variant — TODO confirm).
1845 switch(
in.Precision()) {
1865 errorQuda(
"Precision %d not supported",
in.Precision());
// =====================================================================
// NOTE(review): this region was mangled by extraction — the original
// file's line numbers (1871, 1872, ...) are fused into the code text and
// several statements are missing (the dimFromFaceIndex()/`dim`
// computation, the opening `if (dim == 0)`, the trailing arguments of
// every packFaceWilsonCore call, and the closing braces).  Comments below
// describe only what the visible tokens establish.
//
// Kernel: packs the single-layer ghost faces of a non-degenerate
// twisted-mass spinor.  For each dimension (0..3) and each face
// (template index 0/1 — presumably back/front; confirm against
// indexFromFaceIndex) a spin-projected spinor is written into the
// matching param.out[0..7] / param.outNorm[0..7] buffer.
// =====================================================================
1870 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC 1871 template <
int dagger,
typename FloatN>
1872 __global__
void packFaceNdegTMKernel(PackParam<FloatN>
param)
// Wilson-like packing: only the nearest face layer is exchanged.
1874 const int nFace = 1;
// One thread per face site; each block covers sites_per_block sites.
1878 const int sites_per_block =
param.sites_per_block;
1879 int local_tid = threadIdx.x;
1880 int tid = sites_per_block * blockIdx.x + local_tid;
// Alternate compile path (its surrounding #ifdef/#else was dropped):
// exactly one site per thread, so the while loop below runs once.
1883 constexpr
int sites_per_block = 1;
1884 constexpr
int local_tid = 0;
// Strided loop over the face sites assigned to this thread.
1887 while ( local_tid < sites_per_block && tid <
param.threads ) {
// dim == 0 (the guarding `if` was lost): back face then front face.
1903 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,0>(
face_idx,
param);
1904 packFaceWilsonCore<0,dagger,0>(
param.out[0],
param.outNorm[0],
param.in,
1907 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,1>(
face_idx,
param);
1908 packFaceWilsonCore<0,dagger,1>(
param.out[1],
param.outNorm[1],
param.in,
1911 }
else if (
dim == 1) {
// dim == 1: ghost buffers 2 and 3.
1915 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,0>(
face_idx,
param);
1916 packFaceWilsonCore<1, dagger,0>(
param.out[2],
param.outNorm[2],
param.in,
1919 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,1>(
face_idx,
param);
1920 packFaceWilsonCore<1, dagger,1>(
param.out[3],
param.outNorm[3],
param.in,
1923 }
else if (
dim == 2) {
// dim == 2: ghost buffers 4 and 5.
1927 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,0>(
face_idx,
param);
1928 packFaceWilsonCore<2, dagger,0>(
param.out[4],
param.outNorm[4],
param.in,
1931 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,1>(
face_idx,
param);
1932 packFaceWilsonCore<2, dagger,1>(
param.out[5],
param.outNorm[5],
param.in,
// dim == 3 (its `else` opener was dropped): ghost buffers 6 and 7.
1939 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,0>(
face_idx,
param);
1940 packFaceWilsonCore<3, dagger,0>(
param.out[6],
param.outNorm[6],
param.in,
1943 const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,1>(
face_idx,
param);
1944 packFaceWilsonCore<3, dagger,1>(
param.out[7],
param.outNorm[7],
param.in,
//------------------------------------------------------------------------
// Face packer for non-degenerate twisted-mass fermions.
//
// Each site reads a full 24-real spinor and writes a 12-real
// spin-projected half spinor into the ghost buffers, one face layer deep.
//
// NOTE(review): the extracted original was mangled — stray source line
// numbers were fused into the code, the constructor's trailing parameters,
// the TuneParam/args declarations in apply(), and the #else/#endif of the
// GPU_NDEG_TWISTED_MASS_DIRAC guard were dropped.  They are reconstructed
// here from the surviving tokens (the base-class initializer names
// `dagger` and `parity`; `tp` and `args` were used but never declared);
// confirm against the upstream file.
//------------------------------------------------------------------------
template <typename FloatN, typename Float>
class PackFaceNdegTM : public PackFace<FloatN, Float> {

private:
  int inputPerSite() const { return 24; }   // full spinor in
  int outputPerSite() const { return 12; }  // spin-projected half spinor out

public:
  PackFaceNdegTM(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
                 const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }  // nFace = 1
  virtual ~PackFaceNdegTM() { }

  // Launch the non-degenerate twisted-mass packing kernel on the given
  // stream, autotuning the launch geometry.  Errors out if not compiled in.
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
    static PackParam<FloatN> param;  // static: passed to the launch by address below
    this->prepareParam(param, tp);
    void *args[] = { &param };
    // Select the dagger/non-dagger kernel instantiation at run time.
    void (*func)(PackParam<FloatN>) =
      this->dagger ? &(packFaceNdegTMKernel<1, FloatN>) : &(packFaceNdegTMKernel<0, FloatN>);
    // NOTE(review): the cudaError_t return is discarded; consider the
    // qudaLaunchKernel() wrapper declared elsewhere in this file.
    cudaLaunchKernel((const void *)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
#else
    errorQuda("Non-degenerate twisted mass face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};
// Host-side entry point that packs the faces of a non-degenerate
// twisted-mass field, dispatching on spinor precision.
// NOTE(review): the parameter list and the per-precision switch cases
// (original lines ~1993-2014) were dropped by extraction; only the
// signature head and the unsupported-precision error remain visible.
1992 void packFaceNdegTM(
void *ghost_buf[], cudaColorSpinorField &
in,
MemoryLocation location,
const int dagger,
// Dispatch on field precision (double/single/half cases missing here).
1995 switch(
in.Precision()) {
// Any precision without an explicit case is a hard error.
2015 errorQuda(
"Precision %d not supported",
in.Precision());
// NOTE(review): tail of the generic packFace() host entry point (matching
// the declaration echoed at the bottom of this chunk); its opening
// signature lines (original ~2022) and several statements (the nDimPack
// accumulation body and the staggered/DW dispatch bodies) were dropped by
// extraction.  `a` and `b` are the twisted-packing coefficients.
2023 const cudaStream_t &
stream,
2024 const double a,
const double b)
// Count how many of the four dimensions are partitioned and need packing.
2028 for (
int d=0;
d<4;
d++) {
// Nothing to do if no dimension is partitioned.
2037 if (!nDimPack)
return;
// Only staggered fields (Nspin == 1) support multi-layer (nFace > 1) faces.
2039 if (nFace != 1 &&
in.Nspin() != 1)
2040 errorQuda(
"Unsupported number of faces %d", nFace);
// Staggered branch (body dropped by extraction).
2043 if (
in.Nspin() == 1) {
2045 }
// Twisted packing (a or b nonzero) is rejected for this spinor type.
else if (
a!=0.0 ||
b!=0.0) {
2050 errorQuda(
"Cannot perform twisted packing for the spinor.");
2052 }
// 5-d (domain-wall) fields take the DW packing path.
else if (
in.Ndim() == 5) {
// NOTE(review): tail of packFaceExtended() (matching the declaration
// echoed at the bottom of this chunk); its opening signature lines and
// the else/closing braces around the staggered call were dropped by
// extraction.  R[] is the per-dimension extension radius.
2068 const cudaStream_t &
stream,
const bool unpack)
// Count partitioned dimensions with a nonzero extension radius R[d].
2072 for(
int d=0;
d<4;
d++){
2073 if(
R[
d]) nDimPack++;
// A single requested dimension also counts if extended (surrounding
// condition dropped — presumably the dim >= 0 single-dimension path).
2076 if(
R[
dim]) nDimPack++;
2079 if(!nDimPack)
return;
// Only staggered (Nspin == 1) extended fields are supported.
2080 if(field.Nspin() == 1){
2081 packFaceExtendedStaggered(buffer, field, location, nFace,
R,
dagger,
parity,
dim,
face_num,
stream, unpack);
// Anything else errors out (the `else` keyword was dropped here).
2083 errorQuda(
"Extended quark field is not supported");
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
void setPackComms(const int *commDim)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
void packFaceExtended(void *ghost_buf[2 *QUDA_MAX_DIM], cudaColorSpinorField &field, MemoryLocation location, const int nFace, const int R[], const int dagger, const int parity, const int dim, const int face_num, const cudaStream_t &stream, const bool unpack=false)
void packFace(void *ghost_buf[2 *QUDA_MAX_DIM], cudaColorSpinorField &in, MemoryLocation location, const int nFace, const int dagger, const int parity, const int dim, const int face_num, const cudaStream_t &stream, const double a=0.0, const double b=0.0)
Dslash face packing routine.
virtual unsigned int maxGridSize() const
__device__ int dimFromFaceIndex(int &face_idx, const int tid, const Param ¶m)
Determines which face a given thread is computing. Also rescale face_idx so that is relative to a giv...
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int int int enum cudaChannelFormatKind f
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
void * memcpy(void *__dst, const void *__src, size_t __n)
virtual bool tuneSharedBytes() const
virtual unsigned int minGridSize() const
cpuColorSpinorField * out
virtual void initTuneParam(TuneParam ¶m) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
int comm_peer2peer_enabled_global()
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
__device__ int block_idx(const T &swizzle)
Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotun...
#define TEX1DFETCH(type, tex, idx)
virtual void defaultTuneParam(TuneParam ¶m) const