#ifdef GPU_WILSON_DIRAC
// ...
#endif // GPU_WILSON_DIRAC

#include <dslash_index.cuh>
template <typename FloatN>
struct PackParam {

  FloatN *out[2*QUDA_MAX_DIM]; // output ghost buffers, two faces per dimension
  float *outNorm[2*QUDA_MAX_DIM];

  FloatN *in;    // input field
  float *inNorm; // input norm field (half precision)

  int threads; // total number of threads

  // offsets which determine thread mapping to dimension
  int threadDimMapLower[4]; // lowest thread which maps to dim
  int threadDimMapUpper[4]; // greatest thread + 1 which maps to dim

  int parity;
#ifdef USE_TEXTURE_OBJECTS
  cudaTextureObject_t inTex;
  cudaTextureObject_t inTexNorm;
#endif

  int dim;
  int face_num;
  int X[QUDA_MAX_DIM]; // lattice dimensions
  int ghostFace[4];    // ghost face volumes
  int sp_stride;       // spinor field stride
};
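// How the thread -> dimension mapping is consumed: dimFromFaceIndex() (provided
// by dslash_index.cuh) looks the thread index up in the threadDimMapLower/Upper
// table filled in by PackFace::prepareParam() below. A minimal sketch of the
// logic, assuming the real helper also rescales face_idx to be local to the
// selected dimension:
//
//   template <typename Param>
//   static inline __device__ int dimFromFaceIndex(int &face_idx, const Param &param) {
//     for (int i = 0; i < 4; i++) {
//       if (face_idx >= param.threadDimMapLower[i] && face_idx < param.threadDimMapUpper[i]) {
//         face_idx -= param.threadDimMapLower[i]; // make the index local to dimension i
//         return i;
//       }
//     }
//     return -1; // unreachable if the map covers all param.threads threads
//   }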
template <typename FloatN>
std::ostream& operator<<(std::ostream& output, const PackParam<FloatN>& param) {
  output << "threads = " << param.threads << std::endl;
  output << "threadDimMapLower = {" << param.threadDimMapLower[0] << ","
	 << param.threadDimMapLower[1] << "," << param.threadDimMapLower[2] << ","
	 << param.threadDimMapLower[3] << "}" << std::endl;
  output << "threadDimMapUpper = {" << param.threadDimMapUpper[0] << ","
	 << param.threadDimMapUpper[1] << "," << param.threadDimMapUpper[2] << ","
	 << param.threadDimMapUpper[3] << "}" << std::endl;
  output << "parity = " << param.parity << std::endl;
  output << "dim = " << param.dim << std::endl;
  output << "face_num = " << param.face_num << std::endl;
  output << "ghostFace = {" << param.ghostFace[0] << "," << param.ghostFace[1] << ","
	 << param.ghostFace[2] << "," << param.ghostFace[3] << "}" << std::endl;
  output << "sp_stride = " << param.sp_stride << std::endl;
  return output;
}
template <typename Float>
struct PackExtendedParam : public PackParam<Float>
{
  PackExtendedParam() {}
  PackExtendedParam(const PackParam<Float>& base) : PackParam<Float>(base) {}
  int R[QUDA_MAX_DIM]; // boundary widths of the extended region
};
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)

// double precision
#if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
#define READ_SPINOR READ_SPINOR_DOUBLE
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_DOUBLE_TEX
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
#define SPINOR_DOUBLE

template <int dim, int dagger, int face_num>
static inline __device__ void packFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
						 const float *inNorm, const int &idx,
						 const int &face_idx, const int &face_volume,
						 PackParam<double2> &param)
{
#if (__COMPUTE_CAPABILITY__ >= 130)
  // ... (read the spinor via READ_SPINOR*, spin project, write via WRITE_HALF_SPINOR)
#endif // (__COMPUTE_CAPABILITY__ >= 130)
}
template <int dim, int dagger, int face_num>
static inline __device__ void unpackFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
						   const float *inNorm, const int &idx,
						   const int &face_idx, const int &face_volume,
						   PackParam<double2> &param)
{
#if (__COMPUTE_CAPABILITY__ >= 130)
  // ...
#endif // (__COMPUTE_CAPABILITY__ >= 130)
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
#undef SPINOR_DOUBLE
// single precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_SINGLE
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_SINGLE_TEX
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4

template <int dim, int dagger, int face_num>
static inline __device__ void packFaceWilsonCore(float4 *out, float *outNorm, const float4 *in,
						 const float *inNorm, const int &idx,
						 const int &face_idx, const int &face_volume,
						 const PackParam<float4> &param)
{
  // ...
}

template <int dim, int dagger, int face_num>
static inline __device__ void unpackFaceWilsonCore(float4 *out, float *outNorm, const float4 *in,
						   const float *inNorm, const int &idx,
						   const int &face_idx, const int &face_volume,
						   const PackParam<float4> &param)
{
  // ...
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
// half precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_HALF
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_HALF_TEX
#define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexHalf
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4

template <int dim, int dagger, int face_num>
static inline __device__ void packFaceWilsonCore(short4 *out, float *outNorm, const short4 *in,
						 const float *inNorm, const int &idx,
						 const int &face_idx, const int &face_volume,
						 const PackParam<short4> &param)
{
  // ...
}

template <int dim, int dagger, int face_num>
static inline __device__ void unpackFaceWilsonCore(short4 *out, float *outNorm, const short4 *in,
						   const float *inNorm, const int &idx,
						   const int &face_idx, const int &face_volume,
						   const PackParam<short4> &param)
{
  // ...
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
template <int dagger, typename FloatN>
__global__ void packFaceWilsonKernel(PackParam<FloatN> param)
{
  const int nFace = 1; // Wilson fermions only communicate one ghost face

  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    // face_num determines whether we pack the start (0) or the end (1) of the dimension
    const int face_num = (face_idx >= nFace*param.ghostFace[0]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[0], param);
    } else {
      const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (face_idx >= nFace*param.ghostFace[1]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[1], param);
    } else {
      const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (face_idx >= nFace*param.ghostFace[2]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[2], param);
    } else {
      const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (face_idx >= nFace*param.ghostFace[3]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[3], param);
    } else {
      const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[3], param);
    }
  }
}
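// Worked example of the decomposition above: with nFace = 1 and
// ghostFace = {4096, 4096, 4096, 2048}, a thread whose dimension-local
// face_idx is 5000 in dim 0 satisfies 5000 >= 1*4096, so face_num = 1
// (the forward face) and the remaining face_idx = 5000 - 4096 = 904 is the
// site offset within that face.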
template <int dagger, typename FloatN, int nFace>
__global__ void packFaceExtendedWilsonKernel(PackExtendedParam<FloatN> param)
{
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    // if param.face_num==2 pack both ends, otherwise pack only the end selected by param.face_num
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[0], param);
    } else {
      const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[1], param);
    } else {
      const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[2], param);
    } else {
      const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[3], param);
    } else {
      const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				     param.inNorm, idx, face_idx, param.ghostFace[3], param);
    }
  }
}
template <int dagger, typename FloatN, int nFace>
__global__ void unpackFaceExtendedWilsonKernel(PackExtendedParam<FloatN> param)
{
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are unpacking
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      unpackFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[0], param);
    } else {
      const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      unpackFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      unpackFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[1], param);
    } else {
      const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      unpackFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      unpackFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[2], param);
    } else {
      const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      unpackFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      unpackFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[3], param);
    } else {
      const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      unpackFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				       param.inNorm, idx, face_idx, param.ghostFace[3], param);
    }
  }
}

#endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC
#if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)
// ...
#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC

#if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)
// ...
#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC
#if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)

// double precision
#if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
#define READ_SPINOR READ_SPINOR_DOUBLE
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_DOUBLE_TEX
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
#define SPINOR_DOUBLE

template <int dim, int dagger, int face_num>
static inline __device__ void packTwistedFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
							const float *inNorm, double a, double b, const int &idx,
							const int &face_idx, const int &face_volume,
							PackParam<double2> &param)
{
#if (__COMPUTE_CAPABILITY__ >= 130)
  // ...
#endif // (__COMPUTE_CAPABILITY__ >= 130)
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
#undef SPINOR_DOUBLE
// single precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_SINGLE
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_SINGLE_TEX
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4

template <int dim, int dagger, int face_num>
static inline __device__ void packTwistedFaceWilsonCore(float4 *out, float *outNorm, const float4 *in,
							const float *inNorm, float a, float b,
							const int &idx, const int &face_idx,
							const int &face_volume,
							const PackParam<float4> &param)
{
  // ...
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
// half precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_HALF
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_HALF_TEX
#define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexHalf
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4

template <int dim, int dagger, int face_num>
static inline __device__ void packTwistedFaceWilsonCore(short4 *out, float *outNorm, const short4 *in,
							const float *inNorm, float a, float b,
							const int &idx, const int &face_idx,
							const int &face_volume,
							const PackParam<short4> &param)
{
  // ...
}

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
template <int dagger, typename FloatN, typename Float>
__global__ void packTwistedFaceWilsonKernel(Float a, Float b, PackParam<FloatN> param)
{
  const int nFace = 1; // twisted-mass Wilson fermions only communicate one ghost face

  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (face_idx >= nFace*param.ghostFace[0]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
      packTwistedFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[0], param);
    } else {
      const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
      packTwistedFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (face_idx >= nFace*param.ghostFace[1]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
      packTwistedFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[1], param);
    } else {
      const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
      packTwistedFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (face_idx >= nFace*param.ghostFace[2]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
      packTwistedFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[2], param);
    } else {
      const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
      packTwistedFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (face_idx >= nFace*param.ghostFace[3]) ? 1 : 0;
    face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
      packTwistedFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[3], param);
    } else {
      const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
      packTwistedFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
					    param.inNorm, a, b, idx, face_idx, param.ghostFace[3], param);
    }
  }
}

#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC
template <typename FloatN, typename Float>
class PackFace : public Tunable {

 protected:
  FloatN *faces;
  const cudaColorSpinorField *in;
  const int dagger;
  const int parity;
  const int nFace;
  const int dim;
  const int face_num;

  // compute how many parallel threads the face packing requires
  unsigned int threads() const {
    unsigned int threads = 0;
    if (dim < 0) { // pack all dimensions
      for (int i=0; i<4; i++) {
	if (!commDim[i]) continue; // skip dimensions that are not partitioned
	threads += 2*nFace*in->GhostFace()[i]; // 2 for forward and backward faces
      }
    } else { // pack only one dimension
      threads = nFace*in->GhostFace()[dim];
      if (face_num==2) threads *= 2; // sending forwards and backwards
    }
    return threads;
  }

  virtual int inputPerSite() const = 0;
  virtual int outputPerSite() const = 0;

  // prepare the parameter struct passed to the packing kernels
  PackParam<FloatN> prepareParam(int dim=-1, int face_num=2) {
    PackParam<FloatN> param;
    param.in = (FloatN*)in->V();
    param.inNorm = (float*)in->Norm();
    param.dim = dim;
    param.face_num = face_num;
    param.parity = parity;
    for (int d=0; d<QUDA_MAX_DIM; d++) param.X[d] = in->X()[d];

#ifdef USE_TEXTURE_OBJECTS
    param.inTex = in->Tex();
    param.inTexNorm = in->TexNorm();
#endif

    param.threads = threads();
    param.sp_stride = in->Stride();

    int prev = -1; // previous dimension that was partitioned
    for (int i=0; i<4; i++) {
      param.threadDimMapLower[i] = 0;
      param.threadDimMapUpper[i] = 0;
      if (!commDim[i]) continue;
      param.threadDimMapLower[i] = (prev >= 0 ? param.threadDimMapUpper[prev] : 0);
      param.threadDimMapUpper[i] = param.threadDimMapLower[i] + 2*nFace*in->GhostFace()[i];

      size_t faceBytes = nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x);

      if (typeid(FloatN) == typeid(short4) || typeid(FloatN) == typeid(short2)) {
	faceBytes += nFace*in->GhostFace()[i]*sizeof(float); // per-site norm array
	param.out[2*i] = (FloatN*)((char*)faces +
				   (outputPerSite()*sizeof(faces->x) + sizeof(float))*param.threadDimMapLower[i]);
	param.outNorm[2*i] = (float*)((char*)param.out[2*i] +
				      nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x));
      } else {
	param.out[2*i] = (FloatN*)((char*)faces + outputPerSite()*sizeof(faces->x)*param.threadDimMapLower[i]);
      }

      param.out[2*i+1] = (FloatN*)((char*)param.out[2*i] + faceBytes);
      param.outNorm[2*i+1] = (float*)((char*)param.outNorm[2*i] + faceBytes);

      prev = i;
    }

    param.ghostFace[0] = param.X[1]*param.X[2]*param.X[3]/2;
    param.ghostFace[1] = param.X[0]*param.X[2]*param.X[3]/2;
    param.ghostFace[2] = param.X[0]*param.X[1]*param.X[3]/2;
    param.ghostFace[3] = param.X[0]*param.X[1]*param.X[2]/2;
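    // Worked example of the ghost-face volumes above, assuming in->X() holds
    // the checkerboarded dimensions of a 16^3 x 32 lattice, i.e. X = {8,16,16,32}:
    //   ghostFace[0] = 16*16*32/2 = 4096 sites per x-face,
    //   ghostFace[3] =  8*16*16/2 = 1024 sites per t-face,
    // where the final /2 halves each face to a single parity.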
    return param;
  }

  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  bool tuneGridDim() const { return false; } // grid is fixed by minThreads()
  unsigned int minThreads() const { return threads(); }

  void fillAux() {
    strcpy(aux, in->AuxString());
    char comm[5];
    comm[0] = (commDim[0] ? '1' : '0');
    comm[1] = (commDim[1] ? '1' : '0');
    comm[2] = (commDim[2] ? '1' : '0');
    comm[3] = (commDim[3] ? '1' : '0');
    comm[4] = '\0';
    strcat(aux, ",comm=");
    strcat(aux, comm);
  }

 public:
  PackFace(FloatN *faces, const cudaColorSpinorField *in,
	   const int dagger, const int parity, const int nFace,
	   const int dim=-1, const int face_num=2)
    : faces(faces), in(in), dagger(dagger),
      parity(parity), nFace(nFace), dim(dim), face_num(face_num)
  {
    fillAux();
    bindSpinorTex<FloatN>(in);
  }

  virtual ~PackFace() {
    unbindSpinorTex<FloatN>(in);
  }

  virtual int tuningIter() const { return 3; }

  virtual TuneKey tuneKey() const {
    return TuneKey(in->VolString(), typeid(*this).name(), aux);
  }

  virtual void apply(const cudaStream_t &stream) = 0;

  long long bytes() const {
    size_t faceBytes = (inputPerSite() + outputPerSite())*this->threads()*sizeof(((FloatN*)0)->x);
    if (sizeof(((FloatN*)0)->x) == QUDA_HALF_PRECISION)
      faceBytes += 2*this->threads()*sizeof(float); // 2 is from input and output norms
    return faceBytes;
  }
};
template <typename FloatN, typename Float>
class PackFaceWilson : public PackFace<FloatN, Float> {

 private:
  int inputPerSite() const { return 24; } // input is full spinor
  int outputPerSite() const { return 12; } // output is spin projected

 public:
  PackFaceWilson(FloatN *faces, const cudaColorSpinorField *in,
		 const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
  virtual ~PackFaceWilson() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_WILSON_DIRAC
    PackParam<FloatN> param = this->prepareParam();
    if (this->dagger) {
      packFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    } else {
      packFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    }
#else
    errorQuda("Wilson face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};

void packFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
		    const int parity, const cudaStream_t &stream) {

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    {
      PackFaceWilson<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  case QUDA_SINGLE_PRECISION:
    {
      PackFaceWilson<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  case QUDA_HALF_PRECISION:
    {
      PackFaceWilson<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  default:
    errorQuda("Precision %d not supported", in.Precision());
  }
}
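// Note on the Wilson counts above: a Wilson spinor carries 4 spins x 3 colors
// x 2 reals = 24 floats per site, and the spin projector (1 +/- gamma_dim)
// halves that to 12 floats in the packed face, hence inputPerSite() = 24 and
// outputPerSite() = 12. flops() = outputPerSite()*threads() is the matching
// rough cost model (one flop per packed output component), not an exact count.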
template <typename FloatN, typename Float>
class PackFaceTwisted : public PackFace<FloatN, Float> {

 private:
  int inputPerSite() const { return 24; } // input is full spinor
  int outputPerSite() const { return 12; } // output is spin projected
  Float a;
  Float b;

 public:
  PackFaceTwisted(FloatN *faces, const cudaColorSpinorField *in,
		  const int dagger, const int parity, Float a, Float b)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, 1), a(a), b(b) { }
  virtual ~PackFaceTwisted() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_TWISTED_MASS_DIRAC
    PackParam<FloatN> param = this->prepareParam();
    if (this->dagger) {
      packTwistedFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(a, b, param);
    } else {
      packTwistedFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(a, b, param);
    }
#else
    errorQuda("Twisted face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};

void packTwistedFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
			   const int parity, const double a, const double b,
			   const cudaStream_t &stream) {

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    {
      PackFaceTwisted<double2, double> pack((double2*)ghost_buf, &in, dagger, parity, a, b);
      pack.apply(stream);
    }
    break;
  case QUDA_SINGLE_PRECISION:
    {
      PackFaceTwisted<float4, float> pack((float4*)ghost_buf, &in, dagger, parity, (float)a, (float)b);
      pack.apply(stream);
    }
    break;
  case QUDA_HALF_PRECISION:
    {
      PackFaceTwisted<short4, float> pack((short4*)ghost_buf, &in, dagger, parity, (float)a, (float)b);
      pack.apply(stream);
    }
    break;
  default:
    errorQuda("Precision %d not supported", in.Precision());
  }
}
#ifdef GPU_STAGGERED_DIRAC

#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEXDOUBLE param.inTex
#define SPINORTEXSINGLE param.inTex
#define SPINORTEXHALF param.inTex
#define SPINORTEXHALFNORM param.inTexNorm
#else
#define SPINORTEXDOUBLE spinorTexDouble
#define SPINORTEXSINGLE spinorTexSingle2
#define SPINORTEXHALF spinorTexHalf2
#define SPINORTEXHALFNORM spinorTexHalf2Norm
#endif
// a staggered spinor has three Float2 (color) components per site
template <typename Float2>
__device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const Float2 *in, const float *inNorm,
				      const int in_idx, const int in_stride) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
}

__device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const short2 *in, const float *inNorm,
				      const int in_idx, const int in_stride) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
  outNorm[out_idx] = inNorm[in_idx];
}
#if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
template <typename Float2>
__device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const Float2 *in, const float *inNorm,
				      const int in_idx, const PackParam<double2> &param) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
}

__device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const short2 *in, const float *inNorm,
				      const int in_idx, const PackParam<double2> &param) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
  outNorm[out_idx] = inNorm[in_idx];
}
#else // texture reads
#if __COMPUTE_CAPABILITY__ >= 130
__device__ void packFaceStaggeredCore(double2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const double2 *in, const float *inNorm,
				      const int in_idx, const PackParam<double2> &param) {
  out[out_idx + 0*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 0*param.sp_stride);
  out[out_idx + 1*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 1*param.sp_stride);
  out[out_idx + 2*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 2*param.sp_stride);
}
#endif

__device__ void packFaceStaggeredCore(float2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const float2 *in,
				      const float *inNorm, const int in_idx,
				      const PackParam<float2> &param) {
  out[out_idx + 0*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*param.sp_stride);
  out[out_idx + 1*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*param.sp_stride);
  out[out_idx + 2*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*param.sp_stride);
}
// converts a float2 into a short2, rescaling by c
static inline __device__ short2 float22short2(float c, float2 a) {
  return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
}
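// Example: float22short2(1.0f, make_float2(0.5f, -1.0f)) with MAX_SHORT = 32767
// yields make_short2((short)16383.5f, (short)-32767.0f) = {16383, -32767}:
// the float pair is rescaled into the full short range, and the per-site norm
// stored alongside restores the magnitude when the face is unpacked.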
__device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
				      const int out_stride, const short2 *in,
				      const float *inNorm, const int in_idx,
				      const PackParam<short2> &param) {
  out[out_idx + 0*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 0*param.sp_stride));
  out[out_idx + 1*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 1*param.sp_stride));
  out[out_idx + 2*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 2*param.sp_stride));
  outNorm[out_idx] = TEX1DFETCH(float, SPINORTEXHALFNORM, in_idx);
}
#endif // DIRECT_ACCESS_PACK || FERMI_NO_DBLE_TEX
template <typename FloatN, int nFace>
__global__ void packFaceStaggeredKernel(PackParam<FloatN> param)
{
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
      packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
			    nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
      packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
			    nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
    }
  } else if (dim == 1) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
      packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
			    nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
      packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
			    nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
    }
  } else if (dim == 2) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
      packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
			    nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
      packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
			    nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
    }
  } else { // dim == 3
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
      packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
			    nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
      packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
			    nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
    }
  }
}
template <typename FloatN, int nFace>
__global__ void packFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
{
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
			    nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
			    nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
    }
  } else if (dim == 1) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
			    nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
			    nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
    }
  } else if (dim == 2) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
			    nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
			    nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
    }
  } else { // dim == 3
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
			    nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
			    nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
    }
  }
}
// unpacking kernel: reuses packFaceStaggeredCore with the roles of the face
// buffer and the local field swapped, writing the ghost data back into param.in
template <typename FloatN, int nFace>
__global__ void unpackFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
{
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are unpacking
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[0], param.outNorm[0], face_idx, nFace*param.ghostFace[0]);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[1], param.outNorm[1], face_idx, nFace*param.ghostFace[0]);
    }
  } else if (dim == 1) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[2], param.outNorm[2], face_idx, nFace*param.ghostFace[1]);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[3], param.outNorm[3], face_idx, nFace*param.ghostFace[1]);
    }
  } else if (dim == 2) {
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[4], param.outNorm[4], face_idx, nFace*param.ghostFace[2]);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[5], param.outNorm[5], face_idx, nFace*param.ghostFace[2]);
    }
  } else { // dim == 3
    const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
    if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[6], param.outNorm[6], face_idx, nFace*param.ghostFace[3]);
    } else {
      const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
      packFaceStaggeredCore(param.in, param.inNorm, idx,
			    param.sp_stride, param.out[7], param.outNorm[7], face_idx, nFace*param.ghostFace[3]);
    }
  }
}

#undef SPINORTEXDOUBLE
#undef SPINORTEXSINGLE
#undef SPINORTEXHALF
#undef SPINORTEXHALFNORM

#endif // GPU_STAGGERED_DIRAC
template <typename FloatN, typename Float>
class PackFaceStaggered : public PackFace<FloatN, Float> {

 private:
  const int* R;      // boundary widths (extended-field packing only)
  const bool unpack; // whether to unpack instead of pack

  int inputPerSite() const { return 6; } // input is full staggered spinor
  int outputPerSite() const { return 6; } // output is full staggered spinor

 public:
  PackFaceStaggered(FloatN *faces, const cudaColorSpinorField *in,
		    const int nFace, const int dagger, const int parity,
		    const int dim, const int face_num,
		    const int* R=NULL, const bool unpack=false)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, nFace, dim, face_num), R(R), unpack(unpack) { }
  virtual ~PackFaceStaggered() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_STAGGERED_DIRAC
    PackParam<FloatN> param = this->prepareParam(this->dim, this->face_num);
    if (!R) { // regular (non-extended) field
      if (PackFace<FloatN,Float>::nFace==1) {
	packFaceStaggeredKernel<FloatN, 1> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
      } else {
	packFaceStaggeredKernel<FloatN, 3> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
      }
    } else { // R != NULL => extended field
      PackExtendedParam<FloatN> extendedParam(param);
      for (int d=0; d<QUDA_MAX_DIM; ++d) extendedParam.R[d] = R[d];
      if (!unpack) {
	switch (PackFace<FloatN,Float>::nFace) {
	case 1:
	  packFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 2:
	  packFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 3:
	  packFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 4:
	  packFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	default:
	  errorQuda("Unsupported boundary width");
	}
      } else { // extended field unpack
	switch (PackFace<FloatN,Float>::nFace) {
	case 1:
	  unpackFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 2:
	  unpackFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 3:
	  unpackFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	case 4:
	  unpackFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
	  break;
	default:
	  errorQuda("Unsupported boundary width");
	}
      }
    }
#else
    errorQuda("Staggered face packing kernel is not built");
#endif
  }

  long long flops() const { return 0; }
};
void packFaceStaggered(void *ghost_buf, cudaColorSpinorField &in, int nFace,
		       int dagger, int parity, const int dim, const int face_num,
		       const cudaStream_t &stream) {

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    {
#if __COMPUTE_CAPABILITY__ >= 130
      PackFaceStaggered<double2, double> pack((double2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
      pack.apply(stream);
#endif
    }
    break;
  case QUDA_SINGLE_PRECISION:
    {
      PackFaceStaggered<float2, float> pack((float2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
      pack.apply(stream);
    }
    break;
  case QUDA_HALF_PRECISION:
    {
      PackFaceStaggered<short2, float> pack((short2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
      pack.apply(stream);
    }
    break;
  default:
    errorQuda("Precision %d not supported", in.Precision());
  }
}

void packFaceExtendedStaggered(void *buffer, cudaColorSpinorField &field, const int nFace, const int R[],
			       int dagger, int parity, const int dim, const int face_num,
			       const cudaStream_t &stream, bool unpack=false)
{
  switch(field.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    {
#if __COMPUTE_CAPABILITY__ >= 130
      PackFaceStaggered<double2,double> pack(static_cast<double2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
      pack.apply(stream);
#endif
    }
    break;
  case QUDA_SINGLE_PRECISION:
    {
      PackFaceStaggered<float2,float> pack(static_cast<float2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
      pack.apply(stream);
    }
    break;
  case QUDA_HALF_PRECISION:
    {
      PackFaceStaggered<short2,float> pack(static_cast<short2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
      pack.apply(stream);
    }
    break;
  default:
    errorQuda("Precision %d not supported", field.Precision());
  }
}
#ifdef GPU_DOMAIN_WALL_DIRAC
template <int dagger, typename FloatN>
__global__ void packFaceDWKernel(PackParam<FloatN> param)
{
  const int nFace = 1; // domain-wall fermions only communicate one ghost face

  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  const int Ls = param.X[4]; // fifth-dimension extent

  if (dim == 0) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[0]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromDWFaceIndex<0,nFace,0>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
    } else {
      const int idx = indexFromDWFaceIndex<0,nFace,1>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[1]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromDWFaceIndex<1,nFace,0>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
    } else {
      const int idx = indexFromDWFaceIndex<1,nFace,1>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[2]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromDWFaceIndex<2,nFace,0>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
    } else {
      const int idx = indexFromDWFaceIndex<2,nFace,1>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[3]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromDWFaceIndex<3,nFace,0>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
    } else {
      const int idx = indexFromDWFaceIndex<3,nFace,1>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
    }
  }
}
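// For domain-wall fermions every 4-d face site is replicated Ls times along
// the fifth dimension, which is why all face volumes above carry the extra
// factor Ls. E.g. with Ls = param.X[4] = 16 and ghostFace[0] = 4096, each
// dim-0 face holds 16*4096 = 65536 sites.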
template <int dagger, typename FloatN>
__global__ void packFaceDW4DKernel(PackParam<FloatN> param)
{
  const int nFace = 1; // domain-wall fermions only communicate one ghost face

  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  const int Ls = param.X[4]; // fifth-dimension extent

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[0]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromDW4DFaceIndex<0,nFace,0>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
    } else {
      const int idx = indexFromDW4DFaceIndex<0,nFace,1>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[1]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromDW4DFaceIndex<1,nFace,0>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
    } else {
      const int idx = indexFromDW4DFaceIndex<1,nFace,1>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[2]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromDW4DFaceIndex<2,nFace,0>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
    } else {
      const int idx = indexFromDW4DFaceIndex<2,nFace,1>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (face_idx >= nFace*Ls*param.ghostFace[3]) ? 1 : 0;
    face_idx -= face_num*nFace*Ls*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromDW4DFaceIndex<3,nFace,0>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
    } else {
      const int idx = indexFromDW4DFaceIndex<3,nFace,1>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				     param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
    }
  }
}

#endif // GPU_DOMAIN_WALL_DIRAC
template <typename FloatN, typename Float>
class PackFaceDW : public PackFace<FloatN, Float> {

 private:
  int inputPerSite() const { return 24; } // input is full spinor
  int outputPerSite() const { return 12; } // output is spin projected

 public:
  PackFaceDW(FloatN *faces, const cudaColorSpinorField *in,
	     const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
  virtual ~PackFaceDW() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_DOMAIN_WALL_DIRAC
    PackParam<FloatN> param = this->prepareParam();
    if (this->dagger) {
      packFaceDWKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    } else {
      packFaceDWKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    }
#else
    errorQuda("DW face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};

template <typename FloatN, typename Float>
class PackFaceDW4D : public PackFace<FloatN, Float> {

 private:
  int inputPerSite() const { return 24; } // input is full spinor
  int outputPerSite() const { return 12; } // output is spin projected

 public:
  PackFaceDW4D(FloatN *faces, const cudaColorSpinorField *in,
	       const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
  virtual ~PackFaceDW4D() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_DOMAIN_WALL_DIRAC
    PackParam<FloatN> param = this->prepareParam();
    if (this->dagger) {
      packFaceDW4DKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    } else {
      packFaceDW4DKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    }
#else
    errorQuda("4D preconditioned DW face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};
void packFaceDW(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
		const int parity, const cudaStream_t &stream) {

  if (in.DWFPCtype() == QUDA_4D_PC) { // 4-d preconditioned domain wall
    switch(in.Precision()) {
    case QUDA_DOUBLE_PRECISION:
      {
	PackFaceDW4D<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    case QUDA_SINGLE_PRECISION:
      {
	PackFaceDW4D<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    case QUDA_HALF_PRECISION:
      {
	PackFaceDW4D<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    default:
      errorQuda("Precision %d not supported", in.Precision());
    }
  } else {
    switch(in.Precision()) {
    case QUDA_DOUBLE_PRECISION:
      {
	PackFaceDW<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    case QUDA_SINGLE_PRECISION:
      {
	PackFaceDW<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    case QUDA_HALF_PRECISION:
      {
	PackFaceDW<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
	pack.apply(stream);
      }
      break;
    default:
      errorQuda("Precision %d not supported", in.Precision());
    }
  }
}
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
template <int dagger, typename FloatN>
__global__ void packFaceNdegTMKernel(PackParam<FloatN> param)
{
  const int nFace = 1; // one ghost face
  const int Nf = 2;    // two flavors in the non-degenerate twisted-mass doublet

  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (face_idx >= param.threads) return;

  // determine which dimension we are packing
  const int dim = dimFromFaceIndex(face_idx, param);

  if (dim == 0) {
    const int face_num = (face_idx >= nFace*Nf*param.ghostFace[0]) ? 1 : 0;
    face_idx -= face_num*nFace*Nf*param.ghostFace[0];
    if (face_num == 0) {
      const int idx = indexFromNdegTMFaceIndex<0,nFace,0>(face_idx,Nf*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[0], param);
    } else {
      const int idx = indexFromNdegTMFaceIndex<0,nFace,1>(face_idx,Nf*param.ghostFace[0],param.parity,param.X);
      packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[0], param);
    }
  } else if (dim == 1) {
    const int face_num = (face_idx >= nFace*Nf*param.ghostFace[1]) ? 1 : 0;
    face_idx -= face_num*nFace*Nf*param.ghostFace[1];
    if (face_num == 0) {
      const int idx = indexFromNdegTMFaceIndex<1,nFace,0>(face_idx,Nf*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[1], param);
    } else {
      const int idx = indexFromNdegTMFaceIndex<1,nFace,1>(face_idx,Nf*param.ghostFace[1],param.parity,param.X);
      packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[1], param);
    }
  } else if (dim == 2) {
    const int face_num = (face_idx >= nFace*Nf*param.ghostFace[2]) ? 1 : 0;
    face_idx -= face_num*nFace*Nf*param.ghostFace[2];
    if (face_num == 0) {
      const int idx = indexFromNdegTMFaceIndex<2,nFace,0>(face_idx,Nf*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[2], param);
    } else {
      const int idx = indexFromNdegTMFaceIndex<2,nFace,1>(face_idx,Nf*param.ghostFace[2],param.parity,param.X);
      packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[2], param);
    }
  } else { // dim == 3
    const int face_num = (face_idx >= nFace*Nf*param.ghostFace[3]) ? 1 : 0;
    face_idx -= face_num*nFace*Nf*param.ghostFace[3];
    if (face_num == 0) {
      const int idx = indexFromNdegTMFaceIndex<3,nFace,0>(face_idx,Nf*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[3], param);
    } else {
      const int idx = indexFromNdegTMFaceIndex<3,nFace,1>(face_idx,Nf*param.ghostFace[3],param.parity,param.X);
      packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
				     param.inNorm, idx, face_idx, Nf*param.ghostFace[3], param);
    }
  }
}
#endif // GPU_NDEG_TWISTED_MASS_DIRAC
template <typename FloatN, typename Float>
class PackFaceNdegTM : public PackFace<FloatN, Float> {

 private:
  int inputPerSite() const { return 24; } // input is full spinor
  int outputPerSite() const { return 12; } // output is spin projected

 public:
  PackFaceNdegTM(FloatN *faces, const cudaColorSpinorField *in,
		 const int dagger, const int parity)
    : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
  virtual ~PackFaceNdegTM() { }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
    PackParam<FloatN> param = this->prepareParam();
    if (this->dagger) {
      packFaceNdegTMKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    } else {
      packFaceNdegTMKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
    }
#else
    errorQuda("Non-degenerate twisted mass face packing kernel is not built");
#endif
  }

  long long flops() const { return outputPerSite()*this->threads(); }
};

void packFaceNdegTM(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
		    const int parity, const cudaStream_t &stream) {

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    {
      PackFaceNdegTM<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  case QUDA_SINGLE_PRECISION:
    {
      PackFaceNdegTM<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  case QUDA_HALF_PRECISION:
    {
      PackFaceNdegTM<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
      pack.apply(stream);
    }
    break;
  default:
    errorQuda("Precision %d not supported", in.Precision());
  }
}
void packFace(void *ghost_buf, cudaColorSpinorField &in, const int nFace,
	      const int dagger, const int parity,
	      const int dim, const int face_num,
	      const cudaStream_t &stream,
	      const double a, const double b)
{
  int nDimPack = 0;
  if (dim < 0) {
    for (int d=0; d<4; d++) {
      if (!commDim[d]) continue;
      if (d != 3 || getKernelPackT() || a != 0.0 || b != 0.0) nDimPack++;
    }
  } else {
    if (commDim[dim] && (dim != 3 || getKernelPackT() || a != 0.0 || b != 0.0)) nDimPack++;
  }
  if (!nDimPack) return; // nothing to pack

  if (nFace != 1 && in.Nspin() != 1)
    errorQuda("Unsupported number of faces %d", nFace);

  if (in.Nspin() == 1) {
    packFaceStaggered(ghost_buf, in, nFace, dagger, parity, dim, face_num, stream);
  } else if (a != 0.0 || b != 0.0) {
    // twisted packing
    packTwistedFaceWilson(ghost_buf, in, dagger, parity, a, b, stream);
  } else if (in.Ndim() == 5) {
    if (in.TwistFlavor() == QUDA_TWIST_INVALID) {
      packFaceDW(ghost_buf, in, dagger, parity, stream);
    } else {
      packFaceNdegTM(ghost_buf, in, dagger, parity, stream);
    }
  } else {
    packFaceWilson(ghost_buf, in, dagger, parity, stream);
  }
}
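// A minimal host-side usage sketch (the buffer, field, and stream below are
// placeholders; in QUDA they come from the face-exchange machinery that calls
// packFace during the dslash halo update):
//
//   cudaColorSpinorField &spinor = ...; // device-resident source field
//   void *ghost_buf = ...;              // device buffer sized for the ghost zones
//   cudaStream_t stream = ...;          // stream the pack should run in
//
//   // pack both faces (face_num = 2) of every partitioned dimension (dim = -1),
//   // one ghost face (nFace = 1), no twist (a = b = 0.0)
//   packFace(ghost_buf, spinor, 1, dagger, parity, -1, 2, stream);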
void packFaceExtended(void* buffer, cudaColorSpinorField &field, const int nFace, const int R[],
		      const int dagger, const int parity, const int dim, const int face_num,
		      const cudaStream_t &stream, const bool unpack)
{
  int nDimPack = 0;
  if (dim < 0) {
    for (int d=0; d<4; d++) {
      if (R[d]) nDimPack++;
    }
  } else {
    if (R[dim]) nDimPack++;
  }

  if (!nDimPack) return; // nothing to pack

  if (field.Nspin() == 1) {
    packFaceExtendedStaggered(buffer, field, nFace, R, dagger, parity, dim, face_num, stream, unpack);
  } else {
    errorQuda("Extended quark field is not supported");
  }
}