45 template <
typename Float,
bool block_
float,
int Ns,
int Ms,
int Nc,
int Mc,
typename Arg>
55 :
TunableVectorYZ((Ns/Ms)*(Nc/Mc), 2*arg.nParity), arg(arg), meta(meta) {
59 create_jitify_program(
"kernels/color_spinor_pack.cuh");
73 char label[15] =
",dest=";
74 for (
int dim=0; dim<4; dim++) {
75 for (
int dir=0; dir<2; dir++) {
87 if (arg.nDim == 5) GenericPackGhost<Float,block_float,Ns,Ms,Nc,Mc,5,Arg>(
arg);
88 else GenericPackGhost<Float,block_float,Ns,Ms,Nc,Mc,4,Arg>(
arg);
91 arg.nParity2dim_threads = arg.
nParity*2*tp.
aux.x;
93 using namespace jitify::reflection;
94 jitify_error = program->kernel(
"quda::GenericPackGhostKernel")
95 .instantiate(Type<Float>(),block_float,Ns,Ms,Nc,Mc,arg.nDim,(
int)tp.
aux.x,Type<Arg>())
100 if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,1,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
101 else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,1,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
104 if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,2,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
105 else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,2,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
108 if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,4,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
109 else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,4,Arg> <<<tp.
grid,tp.
block,tp.
shared_bytes,stream>>>(
arg);
118 param.
block.y = (Ns/Ms)*(Nc/Mc);
138 if (param.
aux.x < 4) {
158 param.
aux = make_int4(1,1,1,1);
164 param.
aux = make_int4(1,1,1,1);
// Reports the floating-point operation count for this object; the value returned is the constant 0.
168 long long flops()
const {
return 0; }
170 size_t totalBytes = 0;
171 for (
int d=0; d<4; d++) {
179 template <
typename Float,
typename ghostFloat, QudaFieldOrder order,
int Ns,
int Nc>
185 Q field(a, nFace, 0, ghost);
187 constexpr
int spins_per_thread = Ns == 1 ? 1 : 2;
188 constexpr
int colors_per_thread = Nc%2 == 0 ? 2 : 1;
203 errorQuda(
"Block-float format not supported for Nc = %d", Nc);
206 launch(arg, a, destination);
219 #ifndef GPU_MULTIGRID_DOUBLE 225 template <
typename Float,
typename ghostFloat, QudaFieldOrder order,
int Ns>
233 errorQuda(
"Ncolor = %d not supported for Nspin = %d fields with precision = %d and ghost_precision = %d",
235 #ifndef GPU_MULTIGRID_DOUBLE 237 errorQuda(
"Ncolor = %d not supported for double precision fields", a.
Ncolor());
241 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,2>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
242 }
else if (a.
Ncolor() == 3) {
243 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,3>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
245 }
else if (a.
Ncolor() == 4) {
246 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,4>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
247 }
else if (a.
Ncolor() == 6) {
248 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,6>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
249 }
else if (a.
Ncolor() == 8) {
250 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,8>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
251 }
else if (a.
Ncolor() == 12) {
252 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,12>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
253 }
else if (a.
Ncolor() == 16) {
254 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,16>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
255 }
else if (a.
Ncolor() == 18) {
256 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,18>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
257 }
else if (a.
Ncolor() == 20) {
258 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,20>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
259 }
else if (a.
Ncolor() == 24) {
260 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,24>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
261 }
else if (a.
Ncolor() == 28) {
262 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,28>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
263 }
else if (a.
Ncolor() == 32) {
264 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,32>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
265 }
else if (a.
Ncolor() == 36) {
266 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,36>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
267 }
else if (a.
Ncolor() == 48) {
268 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,48>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
269 }
else if (a.
Ncolor() == 72) {
270 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,72>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
271 }
else if (a.
Ncolor() == 96) {
272 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,96>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
273 }
else if (a.
Ncolor() == 256) {
274 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,256>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
275 }
else if (a.
Ncolor() == 576) {
276 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,576>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
277 }
else if (a.
Ncolor() == 768) {
278 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,768>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
279 }
else if (a.
Ncolor() == 1024) {
280 genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,1024>::nColor>(ghost, a,
parity, nFace,
dagger, destination);
281 #endif // GPU_MULTIGRID 293 template <
typename Float,
typename ghostFloat, QudaFieldOrder order>
297 if (a.
Nspin() == 4) {
298 genericPackGhost<Float,ghostFloat,order,4>(ghost, a,
parity, nFace,
dagger, destination);
299 }
else if (a.
Nspin() == 2) {
301 genericPackGhost<Float,ghostFloat,spin_order_mapper<2,order>::order,2>(ghost, a,
parity, nFace,
dagger, destination);
302 #ifdef GPU_STAGGERED_DIRAC 303 }
else if (a.
Nspin() == 1) {
305 genericPackGhost<Float,ghostFloat,spin_order_mapper<1,order>::order,1>(ghost, a,
parity, nFace,
dagger, destination);
326 template <
typename Float,
typename ghostFloat>
333 genericPackGhost<Float,ghostFloat,QUDA_FLOAT2_FIELD_ORDER>(ghost, a,
parity, nFace,
dagger, destination);
342 genericPackGhost<typename float4_precision_mapper<Float>::type,
351 genericPackGhost<typename non_native_precision_mapper<Float>::type,
369 for (
int i=0; i<4*2; i++) {
374 bool partitioned =
false;
375 for (
int d=0; d<4; d++)
377 if (!partitioned)
return;
381 genericPackGhost<double,double>(ghost, a,
parity, nFace,
dagger, destination);
387 genericPackGhost<float,float>(ghost, a,
parity, nFace,
dagger, destination);
389 #if QUDA_PRECISION & 2 390 genericPackGhost<float,short>(ghost, a,
parity, nFace,
dagger, destination);
392 errorQuda(
"QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
395 #if QUDA_PRECISION & 1 396 genericPackGhost<float,char>(ghost, a,
parity, nFace,
dagger, destination);
398 errorQuda(
"QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
405 #if QUDA_PRECISION & 2 406 genericPackGhost<short,short>(ghost, a,
parity, nFace,
dagger, destination);
408 errorQuda(
"QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
const char * AuxString() const
void apply(const cudaStream_t &stream)
QudaVerbosity getVerbosity()
Helper file when using jitify run-time compilation. This file should be included in source code...
enum QudaFieldOrder_s QudaFieldOrder
QudaPrecision GhostPrecision() const
unsigned int minThreads() const
virtual void defaultTuneParam(TuneParam &param) const
const char * VolString() const
const int * SurfaceCB() const
virtual void initTuneParam(TuneParam &param) const
const char * comm_dim_partitioned_string(const int *comm_dim_override=0)
Return a string that defines the comm partitioning (used as a tuneKey)
void setColorSpinBlock(TuneParam &param) const
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxiliary string.
const ColorSpinorField & meta
bool advanceBlockDim(TuneParam &param) const
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
virtual int blockMin() const
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
GenericPackGhostLauncher(Arg &arg, const ColorSpinorField &meta, MemoryLocation *destination)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaParity_s QudaParity
#define MAX_BLOCK_FLOAT_NC
void initTuneParam(TuneParam &param) const
QudaFieldLocation Location() const
void resizeVector(int y, int z) const
bool advanceAux(TuneParam &param) const
virtual int blockStep() const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
virtual bool advanceBlockDim(TuneParam &param) const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
virtual ~GenericPackGhostLauncher()
QudaFieldOrder FieldOrder() const
int comm_dim_partitioned(int dim)
void defaultTuneParam(TuneParam &param) const
bool advanceBlockDim(TuneParam &param) const