// NOTE(review): this file is a garbled code-viewer extraction -- the original
// source line numbers (9, 39, 40, ...) are fused into the text and many
// intermediate lines are missing.  Code bytes are left untouched throughout.
// Fragment of the pack-ghost argument struct, templated on the field accessor.
9 template <
typename Field>
// Fifth-dimension extent: only meaningful for 5-d (domain-wall-like) fields
// when nDim == 5; collapsed to 1 for 4-d fields.
39 X[4] = (
nDim == 5) ?
a.X(4) : 1;
// Loop over the four space-time dimensions (loop body elided in this extract).
40 for (
int i=0;
i<4;
i++) {
// packGhost: host/device helper that copies one (spin tile x colour tile) of a
// single checkerboard site out of the field accessor.  Template parameters:
// Float = storage precision, Ns/Nc = total spins/colours, Ms/Mc = tile sizes
// handled per call, nDim = field dimensionality, Arg = argument struct type.
46 template <
typename Float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
typename Arg>
47 __device__ __host__
inline void packGhost(Arg &
arg,
int cb_idx,
int parity,
int spinor_parity,
int spin_block,
int color_block) {
// First visible tile loop: reads Ms x Mc spin/colour components via the
// accessor.  NOTE(review): the left-hand side of the assignment (original
// line 62) is missing from this extract -- presumably a ghost-buffer store
// for one boundary face; confirm against the unmangled source.
57 for (
int spin_local=0; spin_local<Ms; spin_local++) {
58 int s = spin_block + spin_local;
59 for (
int color_local=0; color_local<Mc; color_local++) {
60 int c = color_block + color_local;
62 =
arg.field(spinor_parity, cb_idx,
s,
c);
// Second tile loop (original lines 68-73): identical traversal, apparently
// targeting the opposite boundary face; its store target (line 73) is
// likewise elided here.
68 for (
int spin_local=0; spin_local<Ms; spin_local++) {
69 int s = spin_block + spin_local;
70 for (
int color_local=0; color_local<Mc; color_local++) {
71 int c = color_block + color_local;
73 =
arg.field(spinor_parity, cb_idx,
s,
c);
// Host (CPU) ghost-packing driver: serial triple loop over the checkerboard
// volume and all spin/colour tiles, delegating each tile to packGhost.
// NOTE(review): the function signature (original lines 81-83) is elided here.
80 template <
typename Float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
typename Arg>
// Single-parity fields always use accessor parity index 0; two-parity fields
// use the requested parity.
84 const int spinor_parity = (
arg.nParity == 2) ?
parity : 0;
85 for (
int i=0;
i<
arg.volumeCB;
i++)
// Tile the Ns spins in steps of Ms and the Nc colours in steps of Mc.
86 for (
int spin_block=0; spin_block<Ns; spin_block+=Ms)
87 for (
int color_block=0; color_block<Nc; color_block+=Mc)
88 packGhost<Float,Ns,Ms,Nc,Mc,nDim>(
arg,
i,
parity, spinor_parity, spin_block, color_block);
// GPU packing kernel: the x grid dimension indexes checkerboard sites and the
// y dimension indexes the flattened (spin tile, colour tile) pair.  Both
// indices are bounds-checked, so the launch grid may over-cover the problem.
// NOTE(review): the __global__ signature (original line 93) is elided here.
92 template <
typename Float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
typename Arg>
94 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
// Guard: excess x-threads past the checkerboard volume exit immediately.
95 if (x_cb >=
arg.volumeCB)
return;
98 const int spinor_parity = (
arg.nParity == 2) ?
parity : 0;
99 const int spin_color_block =
blockDim.y*blockIdx.y + threadIdx.y;
// There are (Ns/Ms)*(Nc/Mc) tiles in total; excess y-threads exit early.
100 if (spin_color_block >= (Ns/Ms)*(Nc/Mc))
return;
// Decompose the flat tile index into spin-tile and colour-tile origins.
101 const int spin_block = (spin_color_block / (Nc / Mc)) * Ms;
102 const int color_block = (spin_color_block % (Nc / Mc)) * Mc;
103 packGhost<Float,Ns,Ms,Nc,Mc,nDim>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
// Fragments of the autotuned launcher class: tune-key label construction,
// the nDim dispatch inside apply(), and the flops/bytes accounting hooks.
106 template <
typename Float,
int Ns,
int Ms,
int Nc,
int Mc,
typename Arg>
// Tune-key suffix identifying the pack destination.
// NOTE(review): 15 chars leaves room for ",dest=" plus an 8-char suffix and
// the NUL terminator -- confirm the elided strcat usage stays in bounds.
124 char label[15] =
",dest=";
// Loop over the two directions (backwards/forwards) per dimension.
126 for (
int dir=0; dir<2; dir++) {
// Dispatch on field dimensionality: 5-d (e.g. domain wall) vs 4-d.
138 if (
arg.nDim == 5) GenericPackGhost<Float,Ns,Ms,Nc,Mc,5,Arg>(
arg);
139 else GenericPackGhost<Float,Ns,Ms,Nc,Mc,4,Arg>(
arg);
// Packing is pure data movement, so no floating-point work is reported.
151 long long flops()
const {
return 0; }
// Byte count accumulated over the four partitionable dimensions (body elided).
153 size_t totalBytes = 0;
154 for (
int d=0;
d<4;
d++) {
// Fully-specialized instantiation point: constructs the field accessor over
// the ghost buffers and launches the pack.  One spin and one colour handled
// per thread (Ms = Mc = 1); the Arg construction between lines 167 and 173
// is elided in this extract.
162 template <
typename Float, QudaFieldOrder order,
int Ns,
int Nc>
167 Q field(
a, nFace, 0, ghost);
169 constexpr
int spins_per_thread = 1;
170 constexpr
int colors_per_thread = 1;
173 launch(
arg,
a, destination);
// Runtime-to-compile-time colour dispatch: maps a.Ncolor() onto the templated
// Nc parameter.  The supported list covers the physical colour count (3),
// small test values (2, 4) and coarse-grid multigrid colour counts
// (6 ... 1024); any other value aborts via errorQuda.
177 template <
typename Float, QudaFieldOrder order,
int Ns>
181 if (
a.Ncolor() == 2) {
182 genericPackGhost<Float,order,Ns,2>(ghost,
a,
parity, nFace,
dagger, destination);
183 }
else if (
a.Ncolor() == 3) {
184 genericPackGhost<Float,order,Ns,3>(ghost,
a,
parity, nFace,
dagger, destination);
185 }
else if (
a.Ncolor() == 4) {
186 genericPackGhost<Float,order,Ns,4>(ghost,
a,
parity, nFace,
dagger, destination);
187 }
else if (
a.Ncolor() == 6) {
188 genericPackGhost<Float,order,Ns,6>(ghost,
a,
parity, nFace,
dagger, destination);
189 }
else if (
a.Ncolor() == 8) {
190 genericPackGhost<Float,order,Ns,8>(ghost,
a,
parity, nFace,
dagger, destination);
191 }
else if (
a.Ncolor() == 12) {
192 genericPackGhost<Float,order,Ns,12>(ghost,
a,
parity, nFace,
dagger, destination);
193 }
else if (
a.Ncolor() == 16) {
194 genericPackGhost<Float,order,Ns,16>(ghost,
a,
parity, nFace,
dagger, destination);
195 }
else if (
a.Ncolor() == 20) {
196 genericPackGhost<Float,order,Ns,20>(ghost,
a,
parity, nFace,
dagger, destination);
197 }
else if (
a.Ncolor() == 24) {
198 genericPackGhost<Float,order,Ns,24>(ghost,
a,
parity, nFace,
dagger, destination);
199 }
else if (
a.Ncolor() == 28) {
200 genericPackGhost<Float,order,Ns,28>(ghost,
a,
parity, nFace,
dagger, destination);
201 }
else if (
a.Ncolor() == 32) {
202 genericPackGhost<Float,order,Ns,32>(ghost,
a,
parity, nFace,
dagger, destination);
203 }
else if (
a.Ncolor() == 48) {
204 genericPackGhost<Float,order,Ns,48>(ghost,
a,
parity, nFace,
dagger, destination);
205 }
else if (
a.Ncolor() == 72) {
206 genericPackGhost<Float,order,Ns,72>(ghost,
a,
parity, nFace,
dagger, destination);
207 }
else if (
a.Ncolor() == 96) {
208 genericPackGhost<Float,order,Ns,96>(ghost,
a,
parity, nFace,
dagger, destination);
209 }
else if (
a.Ncolor() == 256) {
210 genericPackGhost<Float,order,Ns,256>(ghost,
a,
parity, nFace,
dagger, destination);
211 }
else if (
a.Ncolor() == 576) {
212 genericPackGhost<Float,order,Ns,576>(ghost,
a,
parity, nFace,
dagger, destination);
213 }
else if (
a.Ncolor() == 768) {
214 genericPackGhost<Float,order,Ns,768>(ghost,
a,
parity, nFace,
dagger, destination);
215 }
else if (
a.Ncolor() == 1024) {
216 genericPackGhost<Float,order,Ns,1024>(ghost,
a,
parity, nFace,
dagger, destination);
// NOTE(review): the closing "} else {" (original line 217) is elided in this
// extract; the error below is the fall-through for unsupported colour counts.
218 errorQuda(
"Unsupported nColor = %d",
a.Ncolor());
// Spin dispatch: maps a.Nspin() onto the templated Ns parameter.  4 = Wilson
// type, 2 = coarse/preconditioned, 1 = staggered.  The staggered branch is
// compiled only when GPU_STAGGERED_DIRAC is defined.
223 template <
typename Float, QudaFieldOrder order>
227 if (
a.Nspin() == 4) {
228 genericPackGhost<Float,order,4>(ghost,
a,
parity, nFace,
dagger, destination);
229 }
else if (
a.Nspin() == 2) {
230 genericPackGhost<Float,order,2>(ghost,
a,
parity, nFace,
dagger, destination);
// NOTE(review): extraction fused the preprocessor guard and the closing brace
// of the Nspin==2 branch onto one line below.
231 #ifdef GPU_STAGGERED_DIRAC 232 }
else if (
a.Nspin() == 1) {
233 genericPackGhost<Float,order,1>(ghost,
a,
parity, nFace,
dagger, destination);
// Fall-through error for unsupported spin counts (closing braces elided).
236 errorQuda(
"Unsupported nSpin = %d",
a.Nspin());
// Field-order dispatch: selects the accessor layout (FLOAT2, FLOAT4 or
// space-spin-colour ordering).  The if/else structure around the three calls
// (original lines 245-251) is elided in this extract.
241 template <
typename Float>
246 genericPackGhost<Float,QUDA_FLOAT2_FIELD_ORDER>(ghost,
a,
parity, nFace,
dagger, destination);
248 genericPackGhost<Float,QUDA_FLOAT4_FIELD_ORDER>(ghost,
a,
parity, nFace,
dagger, destination);
250 genericPackGhost<Float,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>(ghost,
a,
parity, nFace,
dagger, destination);
// Fall-through for unsupported orders in the templated dispatcher.
252 errorQuda(
"Unsupported field order = %d",
a.FieldOrder());
// NOTE(review): original line 261 belongs to a separate guard (apparently in
// the top-level entry point) rejecting unsupported orders at a higher level.
261 errorQuda(
"Field order %d not supported",
a.FieldOrder());
// Top-level entry point fragments: iterate over the 4 dimensions x 2
// directions of ghost buffers (body elided), early-exit when no dimension is
// communication-partitioned, then dispatch on storage precision.
266 for (
int i=0;
i<4*2;
i++) {
// Determine whether any of the four dimensions is partitioned across ranks
// (the check inside the loop, original line 273, is elided).
271 bool partitioned =
false;
272 for (
int d=0;
d<4;
d++)
// Nothing to pack on a single-rank (unpartitioned) run.
274 if (!partitioned)
return;
277 genericPackGhost<double>(ghost,
a,
parity, nFace,
dagger, destination);
279 genericPackGhost<float>(ghost,
a,
parity, nFace,
dagger, destination);
// Fall-through for precisions without a templated instantiation here.
281 errorQuda(
"Unsupported precision %d",
a.Precision());
__device__ __host__ void packGhost(Arg &arg, int cb_idx, int parity, int spinor_parity, int spin_block, int color_block)
const char * comm_dim_partitioned_string()
Returns a string that encodes the current communication partitioning (used as part of a tuneKey).
const char * AuxString() const
QudaVerbosity getVerbosity()
const QudaDWFPCType pc_type
static __device__ __host__ void getCoords5(int x[5], int cb_index, const I X[5], int parity, QudaDWFPCType pc_type)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
unsigned int minThreads() const
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
const int * SurfaceCB() const
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
enum QudaDWFPCType_s QudaDWFPCType
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaParity_s QudaParity
__global__ void GenericPackGhostKernel(Arg arg)
QudaFieldLocation Location() const
void GenericPackGhost(Arg &arg)
int_fastdiv X[QUDA_MAX_DIM]
const ColorSpinorField & meta
PackGhostArg(Field field, void **ghost, const ColorSpinorField &a, int parity, int nFace, int dagger)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void apply(const cudaStream_t &stream)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled, but this can be overridden via the QUDA_ENABLE_TUNING environment variable.
static __inline__ size_t size_t d
QudaPrecision Precision() const
virtual ~GenericPackGhostLauncher()
int comm_dim_partitioned(int dim)
GenericPackGhostLauncher(Arg &arg, const ColorSpinorField &meta, MemoryLocation *destination)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)