16 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor, QudaFieldOrder order>
17 struct ProlongateArg {
22 const spin_mapper<fineSpin,coarseSpin> spin_map;
26 ProlongateArg(ColorSpinorField &
out,
const ColorSpinorField &
in,
const ColorSpinorField &
V,
27 const int *geo_map,
const int parity)
30 ProlongateArg(
const ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,order> &
arg)
38 template <
typename Float,
int fineSpin,
int coarseColor,
class Coarse,
typename S>
39 __device__ __host__
inline void prolongate(complex<Float>
out[fineSpin*coarseColor],
const Coarse &
in,
40 int parity,
int x_cb,
const int *geo_map,
const S& spin_map,
int fineVolumeCB) {
41 int x =
parity*fineVolumeCB + x_cb;
42 int x_coarse = geo_map[
x];
43 int parity_coarse = (x_coarse >=
in.VolumeCB()) ? 1 : 0;
44 int x_coarse_cb = x_coarse - parity_coarse*
in.VolumeCB();
47 for (
int s=0;
s<fineSpin;
s++) {
49 for (
int c=0;
c<coarseColor;
c++) {
50 out[
s*coarseColor+
c] =
in(parity_coarse, x_coarse_cb, spin_map(
s),
c);
59 template <
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
int fine_colors_per_thread,
60 class FineColor,
class Rotator>
61 __device__ __host__
inline void rotateFineColor(FineColor &
out,
const complex<Float>
in[fineSpin*coarseColor],
62 const Rotator &
V,
int parity,
int nParity,
int x_cb,
int fine_color_block) {
63 const int spinor_parity = (nParity == 2) ?
parity : 0;
64 const int v_parity = (
V.Nparity() == 2) ?
parity : 0;
66 constexpr
int color_unroll = 2;
69 for (
int s=0;
s<fineSpin;
s++)
71 for (
int fine_color_local=0; fine_color_local<fine_colors_per_thread; fine_color_local++)
72 out(spinor_parity, x_cb,
s, fine_color_block+fine_color_local) = 0.0;
75 for (
int s=0;
s<fineSpin;
s++) {
77 for (
int fine_color_local=0; fine_color_local<fine_colors_per_thread; fine_color_local++) {
78 int i = fine_color_block + fine_color_local;
80 complex<Float> partial[color_unroll];
82 for (
int k=0; k<color_unroll; k++) partial[k] = 0.0;
85 for (
int j=0; j<coarseColor; j+=color_unroll) {
88 for (
int k=0; k<color_unroll; k++)
89 partial[k] +=
V(v_parity, x_cb,
s,
i, j+k) *
in[
s*coarseColor + j + k];
93 for (
int k=0; k<color_unroll; k++)
out(spinor_parity, x_cb,
s,
i) += partial[k];
99 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
int fine_colors_per_thread,
typename Arg>
104 for (
int x_cb=0; x_cb<
arg.out.VolumeCB(); x_cb++) {
105 complex<Float>
tmp[fineSpin*coarseColor];
106 prolongate<Float,fineSpin,coarseColor>(
tmp,
arg.in,
parity, x_cb,
arg.geo_map,
arg.spin_map,
arg.out.VolumeCB());
107 for (
int fine_color_block=0; fine_color_block<fineColor; fine_color_block+=fine_colors_per_thread) {
108 rotateFineColor<Float,fineSpin,fineColor,coarseColor,fine_colors_per_thread>
115 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
int fine_colors_per_thread,
typename Arg>
116 __global__
void ProlongateKernel(Arg
arg) {
117 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
119 if (x_cb >=
arg.out.VolumeCB())
return;
121 int fine_color_block = (
blockDim.z*blockIdx.z + threadIdx.z) * fine_colors_per_thread;
122 if (fine_color_block >= fineColor)
return;
124 complex<Float>
tmp[fineSpin*coarseColor];
125 prolongate<Float,fineSpin,coarseColor>(
tmp,
arg.in,
parity, x_cb,
arg.geo_map,
arg.spin_map,
arg.out.VolumeCB());
126 rotateFineColor<Float,fineSpin,fineColor,coarseColor,fine_colors_per_thread>
130 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
int fine_colors_per_thread>
131 class ProlongateLaunch :
public TunableVectorYZ {
134 ColorSpinorField &
out;
135 const ColorSpinorField &
in;
136 const ColorSpinorField &
V;
137 const int *fine_to_coarse;
142 bool tuneGridDim()
const {
return false; }
143 unsigned int minThreads()
const {
return out.VolumeCB(); }
146 ProlongateLaunch(ColorSpinorField &
out,
const ColorSpinorField &
in,
const ColorSpinorField &
V,
147 const int *fine_to_coarse,
int parity)
148 : TunableVectorYZ(
out.SiteSubset(), fineColor/fine_colors_per_thread),
out(
out),
in(
in),
V(
V),
160 virtual ~ProlongateLaunch() { }
162 void apply(
const cudaStream_t &
stream) {
165 ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>
167 Prolongate<Float,fineSpin,fineColor,coarseSpin,coarseColor,fine_colors_per_thread>(
arg);
169 errorQuda(
"Unsupported field order %d",
out.FieldOrder());
174 ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_FLOAT2_FIELD_ORDER>
176 ProlongateKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,fine_colors_per_thread>
177 <<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(
arg);
179 errorQuda(
"Unsupported field order %d",
out.FieldOrder());
184 TuneKey tuneKey()
const {
return TuneKey(vol,
typeid(*this).name(), aux); }
186 long long flops()
const {
return 8 * fineSpin * fineColor * coarseColor *
out.SiteSubset()*
out.VolumeCB(); }
188 long long bytes()
const {
189 size_t v_bytes =
V.Bytes() / (
V.SiteSubset() ==
out.SiteSubset() ? 1 : 2);
190 return in.Bytes() +
out.Bytes() + v_bytes +
out.SiteSubset()*
out.VolumeCB()*
sizeof(
int);
195 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor>
196 void Prolongate(ColorSpinorField &
out,
const ColorSpinorField &
in,
const ColorSpinorField &v,
197 const int *fine_to_coarse,
int parity) {
200 constexpr
int fine_colors_per_thread = 1;
202 ProlongateLaunch<Float, fineSpin, fineColor, coarseSpin, coarseColor, fine_colors_per_thread>
204 prolongator.apply(0);
210 template <
typename Float,
int fineSpin>
211 void Prolongate(ColorSpinorField &
out,
const ColorSpinorField &
in,
const ColorSpinorField &v,
212 int nVec,
const int *fine_to_coarse,
const int *spin_map,
int parity) {
214 if (
in.Nspin() != 2)
errorQuda(
"Coarse spin %d is not supported",
in.Nspin());
215 const int coarseSpin = 2;
218 spin_mapper<fineSpin,coarseSpin> mapper;
219 for (
int s=0;
s<fineSpin;
s++)
220 if (mapper(
s) != spin_map[
s])
errorQuda(
"Spin map does not match spin_mapper");
222 if (
out.Ncolor() == 3) {
223 const int fineColor = 3;
225 Prolongate<Float,fineSpin,fineColor,coarseSpin,2>(
out,
in, v, fine_to_coarse,
parity);
226 }
else if (nVec == 4) {
227 Prolongate<Float,fineSpin,fineColor,coarseSpin,4>(
out,
in, v, fine_to_coarse,
parity);
228 }
else if (nVec == 24) {
229 Prolongate<Float,fineSpin,fineColor,coarseSpin,24>(
out,
in, v, fine_to_coarse,
parity);
230 }
else if (nVec == 32) {
231 Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse,
parity);
235 }
else if (
out.Ncolor() == 2) {
236 const int fineColor = 2;
238 Prolongate<Float,fineSpin,fineColor,coarseSpin,2>(
out,
in, v, fine_to_coarse,
parity);
239 }
else if (nVec == 4) {
240 Prolongate<Float,fineSpin,fineColor,coarseSpin,4>(
out,
in, v, fine_to_coarse,
parity);
244 }
else if (
out.Ncolor() == 24) {
245 const int fineColor = 24;
247 Prolongate<Float,fineSpin,fineColor,coarseSpin,24>(
out,
in, v, fine_to_coarse,
parity);
248 }
else if (nVec == 32) {
249 Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse,
parity);
253 }
else if (
out.Ncolor() == 32) {
254 const int fineColor = 32;
256 Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse,
parity);
265 template <
typename Float>
266 void Prolongate(ColorSpinorField &
out,
const ColorSpinorField &
in,
const ColorSpinorField &v,
267 int Nvec,
const int *fine_to_coarse,
const int *spin_map,
int parity) {
269 if (
out.Nspin() == 4) {
270 Prolongate<Float,4>(
out,
in, v, Nvec, fine_to_coarse, spin_map,
parity);
271 }
else if (
out.Nspin() == 2) {
272 Prolongate<Float,2>(
out,
in, v, Nvec, fine_to_coarse, spin_map,
parity);
273 #ifdef GPU_STAGGERED_DIRAC 274 }
else if (
out.Nspin() == 1) {
275 Prolongate<Float,1>(
out,
in, v, Nvec, fine_to_coarse, spin_map,
parity);
282 #endif // GPU_MULTIGRID 285 int Nvec,
const int *fine_to_coarse,
const int *spin_map,
int parity) {
288 errorQuda(
"Field orders do not match (out=%d, in=%d, v=%d)",
294 #ifdef GPU_MULTIGRID_DOUBLE 295 Prolongate<double>(
out,
in, v, Nvec, fine_to_coarse, spin_map,
parity);
297 errorQuda(
"Double precision multigrid has not been enabled");
300 Prolongate<float>(
out,
in, v, Nvec, fine_to_coarse, spin_map,
parity);
307 errorQuda(
"Multigrid has not been built");
enum QudaPrecision_s QudaPrecision
QudaVerbosity getVerbosity()
#define checkPrecision(...)
cudaColorSpinorField * tmp
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
This is just a dummy structure we use for trove to define the required structure size.
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
static const int volume_n
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaFieldOrder FieldOrder() const
void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the prolongation operator.