/**
   Parameter struct for the restriction (fine-to-coarse) operator.
   Bundles the coarse output field, the fine input field and the packed
   null-space vectors V together with the fine<->coarse site lookup
   tables and the fine-spin -> coarse-spin mapping.
*/
template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, QudaFieldOrder order>
struct RestrictArg {
  // NOTE(review): the accessor member declarations for out/in/V were not
  // visible in the garbled source; these follow the usual QUDA FieldOrderCB
  // convention -- confirm against the original file.
  FieldOrderCB<Float,coarseSpin,coarseColor,1,order> out;            // coarse output field
  const FieldOrderCB<Float,fineSpin,fineColor,1,order> in;           // fine input field
  const FieldOrderCB<Float,fineSpin,fineColor,coarseColor,order> V;  // packed null-space vectors
  const int *fine_to_coarse;  // map: full fine site index -> full coarse site index
  const int *coarse_to_fine;  // map: (coarse block, parity, intra-block id) -> full fine site index
  const spin_mapper<fineSpin,coarseSpin> spin_map; // fine-spin -> coarse-spin aggregation map
  const int parity;   // parity of the fine field when it is single-parity
  const int nParity;  // number of parities in the fine input field (1 or 2)
  int swizzle;        // blockIdx.x swizzle factor (set from the tuned aux parameter)

  RestrictArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
              const int *fine_to_coarse, const int *coarse_to_fine, int parity)
    : out(out), in(in), V(V), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
      spin_map(), parity(parity), nParity(in.SiteSubset()), swizzle(1)
  { }

  // copy constructor: needed so the arg can be passed by value to the kernel
  RestrictArg(const RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,order> &arg)
    : out(arg.out), in(arg.in), V(arg.V),
      fine_to_coarse(arg.fine_to_coarse), coarse_to_fine(arg.coarse_to_fine), spin_map(),
      parity(arg.parity), nParity(arg.nParity), swizzle(arg.swizzle)
  { }
};
/**
   Rotate a single fine-grid site into the coarse color basis.

   For each fine spin s and each of the coarse_colors_per_thread coarse
   colors in this thread's block, computes out[s][i] = sum_j V^dagger(s,j,i) * in(s,j),
   i.e. the projection of the fine-site spinor onto null-space vector i.

   @param out   Output accumulator, indexed [s*coarse_colors_per_thread + local]
   @param in    Fine-grid input field accessor
   @param V     Null-space (rotation) vector accessor
   @param parity Fine-grid parity being processed
   @param nParity Number of parities in the fine input field
   @param x_cb  Checkerboarded fine-site index
   @param coarse_color_block First coarse color handled by this thread
*/
template <typename Float, int fineSpin, int fineColor, int coarseColor, int coarse_colors_per_thread,
          class FineColor, class Rotator>
__device__ __host__ inline void rotateCoarseColor(complex<Float> out[fineSpin*coarse_colors_per_thread],
                                                  const FineColor &in, const Rotator &V,
                                                  int parity, int nParity, int x_cb, int coarse_color_block) {
  // single-parity fields are indexed with parity 0
  const int spinor_parity = (nParity == 2) ? parity : 0;
  const int v_parity = (V.Nparity() == 2) ? parity : 0;

#pragma unroll
  for (int s=0; s<fineSpin; s++)
#pragma unroll
    for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
      out[s*coarse_colors_per_thread+coarse_color_local] = 0.0;
    }

#pragma unroll
  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
    int i = coarse_color_block + coarse_color_local; // global coarse color index

#pragma unroll
    for (int s=0; s<fineSpin; s++) {

      // unroll the fine-color sum in chunks: 3 when fineColor==3, else 2
      constexpr int color_unroll = fineColor == 3 ? 3 : 2;

      // partial sums kept in registers to expose instruction-level parallelism
      complex<Float> partial[color_unroll];
#pragma unroll
      for (int k=0; k<color_unroll; k++) partial[k] = 0.0;

#pragma unroll
      for (int j=0; j<fineColor; j+=color_unroll) {
#pragma unroll
        for (int k=0; k<color_unroll; k++)
          partial[k] += conj(V(v_parity, x_cb, s, j+k, i)) * in(spinor_parity, x_cb, s, j+k);
      }

#pragma unroll
      for (int k=0; k<color_unroll; k++) out[s*coarse_colors_per_thread + coarse_color_local] += partial[k];
    }
  }
}
/**
   CPU reference implementation of the restriction operator: projects the
   fine field onto the null-space vectors and accumulates the result on
   the coarse grid, aggregating fine spins via arg.spin_map.
*/
template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
          int coarse_colors_per_thread, typename Arg>
void Restrict(Arg arg) {

  // this is a reduction into the coarse field, so zero it first
  for (int parity_coarse=0; parity_coarse<2; parity_coarse++)
    for (int x_coarse_cb=0; x_coarse_cb<arg.out.VolumeCB(); x_coarse_cb++)
      for (int s=0; s<coarseSpin; s++)
        for (int c=0; c<coarseColor; c++)
          arg.out(parity_coarse, x_coarse_cb, s, c) = 0.0;

  // NOTE(review): the parity loop was reconstructed -- when the fine field
  // is single parity we process only arg.parity; confirm against original.
  for (int parity=0; parity<arg.nParity; parity++) {
    parity = (arg.nParity == 2) ? parity : arg.parity;

    for (int x_cb=0; x_cb<arg.in.VolumeCB(); x_cb++) {
      // full fine-site index used for the fine->coarse lookup
      int x = parity*arg.in.VolumeCB() + x_cb;
      int x_coarse = arg.fine_to_coarse[x];
      int parity_coarse = (x_coarse >= arg.out.VolumeCB()) ? 1 : 0;
      int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();

      for (int coarse_color_block=0; coarse_color_block<coarseColor; coarse_color_block+=coarse_colors_per_thread) {
        complex<Float> tmp[fineSpin*coarse_colors_per_thread];
        rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
          (tmp, arg.in, arg.V, parity, arg.nParity, x_cb, coarse_color_block);

        // accumulate onto the coarse site, collapsing fine spin to coarse spin
        for (int s=0; s<fineSpin; s++) {
          for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
            int c = coarse_color_block + coarse_color_local;
            arg.out(parity_coarse,x_coarse_cb,arg.spin_map(s),c) += tmp[s*coarse_colors_per_thread+coarse_color_local];
          }
        }
      }
    }
  }
}
/**
   Minimal fixed-size vector wrapper, usable on host and device.
   Elements are zero-initialized on construction so instances can be
   used directly as reduction accumulators (e.g. with cub::BlockReduce).
*/
template <typename scalar, int n>
struct vector_type {
  scalar data[n]; // element storage
  __device__ __host__ inline scalar& operator[](int i) { return data[i]; }
  __device__ __host__ inline const scalar& operator[](int i) const { return data[i]; }
  __device__ __host__ inline static constexpr int size() { return n; }
  // default constructor zeroes all elements
  __device__ __host__ vector_type() { for (int i=0; i<n; i++) data[i] = 0.0; }
};
/**
   Element-wise addition functor for vector-like types exposing
   operator[] and size(); used as the reduction operator passed to
   cub::BlockReduce in RestrictKernel.
*/
template <typename T>
struct reduce {
  __device__ __host__ inline T operator()(const T &a, const T &b) {
    // NOTE(review): body reconstructed -- not visible in the garbled source
    T sum;
    for (int i = 0; i < sum.size(); i++) sum[i] = a[i] + b[i];
    return sum;
  }
};
/**
   GPU restriction kernel.

   Launch layout: one x-block per full coarse site (both coarse parities),
   threadIdx.x = fine site within the coarse block, threadIdx.y = fine
   parity (when nParity==2), threadIdx.z/blockIdx.z = coarse-color block.
   block_size must equal the number of fine checkerboard sites per coarse
   site so the cub::BlockReduce below covers exactly one coarse site.
*/
template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
          int coarse_colors_per_thread, typename Arg, int block_size>
__global__ void RestrictKernel(Arg arg) {

#ifdef SWIZZLE
  // portion of the grid that is exactly divisible by the swizzle factor
  const int gridp = gridDim.x - gridDim.x % arg.swizzle;

  int x_coarse = blockIdx.x;
  if (blockIdx.x < gridp) {
    // this portion of the grid is transposed to improve L2 locality
    const int i = blockIdx.x % arg.swizzle;
    const int j = blockIdx.x / arg.swizzle;

    // transpose the coordinates
    x_coarse = i * (gridp / arg.swizzle) + j;
  }
#else
  int x_coarse = blockIdx.x;
#endif

  int parity_coarse = x_coarse >= arg.out.VolumeCB() ? 1 : 0;
  int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();

  // look up the fine site this thread owns; the coarse_to_fine table is
  // assumed ordered as (coarse block id, fine parity, fine point id)
  int parity = arg.nParity == 2 ? threadIdx.y : arg.parity;
  int x_fine = arg.coarse_to_fine[ (x_coarse*2 + parity) * blockDim.x + threadIdx.x];
  int x_fine_cb = x_fine - parity*arg.in.VolumeCB();

  int coarse_color_block = (blockDim.z*blockIdx.z + threadIdx.z) * coarse_colors_per_thread;
  if (coarse_color_block >= coarseColor) return;

  complex<Float> tmp[fineSpin*coarse_colors_per_thread];
  rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
    (tmp, arg.in, arg.V, parity, arg.nParity, x_fine_cb, coarse_color_block);

  typedef vector_type<complex<Float>, coarseSpin*coarse_colors_per_thread> vector;
  vector reduced; // zero-initialized by vector_type's constructor

  // first coarsen spin locally within this thread
  for (int s=0; s<fineSpin; s++) {
    for (int v=0; v<coarse_colors_per_thread; v++) {
      reduced[arg.spin_map(s)*coarse_colors_per_thread+v] += tmp[s*coarse_colors_per_thread+v];
    }
  }

  // now coarsen geometry across the thread block (both parities when present)
  if (arg.nParity == 2) {
    typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    reduce<vector> reducer;

    // NOTE(review): this reduction covers the x/y dimensions only -- it is
    // not safe for blockDim.z > 1 sharing the same temp_storage
    reduced = BlockReduce(temp_storage).Reduce(reduced, reducer);
  } else {
    typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    reduce<vector> reducer;

    reduced = BlockReduce(temp_storage).Reduce(reduced, reducer);
  }

  // thread (0,0) holds the block-reduced result; write it out
  if (threadIdx.x==0 && threadIdx.y == 0) {
    for (int s=0; s<coarseSpin; s++) {
      for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
        int v = coarse_color_block + coarse_color_local;
        arg.out(parity_coarse, x_coarse_cb, s, v) = reduced[s*coarse_colors_per_thread+coarse_color_local];
      }
    }
  }
}
/**
   Autotuned launcher for the restriction operator.  Dispatches to the
   CPU reference implementation or to RestrictKernel instantiated at the
   compile-time block size matching the fine/coarse aggregate size.
*/
template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
          int coarse_colors_per_thread>
class RestrictLaunch : public Tunable {

protected:
  ColorSpinorField &out;
  const ColorSpinorField &in;
  const ColorSpinorField &v;
  const int *fine_to_coarse;
  const int *coarse_to_fine;
  const int parity;
  QudaFieldLocation location;
  const int block_size; // number of fine checkerboard sites per coarse site
  char vol[TuneKey::volume_n];

  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
  bool tuneGridDim() const { return false; } // don't tune the grid dimension
  bool tuneAuxDim() const { return true; }   // aux.x is the swizzle factor
  unsigned int minThreads() const { return in.VolumeCB(); } // fine parity is the block-y dimension

public:
  RestrictLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
                 const int *fine_to_coarse, const int *coarse_to_fine, int parity)
    : out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
      parity(parity), location(checkLocation(out,in,v)),
      block_size(in.VolumeCB()/(2*out.VolumeCB())) // fine CB sites per full coarse site
  {
    // NOTE(review): tuning-key strings reconstructed from fragments -- confirm
    strcpy(vol, out.VolString());
    strcat(vol, ",");
    strcat(vol, in.VolString());

    strcpy(aux, out.AuxString());
    strcat(aux, ",");
    strcat(aux, in.AuxString());
  }

  virtual ~RestrictLaunch() { }

  void apply(const cudaStream_t &stream) {
    if (location == QUDA_CPU_FIELD_LOCATION) {
      if (out.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
        RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>
          arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
        Restrict<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread>(arg);
      } else {
        errorQuda("Unsupported field order %d", out.FieldOrder());
      }
    } else {
      if (out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        typedef RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_FLOAT2_FIELD_ORDER> Arg;
        Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
        arg.swizzle = tp.aux.x;

        // block_size is a runtime value but the kernel needs it at compile
        // time (cub::BlockReduce), so dispatch over the instantiated sizes
        if (block_size == 4) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,4>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 8) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,8>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 16) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,16>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 27) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,27>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 36) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,36>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 54) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,54>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 64) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,64>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 100) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,100>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 108) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,108>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 128) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,128>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#if __COMPUTE_CAPABILITY__ >= 300
        } else if (block_size == 200) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,200>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 256) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,256>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 432) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,432>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (block_size == 500) {
          RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,500>
            <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#endif
        } else {
          errorQuda("Block size %d not instantiated", block_size);
        }
      } else {
        errorQuda("Unsupported field order %d", out.FieldOrder());
      }
    }
  }

  // advance the amount of coarse-color splitting between threadIdx.z
  // and the z grid dimension
  bool advanceBlockDim(TuneParam &param) const
  {
    while (param.block.z <= coarseColor/coarse_colors_per_thread) {
      param.block.z++;
      if ( (coarseColor/coarse_colors_per_thread) % param.block.z == 0) {
        param.grid.z = (coarseColor/coarse_colors_per_thread) / param.block.z;
        break;
      }
    }

    if (param.block.z <= (coarseColor/coarse_colors_per_thread) ) {
      return true;
    } else { // ran off the end, so reset to the default split
      param.block.z = 1;
      param.grid.z = coarseColor/coarse_colors_per_thread;
      return false;
    }
  }

  int tuningIter() const { return 3; }

  // tune over the swizzle factor (aux.x) used to remap blockIdx.x
  bool advanceAux(TuneParam &param) const
  {
#ifdef SWIZZLE
    if (param.aux.x < 2*deviceProp.multiProcessorCount) {
      param.aux.x++;
      return true;
    } else {
      param.aux.x = 1;
      return false;
    }
#else
    return false;
#endif
  }

  bool advanceTuneParam(TuneParam &param) const { return advanceSharedBytes(param) || advanceAux(param); }

  TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }

  void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }

  /** sets default values for when tuning is disabled */
  void defaultTuneParam(TuneParam &param) const {
    param.block = dim3(block_size, in.SiteSubset(), 1);
    param.grid = dim3( (minThreads()+param.block.x-1) / param.block.x, 1, 1);
    param.shared_bytes = 0;

    param.block.z = 1;
    param.grid.z = coarseColor / coarse_colors_per_thread;
  }

  long long flops() const { return 8 * fineSpin * fineColor * coarseColor * in.SiteSubset()*in.VolumeCB(); }

  long long bytes() const {
    // v may be stored with a different site subset than the input field
    size_t v_bytes = v.Bytes() / (v.SiteSubset() == in.SiteSubset() ? 1 : 2);
    return in.Bytes() + out.Bytes() + v_bytes + in.SiteSubset()*in.VolumeCB()*sizeof(int);
  }
};
/**
   Fully-instantiated entry point: chooses the coarse-colors-per-thread
   strategy and launches the restrictor.
*/
template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor>
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
              const int *fine_to_coarse, const int *coarse_to_fine, int parity) {

  // for fine grids (Nc=3) we have more fine-site parallelism, so we can
  // afford a wider coarse-color split per thread when it divides evenly
  constexpr int coarse_colors_per_thread = fineColor != 3 ? 2 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;

  RestrictLaunch<Float, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
    restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
  restrictor.apply(0); // launch on the default stream
}
/**
   Dispatch over the fine-color count and number of null-space vectors,
   after validating that the supplied spin_map matches the compile-time
   spin_mapper.
*/
template <typename Float, int fineSpin>
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
              int nVec, const int *fine_to_coarse, const int *coarse_to_fine,
              const int *spin_map, int parity) {

  const int coarseSpin = 2;

  // sanity check: the runtime spin map must agree with the static mapper
  spin_mapper<fineSpin,coarseSpin> mapper;
  for (int s=0; s<fineSpin; s++)
    if (mapper(s) != spin_map[s]) errorQuda("Spin map does not match spin_mapper");

  if (in.Ncolor() == 3) {
    const int fineColor = 3;
    if (nVec == 2) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else if (nVec == 4) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else if (nVec == 24) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else if (nVec == 32) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else {
      errorQuda("Unsupported nVec %d", nVec);
    }
  } else if (in.Ncolor() == 2) {
    const int fineColor = 2;
    if (nVec == 2) { // for testing coarsening of coarse operator
      Restrict<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else if (nVec == 4) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else {
      errorQuda("Unsupported nVec %d", nVec);
    }
  } else if (in.Ncolor() == 24) {
    const int fineColor = 24;
    if (nVec == 24) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else if (nVec == 32) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else {
      errorQuda("Unsupported nVec %d", nVec);
    }
  } else if (in.Ncolor() == 32) {
    const int fineColor = 32;
    if (nVec == 32) {
      Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
    } else {
      errorQuda("Unsupported nVec %d", nVec);
    }
  } else {
    errorQuda("Unsupported nColor %d", in.Ncolor());
  }
}
/**
   Dispatch over the fine-field spin count.  Nspin==1 (staggered) support
   is only compiled in when GPU_STAGGERED_DIRAC is enabled.
*/
template <typename Float>
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
              int Nvec, const int *fine_to_coarse, const int *coarse_to_fine,
              const int *spin_map, int parity) {

  if (in.Nspin() == 4) {
    Restrict<Float,4>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
  } else if (in.Nspin() == 2) {
    Restrict<Float,2>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
#if GPU_STAGGERED_DIRAC
  } else if (in.Nspin() == 1) {
    Restrict<Float,1>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
#endif
  } else {
    errorQuda("Unsupported nSpin %d", in.Nspin());
  }
}
#endif // GPU_MULTIGRID

/**
   Public entry point for the restriction operator: validates field-order
   consistency and dispatches on precision.  Errors out when multigrid
   (or double-precision multigrid) support was not compiled in.
*/
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
              int Nvec, const int *fine_to_coarse, const int *coarse_to_fine,
              const int *spin_map, int parity) {
#ifdef GPU_MULTIGRID
  if (out.FieldOrder() != in.FieldOrder() || out.FieldOrder() != v.FieldOrder())
    errorQuda("Field orders do not match (out=%d, in=%d, v=%d)",
              out.FieldOrder(), in.FieldOrder(), v.FieldOrder());

  QudaPrecision precision = checkPrecision(out, in, v);

  if (precision == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
    Restrict<double>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
#else
    errorQuda("Double precision multigrid has not been enabled");
#endif
  } else if (precision == QUDA_SINGLE_PRECISION) {
    Restrict<float>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
  } else {
    errorQuda("Unsupported precision %d", out.Precision());
  }
#else
  errorQuda("Multigrid has not been built");
#endif
}
enum QudaPrecision_s QudaPrecision
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
#define checkPrecision(...)
cudaColorSpinorField * tmp
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
__host__ __device__ void sum(double &a, double &b)
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the restriction operator.
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType conj(ValueType x)
static const int volume_n
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaFieldOrder FieldOrder() const