13 template <
typename Float,
typename vFloat,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
14 int coarse_colors_per_thread>
15 class RestrictLaunch :
public Tunable {
21 const int *fine_to_coarse;
22 const int *coarse_to_fine;
28 unsigned int sharedBytesPerThread()
const {
return 0; }
29 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
30 bool tuneGridDim()
const {
return false; }
31 bool tuneAuxDim()
const {
return true; }
32 unsigned int minThreads()
const {
return in.
VolumeCB(); }
36 const int *fine_to_coarse,
const int *coarse_to_fine,
int parity)
37 : out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
38 parity(parity), location(
checkLocation(out,in,v)), block_size(in.VolumeCB()/(2*out.VolumeCB()))
42 create_jitify_program(
"kernels/restrictor.cuh");
54 virtual ~RestrictLaunch() { }
56 void apply(
const cudaStream_t &
stream) {
60 arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
61 Restrict<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread>(
arg);
70 Arg
arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
74 using namespace jitify::reflection;
75 jitify_error = program->kernel(
"quda::RestrictKernel")
76 .instantiate((
int)tp.
block.x,Type<Float>(),fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Type<Arg>())
80 coarseSpin,coarseColor,coarse_colors_per_thread,Arg);
97 while(param.
block.z <= coarseColor/coarse_colors_per_thread) {
99 if ( (coarseColor/coarse_colors_per_thread) % param.
block.z == 0) {
100 param.
grid.z = (coarseColor/coarse_colors_per_thread) / param.
block.z;
106 if (param.
block.z <= (coarseColor/coarse_colors_per_thread) ) {
110 param.
grid.z = coarseColor/coarse_colors_per_thread;
115 int tuningIter()
const {
return 3; }
133 bool advanceTuneParam(
TuneParam ¶m)
const {
return advanceSharedBytes(param) || advanceAux(param); }
135 TuneKey tuneKey()
const {
return TuneKey(vol,
typeid(*this).name(), aux); }
137 void initTuneParam(
TuneParam ¶m)
const { defaultTuneParam(param); }
140 void defaultTuneParam(
TuneParam ¶m)
const {
142 param.
grid = dim3( (minThreads()+param.
block.x-1) / param.
block.x, 1, 1);
146 param.
grid.z = coarseColor / coarse_colors_per_thread;
150 long long flops()
const {
return 8 * fineSpin * fineColor * coarseColor * in.
SiteSubset()*(
long long)in.
VolumeCB(); }
152 long long bytes()
const {
159 template <
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor>
161 const int *fine_to_coarse,
const int *coarse_to_fine,
int parity) {
164 constexpr
int coarse_colors_per_thread = fineColor != 3 ? 2 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;
168 #if QUDA_PRECISION & 2 169 RestrictLaunch<Float, short, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
170 restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
173 errorQuda(
"QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
176 RestrictLaunch<Float, Float, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
177 restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
186 template <
typename Float,
int fineSpin>
188 int nVec,
const int *fine_to_coarse,
const int *coarse_to_fine,
const int *
const * spin_map,
int parity) {
191 const int coarseSpin = 2;
195 for (
int s=0;
s<fineSpin;
s++)
196 for (
int p=0; p<2; p++)
197 if (mapper(
s,p) != spin_map[
s][p])
errorQuda(
"Spin map does not match spin_mapper");
202 const int fineColor = 3;
204 Restrict<Float,fineSpin,fineColor,coarseSpin,4>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
205 }
else if (nVec == 6) {
206 Restrict<Float,fineSpin,fineColor,coarseSpin,6>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
207 }
else if (nVec == 24) {
208 Restrict<Float,fineSpin,fineColor,coarseSpin,24>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
209 }
else if (nVec == 32) {
210 Restrict<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
214 }
else if (in.
Ncolor() == 6) {
215 const int fineColor = 6;
217 Restrict<Float,fineSpin,fineColor,coarseSpin,6>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
221 }
else if (in.
Ncolor() == 24) {
222 const int fineColor = 24;
224 Restrict<Float,fineSpin,fineColor,coarseSpin,24>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
225 }
else if (nVec == 32) {
226 Restrict<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
230 }
else if (in.
Ncolor() == 32) {
231 const int fineColor = 32;
233 Restrict<Float,fineSpin,fineColor,coarseSpin,32>(
out,
in, v, fine_to_coarse, coarse_to_fine,
parity);
242 template <
typename Float>
244 int Nvec,
const int *fine_to_coarse,
const int *coarse_to_fine,
const int *
const * spin_map,
int parity) {
246 if (in.
Nspin() == 2) {
247 Restrict<Float,2>(
out,
in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map,
parity);
248 #ifdef GPU_WILSON_DIRAC 249 }
else if (in.
Nspin() == 4) {
250 Restrict<Float,4>(
out,
in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map,
parity);
252 #if GPU_STAGGERED_DIRAC 253 }
else if (in.
Nspin() == 1) {
254 Restrict<Float,1>(
out,
in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map,
parity);
261 #endif // GPU_MULTIGRID 264 int Nvec,
const int *fine_to_coarse,
const int *coarse_to_fine,
const int *
const * spin_map,
int parity) {
268 errorQuda(
"Field orders do not match (out=%d, in=%d, v=%d)",
274 #ifdef GPU_MULTIGRID_DOUBLE 275 Restrict<double>(
out,
in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map,
parity);
277 errorQuda(
"Double precision multigrid has not been enabled");
280 Restrict<float>(
out,
in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map,
parity);
285 errorQuda(
"Multigrid has not been built");
enum QudaPrecision_s QudaPrecision
const char * AuxString() const
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
#define checkPrecision(...)
__global__ void RestrictKernel(Arg arg)
Helper file when using jitify run-time compilation. This file should be included in source code...
const char * VolString() const
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxilary string.
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
QudaFieldLocation Location() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
static const int volume_n
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
#define LAUNCH_KERNEL_MG_BLOCK_SIZE(kernel, tp, stream, arg,...)
QudaFieldOrder FieldOrder() const