QUDA  1.0.0
restrictor.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
2 #include <tune_quda.h>
3 #include <typeinfo>
4 #include <launch_kernel.cuh>
5 
6 #include <jitify_helper.cuh>
7 #include <kernels/restrictor.cuh>
8 
9 namespace quda {
10 
11 #ifdef GPU_MULTIGRID
12 
13  template <typename Float, typename vFloat, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
14  int coarse_colors_per_thread>
15  class RestrictLaunch : public Tunable {
16 
17  protected:
19  const ColorSpinorField &in;
20  const ColorSpinorField &v;
21  const int *fine_to_coarse;
22  const int *coarse_to_fine;
23  const int parity;
24  const QudaFieldLocation location;
25  const int block_size;
26  char vol[TuneKey::volume_n];
27 
28  unsigned int sharedBytesPerThread() const { return 0; }
29  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
30  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
31  bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
32  unsigned int minThreads() const { return in.VolumeCB(); } // fine parity is the block y dimension
33 
34  public:
35  RestrictLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
36  const int *fine_to_coarse, const int *coarse_to_fine, int parity)
37  : out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
38  parity(parity), location(checkLocation(out,in,v)), block_size(in.VolumeCB()/(2*out.VolumeCB()))
39  {
41 #ifdef JITIFY
42  create_jitify_program("kernels/restrictor.cuh");
43 #endif
44  }
45  strcpy(aux, compile_type_str(in));
46  strcat(aux, out.AuxString());
47  strcat(aux, ",");
48  strcat(aux, in.AuxString());
49 
50  strcpy(vol, out.VolString());
51  strcat(vol, ",");
52  strcat(vol, in.VolString());
53  } // block size is checkerboard fine length / full coarse length
54  virtual ~RestrictLaunch() { }
55 
56  void apply(const cudaStream_t &stream) {
57  if (location == QUDA_CPU_FIELD_LOCATION) {
60  arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
61  Restrict<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread>(arg);
62  } else {
63  errorQuda("Unsupported field order %d", out.FieldOrder());
64  }
65  } else {
66  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
67 
68  if (out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
70  Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
71  arg.swizzle = tp.aux.x;
72 
73 #ifdef JITIFY
74  using namespace jitify::reflection;
75  jitify_error = program->kernel("quda::RestrictKernel")
76  .instantiate((int)tp.block.x,Type<Float>(),fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Type<Arg>())
77  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
78 #else
79  LAUNCH_KERNEL_MG_BLOCK_SIZE(RestrictKernel,tp,stream,arg,Float,fineSpin,fineColor,
80  coarseSpin,coarseColor,coarse_colors_per_thread,Arg);
81 #endif
82  } else {
83  errorQuda("Unsupported field order %d", out.FieldOrder());
84  }
85  }
86  }
87 
88  // This block tuning tunes for the optimal amount of color
89  // splitting between blockDim.z and gridDim.z. However, enabling
90  // blockDim.z > 1 gives incorrect results due to cub reductions
91  // being unable to do independent sliced reductions along
92  // blockDim.z. So for now we only split between colors per thread
93  // and grid.z.
94  bool advanceBlockDim(TuneParam &param) const
95  {
96  // let's try to advance spin/block-color
97  while(param.block.z <= coarseColor/coarse_colors_per_thread) {
98  param.block.z++;
99  if ( (coarseColor/coarse_colors_per_thread) % param.block.z == 0) {
100  param.grid.z = (coarseColor/coarse_colors_per_thread) / param.block.z;
101  break;
102  }
103  }
104 
105  // we can advance spin/block-color since this is valid
106  if (param.block.z <= (coarseColor/coarse_colors_per_thread) ) { //
107  return true;
108  } else { // we have run off the end so let's reset
109  param.block.z = 1;
110  param.grid.z = coarseColor/coarse_colors_per_thread;
111  return false;
112  }
113  }
114 
115  int tuningIter() const { return 3; }
116 
117  bool advanceAux(TuneParam &param) const
118  {
119 #ifdef SWIZZLE
120  if (param.aux.x < 2*deviceProp.multiProcessorCount) {
121  param.aux.x++;
122  return true;
123  } else {
124  param.aux.x = 1;
125  return false;
126  }
127 #else
128  return false;
129 #endif
130  }
131 
132  // only tune shared memory per thread (disable tuning for block.z for now)
133  bool advanceTuneParam(TuneParam &param) const { return advanceSharedBytes(param) || advanceAux(param); }
134 
135  TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
136 
137  void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }
138 
140  void defaultTuneParam(TuneParam &param) const {
141  param.block = dim3(block_size, in.SiteSubset(), 1);
142  param.grid = dim3( (minThreads()+param.block.x-1) / param.block.x, 1, 1);
143  param.shared_bytes = 0;
144 
145  param.block.z = 1;
146  param.grid.z = coarseColor / coarse_colors_per_thread;
147  param.aux.x = 1; // swizzle factor
148  }
149 
150  long long flops() const { return 8 * fineSpin * fineColor * coarseColor * in.SiteSubset()*(long long)in.VolumeCB(); }
151 
152  long long bytes() const {
153  size_t v_bytes = v.Bytes() / (v.SiteSubset() == in.SiteSubset() ? 1 : 2);
154  return in.Bytes() + out.Bytes() + v_bytes + in.SiteSubset()*in.VolumeCB()*sizeof(int);
155  }
156 
157  };
158 
159  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor>
160  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
161  const int *fine_to_coarse, const int *coarse_to_fine, int parity) {
162 
163  // for fine grids (Nc=3) have more parallelism so can use more coarse strategy
164  constexpr int coarse_colors_per_thread = fineColor != 3 ? 2 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;
165  //coarseColor >= 8 && coarseColor % 8 == 0 ? 8 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;
166 
167  if (v.Precision() == QUDA_HALF_PRECISION) {
168 #if QUDA_PRECISION & 2
169  RestrictLaunch<Float, short, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
170  restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
171  restrictor.apply(0);
172 #else
173  errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
174 #endif
175  } else if (v.Precision() == in.Precision()) {
176  RestrictLaunch<Float, Float, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
177  restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
178  restrictor.apply(0);
179  } else {
180  errorQuda("Unsupported V precision %d", v.Precision());
181  }
182 
184  }
185 
186  template <typename Float, int fineSpin>
187  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
188  int nVec, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) {
189 
190  if (out.Nspin() != 2) errorQuda("Unsupported nSpin %d", out.Nspin());
191  const int coarseSpin = 2;
192 
193  // first check that the spin_map matches the spin_mapper
195  for (int s=0; s<fineSpin; s++)
196  for (int p=0; p<2; p++)
197  if (mapper(s,p) != spin_map[s][p]) errorQuda("Spin map does not match spin_mapper");
198 
199 
200  // Template over fine color
201  if (in.Ncolor() == 3) { // standard QCD
202  const int fineColor = 3;
203  if (nVec == 4) {
204  Restrict<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
205  } else if (nVec == 6) { // free field Wilson
206  Restrict<Float,fineSpin,fineColor,coarseSpin,6>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
207  } else if (nVec == 24) {
208  Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
209  } else if (nVec == 32) {
210  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
211  } else {
212  errorQuda("Unsupported nVec %d", nVec);
213  }
214  } else if (in.Ncolor() == 6) { // Coarsen coarsened Wilson free field
215  const int fineColor = 6;
216  if (nVec == 6) {
217  Restrict<Float,fineSpin,fineColor,coarseSpin,6>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
218  } else {
219  errorQuda("Unsupported nVec %d", nVec);
220  }
221  } else if (in.Ncolor() == 24) { // to keep compilation under control coarse grids have same or more colors
222  const int fineColor = 24;
223  if (nVec == 24) {
224  Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
225  } else if (nVec == 32) {
226  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
227  } else {
228  errorQuda("Unsupported nVec %d", nVec);
229  }
230  } else if (in.Ncolor() == 32) {
231  const int fineColor = 32;
232  if (nVec == 32) {
233  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
234  } else {
235  errorQuda("Unsupported nVec %d", nVec);
236  }
237  } else {
238  errorQuda("Unsupported nColor %d", in.Ncolor());
239  }
240  }
241 
242  template <typename Float>
243  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
244  int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) {
245 
246  if (in.Nspin() == 2) {
247  Restrict<Float,2>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
248 #ifdef GPU_WILSON_DIRAC
249  } else if (in.Nspin() == 4) {
250  Restrict<Float,4>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
251 #endif
252 #if GPU_STAGGERED_DIRAC
253  } else if (in.Nspin() == 1) {
254  Restrict<Float,1>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
255 #endif
256  } else {
257  errorQuda("Unsupported nSpin %d", in.Nspin());
258  }
259  }
260 
261 #endif // GPU_MULTIGRID
262 
264  int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) {
265 
266 #ifdef GPU_MULTIGRID
267  if (out.FieldOrder() != in.FieldOrder() || out.FieldOrder() != v.FieldOrder())
268  errorQuda("Field orders do not match (out=%d, in=%d, v=%d)",
269  out.FieldOrder(), in.FieldOrder(), v.FieldOrder());
270 
271  QudaPrecision precision = checkPrecision(out, in);
272 
273  if (precision == QUDA_DOUBLE_PRECISION) {
274 #ifdef GPU_MULTIGRID_DOUBLE
275  Restrict<double>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
276 #else
277  errorQuda("Double precision multigrid has not been enabled");
278 #endif
279  } else if (precision == QUDA_SINGLE_PRECISION) {
280  Restrict<float>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
281  } else {
282  errorQuda("Unsupported precision %d", out.Precision());
283  }
284 #else
285  errorQuda("Multigrid has not been built");
286 #endif
287  }
288 
289 } // namespace quda
enum QudaPrecision_s QudaPrecision
const char * AuxString() const
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define checkPrecision(...)
__global__ void RestrictKernel(Arg arg)
Definition: restrictor.cuh:136
#define errorQuda(...)
Definition: util_quda.h:121
Helper file when using jitify run-time compilation. This file should be included in source code...
cudaStream_t * stream
const char * VolString() const
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxiliary string.
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
#define checkLocation(...)
void Restrict(Arg arg)
Definition: restrictor.cuh:90
QudaFieldLocation Location() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__shared__ float s[]
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define checkCudaError()
Definition: util_quda.h:161
static const int volume_n
Definition: tune_key.h:10
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:54
#define LAUNCH_KERNEL_MG_BLOCK_SIZE(kernel, tp, stream, arg,...)
QudaFieldOrder FieldOrder() const
unsigned long long bytes
Definition: blas_quda.cu:23