QUDA  0.9.0
tune_quda.h
Go to the documentation of this file.
1 #ifndef _TUNE_QUDA_H
2 #define _TUNE_QUDA_H
3 
4 #include <quda_internal.h>
5 #include <dirac_quda.h>
6 
7 #include <string>
8 #include <iostream>
9 #include <iomanip>
10 #include <cstring>
11 #include <cfloat>
12 #include <stdarg.h>
13 #include <tune_key.h>
14 
15 namespace quda {
16 
17  class TuneParam {
18 
19  public:
20  dim3 block;
21  dim3 grid;
23  int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters
24 
25  std::string comment;
26  float time;
27  long long n_calls;
28 
29  inline TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0), aux(), time(FLT_MAX), n_calls(0) {
30  aux = make_int4(1,1,1,1);
31  }
32 
33  inline TuneParam(const TuneParam &param)
35 
36  inline TuneParam& operator=(const TuneParam &param) {
37  if (&param != this) {
38  block = param.block;
39  grid = param.grid;
40  shared_bytes = param.shared_bytes;
41  aux = param.aux;
42  comment = param.comment;
43  time = param.time;
44  n_calls = param.n_calls;
45  }
46  return *this;
47  }
48 
49  friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
50  output << "block = (" << param.block.x << ", " << param.block.y << ", " << param.block.z << ")" << std::endl;
51  output << "grid = (" << param.grid.x << ", " << param.grid.y << ", " << param.grid.z << ")" << std::endl;
52  output << "shared_bytes = " << param.shared_bytes << std::endl;
53  output << "aux = (" << param.aux.x << ", " << param.aux.y << ", " << param.aux.z << ", " << param.aux.w << ")" << std::endl;
54  output << param.comment << std::endl;
55  return output;
56  }
57  };
58 
59 
60  class Tunable {
61 
62  protected:
63  virtual long long flops() const = 0;
64  virtual long long bytes() const { return 0; } // FIXME
65 
66  // the minimum number of shared bytes per thread
67  virtual unsigned int sharedBytesPerThread() const = 0;
68 
69  // the minimum number of shared bytes per thread block
70  virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const = 0;
71 
72  // override this if a specific thread count is required (e.g., if not grid size tuning)
73  virtual unsigned int minThreads() const { return 1; }
74  virtual bool tuneGridDim() const { return true; }
75  virtual bool tuneAuxDim() const { return false; }
76  virtual bool tuneSharedBytes() const { return true; }
77 
78  virtual bool advanceGridDim(TuneParam &param) const
79  {
80  if (tuneGridDim()) {
81  const unsigned int max_blocks = maxGridSize();
82  const int step = 1;
83  param.grid.x += step;
84  if (param.grid.x > max_blocks) {
85  param.grid.x = minGridSize();
86  return false;
87  } else {
88  return true;
89  }
90  } else {
91  return false;
92  }
93  }
94 
95  virtual unsigned int maxBlockSize() const { return deviceProp.maxThreadsDim[0]; }
96  virtual unsigned int maxGridSize() const { return 2*deviceProp.multiProcessorCount; }
97  virtual unsigned int minGridSize() const { return 1; }
98 
99  virtual int blockStep() const { return deviceProp.warpSize; }
100  virtual int blockMin() const { return deviceProp.warpSize; }
101 
102  virtual bool advanceBlockDim(TuneParam &param) const
103  {
104  const unsigned int max_threads = maxBlockSize();
105  const unsigned int max_blocks = deviceProp.maxGridSize[0];
106  const unsigned int max_shared = deviceProp.sharedMemPerBlock;
107  const int step = blockStep();
108  bool ret;
109 
110  param.block.x += step;
111  int nthreads = param.block.x*param.block.y*param.block.z;
112  if (param.block.x > max_threads || sharedBytesPerThread()*nthreads > max_shared) {
113 
114  if (tuneGridDim()) {
115  param.block.x = step;
116  } else { // not tuning the grid dimension so have to set a valid grid size
117  // ensure the blockDim is large enough given the limit on gridDim
118  param.block.x = (minThreads()+max_blocks-1)/max_blocks;
119  param.block.x = ((param.block.x+step-1)/step)*step; // round up to nearest step size
120  if(param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
121  }
122 
123  ret = false;
124  } else {
125  ret = true;
126  }
127 
128  if (!tuneGridDim())
129  param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
130 
131  return ret;
132  }
133 
140  unsigned int maxBlocksPerSM() const {
141  switch (deviceProp.major) {
142  case 2:
143  return 8;
144  case 3:
145  return 16;
146  case 5:
147  case 6:
148  case 7:
149  return 32;
150  default:
151  errorQuda("Unknown SM architecture %d.%d\n", deviceProp.major, deviceProp.minor);
152  return 0;
153  }
154  }
155 
163  virtual bool advanceSharedBytes(TuneParam &param) const
164  {
165  if (tuneSharedBytes()) {
166  const int max_shared = deviceProp.sharedMemPerBlock;
167  const int max_blocks_per_sm = std::min(deviceProp.maxThreadsPerMultiProcessor / (param.block.x*param.block.y*param.block.z), maxBlocksPerSM());
168  int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
169  if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
170  param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);
171 
172  if (param.shared_bytes > max_shared) {
173  TuneParam next(param);
174  advanceBlockDim(next); // to get next blockDim
175  int nthreads = next.block.x * next.block.y * next.block.z;
176  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
178  return false;
179  } else {
180  return true;
181  }
182  } else {
183  return false;
184  }
185  }
186 
187  virtual bool advanceAux(TuneParam &param) const { return false; }
188 
190 
191  int writeAuxString(const char *format, ...) {
192  va_list arguments;
193  va_start(arguments, format);
194  int n = vsnprintf(aux, TuneKey::aux_n, format, arguments);
195  if (n < 0 || n >=TuneKey::aux_n) errorQuda("Error writing auxiliary string");
196  return n;
197  }
198 
199  public:
200  Tunable() { }
201  virtual ~Tunable() { }
202  virtual TuneKey tuneKey() const = 0;
203  virtual void apply(const cudaStream_t &stream) = 0;
204  virtual void preTune() { }
205  virtual void postTune() { }
206  virtual int tuningIter() const { return 1; }
207 
208  virtual std::string paramString(const TuneParam &param) const
209  {
210  std::stringstream ps;
211  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
212  if (tuneGridDim()) ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
213  ps << "shared=" << param.shared_bytes << ", ";
214 
215  // determine if we are tuning the auxiliary dimension
216  if (tuneAuxDim()) ps << "aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
217  return ps.str();
218  }
219 
220  virtual std::string perfString(float time) const
221  {
222  float gflops = flops() / (1e9 * time);
223  float gbytes = bytes() / (1e9 * time);
224  std::stringstream ss;
225  ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
226  ss << gbytes << " GB/s";
227  return ss.str();
228  }
229 
230  virtual void initTuneParam(TuneParam &param) const
231  {
232  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
233  const unsigned int max_blocks = deviceProp.maxGridSize[0];
234  const int min_grid_size = minGridSize();
235  const int min_block_size = blockMin();
236 
237  if (tuneGridDim()) {
238  param.block = dim3(min_block_size,1,1);
239 
240  param.grid = dim3(min_grid_size,1,1);
241  } else {
242  // find the minimum valid blockDim
243  param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
244  param.block.x = ((param.block.x+min_block_size-1) / min_block_size) * min_block_size; // round up to the nearest multiple of desired minimum block size
245  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
246 
247  param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
248  }
249  param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
251  }
252 
254  virtual void defaultTuneParam(TuneParam &param) const
255  {
257  if (tuneGridDim()) param.grid = dim3(128,1,1);
258  }
259 
260  virtual bool advanceTuneParam(TuneParam &param) const
261  {
263  }
264 
270 
271  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0])
272  errorQuda("Requested X-dimension block size %d greater than hardware limit %d",
273  param.block.x, deviceProp.maxThreadsDim[0]);
274 
275  if (param.block.y > (unsigned int)deviceProp.maxThreadsDim[1])
276  errorQuda("Requested Y-dimension block size %d greater than hardware limit %d",
277  param.block.y, deviceProp.maxThreadsDim[1]);
278 
279  if (param.block.z > (unsigned int)deviceProp.maxThreadsDim[2])
280  errorQuda("Requested Z-dimension block size %d greater than hardware limit %d",
281  param.block.z, deviceProp.maxThreadsDim[2]);
282 
283  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0]){
284  errorQuda("Requested X-dimension grid size %d greater than hardware limit %d",
285  param.grid.x, deviceProp.maxGridSize[0]);
286 
287  }
288  if (param.grid.y > (unsigned int)deviceProp.maxGridSize[1])
289  errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d",
290  param.grid.y, deviceProp.maxGridSize[1]);
291 
292  if (param.grid.z > (unsigned int)deviceProp.maxGridSize[2])
293  errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d",
294  param.grid.z, deviceProp.maxGridSize[2]);
295  }
296 
297  };
298 
299 
306  class TunableLocalParity : public Tunable {
307 
308  protected:
309  unsigned int sharedBytesPerThread() const { return 0; }
310  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
311 
312  // don't tune the grid dimension
313  bool tuneGridDim() const { return false; }
314 
319  unsigned int maxBlockSize() const { return deviceProp.maxThreadsPerBlock / 2; }
320 
321  public:
323  bool rtn = Tunable::advanceBlockDim(param);
324  param.block.y = 2;
325  return rtn;
326  }
327 
330  param.block.y = 2;
331  }
332 
335  param.block.y = 2;
336  }
337 
338  };
339 
346  class TunableVectorY : public Tunable {
347 
348  protected:
349  virtual unsigned int sharedBytesPerThread() const { return 0; }
350  virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
351 
352  unsigned int vector_length_y;
353 
354  public:
356 
358  {
359  dim3 block = param.block;
360  dim3 grid = param.grid;
362  param.block.y = block.y;
363  param.grid.y = grid.y;
364 
365  if (ret) { // we advanced the block.x so we're done
366  return true;
367  } else { // block.x (spacetime) was reset
368 
369  // we can advance spin/block-color since this is valid
370  if (param.block.y < vector_length_y && param.block.y < (unsigned int)deviceProp.maxThreadsDim[1]) {
371  param.block.y++;
372  param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
373  return true;
374  } else { // we have run off the end so let's reset
375  param.block.y = 1;
376  param.grid.y = vector_length_y;
377  return false;
378  }
379  }
380  }
381 
383  {
385  param.block.y = 1;
386  param.grid.y = vector_length_y;
387  }
388 
391  {
393  param.block.y = 1;
394  param.grid.y = vector_length_y;
395  }
396 
397  void resizeVector(int y) { vector_length_y = y; }
398  };
399 
401 
402  mutable unsigned vector_length_z;
403 
404  public:
405  TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
407 
409  {
410  dim3 block = param.block;
411  dim3 grid = param.grid;
413  param.block.z = block.z;
414  param.grid.z = grid.z;
415 
416  if (ret) { // we advanced the block.y / block.x so we're done
417  return true;
418  } else { // block.x/block.y (spacetime) was reset
419 
420  // we can advance spin/block-color since this is valid
421  if (param.block.z < vector_length_z && param.block.z < (unsigned int)deviceProp.maxThreadsDim[1]) {
422  param.block.z++;
423  param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
424  return true;
425  } else { // we have run off the end so let's reset
426  param.block.z = 1;
427  param.grid.z = vector_length_z;
428  return false;
429  }
430  }
431  }
432 
434  {
436  param.block.z = 1;
437  param.grid.z = vector_length_z;
438  }
439 
442  {
444  param.block.z = 1;
445  param.grid.z = vector_length_z;
446  }
447 
449  };
450 
455  bool activeTuning();
456 
457  void loadTuneCache();
458  void saveTuneCache();
459 
463  void saveProfile(const std::string label = "");
464 
468  void flushProfile();
469 
470  TuneParam& tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);
471 
472 } // namespace quda
473 
474 #endif // _TUNE_QUDA_H
virtual unsigned int maxBlockSize() const
Definition: tune_quda.h:95
void resizeVector(int y, int z)
Definition: tune_quda.h:448
QudaVerbosity verbosity
cudaDeviceProp deviceProp
virtual int tuningIter() const
Definition: tune_quda.h:206
virtual bool advanceSharedBytes(TuneParam &param) const
Definition: tune_quda.h:163
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
friend std::ostream & operator<<(std::ostream &output, const TuneParam &param)
Definition: tune_quda.h:49
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:382
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:208
TunableVectorY(unsigned int vector_length_y)
Definition: tune_quda.h:355
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:357
virtual TuneKey tuneKey() const =0
virtual bool advanceGridDim(TuneParam &param) const
Definition: tune_quda.h:78
virtual bool advanceAux(TuneParam &param) const
Definition: tune_quda.h:187
virtual unsigned int sharedBytesPerThread() const
Definition: tune_quda.h:349
TuneParam(const TuneParam &param)
Definition: tune_quda.h:33
bool tuneGridDim() const
Definition: tune_quda.h:313
QudaGaugeParam param
Definition: pack_test.cpp:17
time_t time(time_t *)
virtual unsigned int maxGridSize() const
Definition: tune_quda.h:96
virtual int blockMin() const
Definition: tune_quda.h:100
unsigned int vector_length_y
Definition: tune_quda.h:352
virtual unsigned int sharedBytesPerThread() const =0
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const =0
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:462
unsigned int sharedBytesPerThread() const
Definition: tune_quda.h:309
virtual long long bytes() const
Definition: tune_quda.h:64
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:390
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:472
TuneParam & operator=(const TuneParam &param)
Definition: tune_quda.h:36
TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
Definition: tune_quda.h:405
__darwin_va_list va_list
int int int int vsnprintf(char *__str, size_t __size, const char *__format, va_list) __attribute__((__format__(__printf__
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:433
unsigned vector_length_z
Definition: tune_quda.h:402
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:103
virtual bool tuneAuxDim() const
Definition: tune_quda.h:75
unsigned int maxBlocksPerSM() const
For some reason this can't be queried from the device properties, so we set it here. Based on Table 14 of the CUDA Programming Guide 9.0 (Technical Specifications per Compute Capability)
Definition: tune_quda.h:140
virtual bool tuneSharedBytes() const
Definition: tune_quda.h:76
virtual void preTune()
Definition: tune_quda.h:204
virtual unsigned int minGridSize() const
Definition: tune_quda.h:97
virtual void postTune()
Definition: tune_quda.h:205
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:322
virtual ~Tunable()
Definition: tune_quda.h:201
void loadTuneCache()
Definition: tune.cpp:302
static const int aux_n
Definition: tune_key.h:12
void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:333
enum QudaTune_s QudaTune
int writeAuxString(const char *format,...)
Definition: tune_quda.h:191
long long n_calls
Definition: tune_quda.h:27
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:328
unsigned int sharedBytesPerBlock(const TuneParam &param) const
Definition: tune_quda.h:310
virtual int blockStep() const
Definition: tune_quda.h:99
virtual unsigned int minThreads() const
Definition: tune_quda.h:73
void checkLaunchParam(TuneParam &param)
Definition: tune_quda.h:269
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
enum QudaVerbosity_s QudaVerbosity
unsigned int maxBlockSize() const
Definition: tune_quda.h:319
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:102
void resizeVector(int y)
Definition: tune_quda.h:397
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const
Definition: tune_quda.h:350
virtual bool tuneGridDim() const
Definition: tune_quda.h:74
void saveTuneCache()
Definition: tune.cpp:388
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
virtual std::string perfString(float time) const
Definition: tune_quda.h:220
void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:441
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:408
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:260
std::string comment
Definition: tune_quda.h:25