QUDA 1.0.0
tune_quda.h
#ifndef _TUNE_QUDA_H
#define _TUNE_QUDA_H

#include <string>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <cstring>
#include <cfloat>
#include <stdarg.h>
#include <map>

#include <tune_key.h>
#include <quda_internal.h>

namespace quda {

  class TuneParam {

  public:
    dim3 block;
    dim3 grid;
    unsigned int shared_bytes;
    int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters

    std::string comment;
    float time;
    long long n_calls;

    inline TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0), aux(), time(FLT_MAX), n_calls(0) {
      aux = make_int4(1,1,1,1);
    }

    inline TuneParam(const TuneParam &param)
      : block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), aux(param.aux), comment(param.comment), time(param.time), n_calls(param.n_calls) { }

    inline TuneParam& operator=(const TuneParam &param) {
      if (&param != this) {
        block = param.block;
        grid = param.grid;
        shared_bytes = param.shared_bytes;
        aux = param.aux;
        comment = param.comment;
        time = param.time;
        n_calls = param.n_calls;
      }
      return *this;
    }

    friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
      output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
      output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
      output << "shared_bytes=" << param.shared_bytes;
      output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
      return output;
    }
  };
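As a quick illustrative sketch (not part of this header): a default-constructed TuneParam describes a 32-thread, single-block launch, and the friend operator<< above prints the full launch geometry. The function name below is hypothetical, and a CUDA toolchain (for dim3/int4) plus the QUDA headers are assumed to be available.

  // Hypothetical usage sketch, not part of tune_quda.h.
  #include <iostream>

  void printDefaultParams() {
    quda::TuneParam tp;           // block=(32,1,1), grid=(1,1,1), shared_bytes=0
    std::cout << tp << std::endl; // prints block, grid, shared_bytes, and aux
  }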

  class Tunable {

  protected:
    virtual long long flops() const = 0;
    virtual long long bytes() const { return 0; } // FIXME

    // the minimum number of shared bytes per thread
    virtual unsigned int sharedBytesPerThread() const = 0;

    // the minimum number of shared bytes per thread block
    virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const = 0;

    // override this if a specific thread count is required (e.g., if not grid-size tuning)
    virtual unsigned int minThreads() const { return 1; }
    virtual bool tuneGridDim() const { return true; }
    virtual bool tuneAuxDim() const { return false; }
    virtual bool tuneSharedBytes() const { return true; }

    virtual bool advanceGridDim(TuneParam &param) const
    {
      if (tuneGridDim()) {
        const unsigned int max_blocks = maxGridSize();
        const int step = gridStep();
        param.grid.x += step;
        if (param.grid.x > max_blocks) {
          param.grid.x = minGridSize();
          return false;
        } else {
          return true;
        }
      } else {
        return false;
      }
    }

    virtual unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / (param.block.y*param.block.z); }
    virtual unsigned int maxGridSize() const { return 2*deviceProp.multiProcessorCount; }
    virtual unsigned int minGridSize() const { return 1; }

    /**
       @brief gridStep sets the step size when iterating the grid size
       in advanceGridDim.
     */
    virtual int gridStep() const { return 1; }

    virtual int blockStep() const { return deviceProp.warpSize; }
    virtual int blockMin() const { return deviceProp.warpSize; }

    virtual void resetBlockDim(TuneParam &param) const {
      const unsigned int max_threads = maxBlockSize(param);
      const unsigned int max_blocks = deviceProp.maxGridSize[0];
      const int step = blockStep();

      if (tuneGridDim()) {
        param.block.x = step;
      } else { // not tuning the grid dimension, so we must set a valid grid size
        // ensure the blockDim is large enough given the limit on gridDim
        param.block.x = (minThreads()+max_blocks-1)/max_blocks;
        param.block.x = ((param.block.x+step-1)/step)*step; // round up to nearest step size
        if (param.block.x > max_threads && param.block.y == 1 && param.block.z == 1)
          errorQuda("Local lattice volume is too large for device");
      }
    }

    virtual bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_threads = maxBlockSize(param);
      const unsigned int max_shared = maxSharedBytesPerBlock();
      bool ret;

      param.block.x += blockStep();
      int nthreads = param.block.x*param.block.y*param.block.z;
      if (param.block.x > max_threads || sharedBytesPerThread() * nthreads > max_shared
          || sharedBytesPerBlock(param) > max_shared) {
        resetBlockDim(param);
        ret = false;
      } else {
        ret = true;
      }

      if (!tuneGridDim())
        param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);

      return ret;
    }

    /**
       For some reason this can't be queried from the device properties,
       so here we set it by hand, based on the compute capability.
     */
    unsigned int maxBlocksPerSM() const
    {
      switch (deviceProp.major) {
      case 2:
        return 8;
      case 3:
        return 16;
      case 5:
      case 6: return 32;
      case 7:
        switch (deviceProp.minor) {
        case 0: return 32;
        case 2: return 32;
        case 5: return 16;
        }
      default:
        warningQuda("Unknown SM architecture %d.%d - assuming limit of 32 blocks per SM\n",
                    deviceProp.major, deviceProp.minor);
        return 32;
      }
    }

    /**
       @brief Enable the maximum dynamic shared bytes for the kernel
       "func" (values given by maxDynamicSharedBytesPerBlock()).
     */
    template <typename F> inline void setMaxDynamicSharedBytesPerBlock(F *func) const
    {
#if CUDA_VERSION >= 9000
      qudaFuncSetAttribute(
        (const void *)func, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared);
      qudaFuncSetAttribute(
        (const void *)func, cudaFuncAttributeMaxDynamicSharedMemorySize, maxDynamicSharedBytesPerBlock());
#endif
    }

    /**
       @brief This can't be correctly queried in CUDA for all
       architectures, so here we set it by hand, based on Table 14 of the
       CUDA Programming Guide 10.0 (Technical Specifications per Compute
       Capability).
     */
    unsigned int maxDynamicSharedBytesPerBlock() const
    {
      switch (deviceProp.major) {
      case 2:
      case 3:
      case 5:
      case 6: return 48 * 1024;
      case 7:
        switch (deviceProp.minor) {
        case 0: return 96 * 1024;
        case 2: return 96 * 1024;
        case 5: return 64 * 1024;
        }
      default:
        warningQuda("Unknown SM architecture %d.%d - assuming limit of 48 KiB per block\n",
                    deviceProp.major, deviceProp.minor);
        return 48 * 1024;
      }
    }

    /**
       @brief The maximum shared memory that a CUDA thread block can use
       in the autotuner. This isn't necessarily the same as the maximum
       the hardware supports, since using more than the default limit
       requires opting in via setMaxDynamicSharedBytesPerBlock().
     */
    virtual unsigned int maxSharedBytesPerBlock() const { return deviceProp.sharedMemPerBlock; }

    /**
       @brief Advance the shared-bytes parameter. The goal here is to
       throttle the number of thread blocks per SM by deliberately
       over-allocating shared memory.
     */
    virtual bool advanceSharedBytes(TuneParam &param) const
    {
      if (tuneSharedBytes()) {
        const int max_shared = maxSharedBytesPerBlock();
        const int max_blocks_per_sm = std::min(deviceProp.maxThreadsPerMultiProcessor / (param.block.x*param.block.y*param.block.z), maxBlocksPerSM());
        int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
        if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
        param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);

        if (param.shared_bytes > max_shared) {
          TuneParam next(param);
          advanceBlockDim(next); // to get the next blockDim
          int nthreads = next.block.x * next.block.y * next.block.z;
          param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(next) ?
            sharedBytesPerThread() * nthreads :
            sharedBytesPerBlock(next);
          return false;
        } else {
          return true;
        }
      } else {
        return false;
      }
    }

    virtual bool advanceAux(TuneParam &param) const { return false; }

    char aux[TuneKey::aux_n];

    int writeAuxString(const char *format, ...) {
      va_list arguments;
      va_start(arguments, format);
      int n = vsnprintf(aux, TuneKey::aux_n, format, arguments);
      va_end(arguments);
      if (n < 0 || n >= TuneKey::aux_n) errorQuda("Error writing auxiliary string");
      return n;
    }

    /** This is the return result from kernels launched using jitify */
    CUresult jitify_error;

  public:
    Tunable() : jitify_error(CUDA_SUCCESS) { aux[0] = '\0'; }
    virtual ~Tunable() { }
    virtual TuneKey tuneKey() const = 0;
    virtual void apply(const cudaStream_t &stream) = 0;
    virtual void preTune() { }
    virtual void postTune() { }
    virtual int tuningIter() const { return 1; }

    virtual std::string paramString(const TuneParam &param) const
    {
      std::stringstream ps;
      ps << param;
      return ps.str();
    }

    virtual std::string perfString(float time) const
    {
      float gflops = flops() / (1e9 * time);
      float gbytes = bytes() / (1e9 * time);
      std::stringstream ss;
      ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
      ss << gbytes << " GB/s";
      return ss.str();
    }

    virtual void initTuneParam(TuneParam &param) const
    {
      const unsigned int max_threads = deviceProp.maxThreadsDim[0];
      const unsigned int max_blocks = deviceProp.maxGridSize[0];
      const int min_grid_size = minGridSize();
      const int min_block_size = blockMin();

      if (tuneGridDim()) {
        param.block = dim3(min_block_size,1,1);

        param.grid = dim3(min_grid_size,1,1);
      } else {
        // find the minimum valid blockDim
        param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
        param.block.x = ((param.block.x+min_block_size-1) / min_block_size) * min_block_size; // round up to the nearest multiple of the minimum block size
        if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");

        param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
      }
      int nthreads = param.block.x*param.block.y*param.block.z;
      param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
        sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    }

    /** sets default values for when tuning is disabled */
    virtual void defaultTuneParam(TuneParam &param) const
    {
      initTuneParam(param);
      if (tuneGridDim()) param.grid = dim3(128,1,1);
    }

    virtual bool advanceTuneParam(TuneParam &param) const
    {
      return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param) || advanceAux(param);
    }

    /**
       @brief Validate the launch parameters against the limits of the
       current device.
     */
    void checkLaunchParam(TuneParam &param)
    {
      if (param.block.x*param.block.y*param.block.z > (unsigned)deviceProp.maxThreadsPerBlock)
        errorQuda("Requested block size %dx%dx%d=%d greater than hardware limit %d",
                  param.block.x, param.block.y, param.block.z, param.block.x*param.block.y*param.block.z, deviceProp.maxThreadsPerBlock);

      if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0])
        errorQuda("Requested X-dimension block size %d greater than hardware limit %d",
                  param.block.x, deviceProp.maxThreadsDim[0]);

      if (param.block.y > (unsigned int)deviceProp.maxThreadsDim[1])
        errorQuda("Requested Y-dimension block size %d greater than hardware limit %d",
                  param.block.y, deviceProp.maxThreadsDim[1]);

      if (param.block.z > (unsigned int)deviceProp.maxThreadsDim[2])
        errorQuda("Requested Z-dimension block size %d greater than hardware limit %d",
                  param.block.z, deviceProp.maxThreadsDim[2]);

      if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0])
        errorQuda("Requested X-dimension grid size %d greater than hardware limit %d",
                  param.grid.x, deviceProp.maxGridSize[0]);

      if (param.grid.y > (unsigned int)deviceProp.maxGridSize[1])
        errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d",
                  param.grid.y, deviceProp.maxGridSize[1]);

      if (param.grid.z > (unsigned int)deviceProp.maxGridSize[2])
        errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d",
                  param.grid.z, deviceProp.maxGridSize[2]);
    }

    CUresult jitifyError() const { return jitify_error; }
    CUresult& jitifyError() { return jitify_error; }
  };
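To make the interface concrete, here is a hedged sketch of how a kernel is typically exposed to the autotuner: a subclass implements the resource estimates (flops(), shared-memory policies), a unique tuneKey(), and apply(), which fetches the current TuneParam from tuneLaunch() (declared later in this file) and launches the kernel. The kernel myKernel, its argument struct MyArg, and the key strings are invented for illustration; getTuning() and getVerbosity() are assumed to come from QUDA's utility headers.

  // Hypothetical Tunable subclass; not part of QUDA.
  struct MyArg { int threads; /* ... kernel arguments ... */ };
  __global__ void myKernel(MyArg arg) { /* ... */ }

  class MyKernelTuned : public quda::Tunable {
    MyArg &arg;
    long long flops() const { return 0; }  // flop count used by perfString()
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const quda::TuneParam &) const { return 0; }
    unsigned int minThreads() const { return arg.threads; }
    bool tuneGridDim() const { return false; } // grid.x = ceil(minThreads / block.x)

  public:
    MyKernelTuned(MyArg &arg) : arg(arg) { writeAuxString("threads=%d", arg.threads); }

    quda::TuneKey tuneKey() const { return quda::TuneKey("my_volume", "myKernel", aux); }

    void apply(const cudaStream_t &stream) {
      quda::TuneParam tp = quda::tuneLaunch(*this, getTuning(), getVerbosity());
      myKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
    }
  };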

  /**
     @brief This derived class is for kernels that deploy parity across
     the y dimension of the thread block (block.y is fixed to 2), with no
     shared-memory tuning.
   */
  class TunableLocalParity : public Tunable {

  protected:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    // don't tune the grid dimension
    virtual bool tuneGridDim() const { return false; }

    /**
       @brief The maximum x-dimension block size is the total thread limit
       divided by the y dimension, which is fixed to 2.
     */
    unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / 2; }

  public:
    bool advanceBlockDim(TuneParam &param) const {
      bool rtn = Tunable::advanceBlockDim(param);
      param.block.y = 2;
      return rtn;
    }

    void initTuneParam(TuneParam &param) const {
      Tunable::initTuneParam(param);
      param.block.y = 2;
    }

    void defaultTuneParam(TuneParam &param) const {
      Tunable::defaultTuneParam(param);
      param.block.y = 2;
    }

  };

  /**
     @brief This derived class is for kernels that deploy a vector of
     computations across the y dimension of both the thread block and the
     grid (e.g., parity or spin in y, with the checkerboarded volume in x).
   */
  class TunableVectorY : public Tunable {

  protected:
    virtual unsigned int sharedBytesPerThread() const { return 0; }
    virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    mutable unsigned int vector_length_y;
    mutable unsigned int step_y;
    bool tune_block_x;

  public:
    TunableVectorY(unsigned int vector_length_y) : vector_length_y(vector_length_y),
      step_y(1), tune_block_x(true) { }

    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool ret = tune_block_x ? Tunable::advanceBlockDim(param) : false;
      param.block.y = block.y;
      param.grid.y = grid.y;

      if (ret) {
        return true;
      } else { // block.x (spacetime) was reset

        // we can advance spin/block-color since this is valid
        if (param.block.y < vector_length_y && param.block.y < (unsigned int)deviceProp.maxThreadsDim[1] &&
            param.block.x*(param.block.y+step_y)*param.block.z <= (unsigned int)deviceProp.maxThreadsPerBlock) {
          param.block.y += step_y;
          param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
          return true;
        } else { // we have run off the end so let's reset
          param.block.y = step_y;
          param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
          return false;
        }
      }
    }

    void initTuneParam(TuneParam &param) const
    {
      Tunable::initTuneParam(param);
      param.block.y = step_y;
      param.grid.y = (vector_length_y + step_y - 1) / step_y;
    }

    void defaultTuneParam(TuneParam &param) const
    {
      Tunable::defaultTuneParam(param);
      param.block.y = step_y;
      param.grid.y = (vector_length_y + step_y - 1) / step_y;
    }

    void resizeVector(int y) const { vector_length_y = y; }
    void resizeStep(int y) const { step_y = y; }
  };
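As a hedged illustration of the vector-y machinery (names below are invented): a kernel mapping the checkerboarded volume to x and, say, spin components to y passes the y extent to the constructor. grid.y is always sized as ceil(vector_length_y / block.y), so every y element is covered for any block.y the tuner tries, and resizeVector()/resizeStep() reshape the y dimension after construction.

  // Hypothetical TunableVectorY subclass; not part of QUDA.
  class MySpinKernel : public quda::TunableVectorY {
    long long flops() const { return 0; }

  public:
    MySpinKernel(unsigned int n_spin) : quda::TunableVectorY(n_spin) { strcpy(aux, "example"); }

    quda::TuneKey tuneKey() const { return quda::TuneKey("my_volume", "mySpinKernel", aux); }

    void apply(const cudaStream_t &stream) {
      quda::TuneParam tp = quda::tuneLaunch(*this, getTuning(), getVerbosity());
      // launch a kernel with tp.grid/tp.block; the thread y index covers n_spin
    }
  };

  // Later, if the y extent changes (e.g., fewer components):
  //   kernel.resizeVector(2); // grid.y becomes ceil(2 / step_y)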

  /**
     @brief This derived class additionally deploys a vector of
     computations across the z dimension of both the thread block and the
     grid.
   */
  class TunableVectorYZ : public TunableVectorY {

    mutable unsigned vector_length_z;
    mutable unsigned step_z;
    bool tune_block_y;

  public:
    TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
      : TunableVectorY(vector_length_y), vector_length_z(vector_length_z),
        step_z(1), tune_block_y(true) { }

    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool ret = tune_block_y ? TunableVectorY::advanceBlockDim(param) : tune_block_x ? Tunable::advanceBlockDim(param) : false;
      param.block.z = block.z;
      param.grid.z = grid.z;

      if (ret) {
        // we advanced block.x / block.y so we're done
        return true;
      } else { // block.x/block.y (spacetime) was reset

        // we can advance spin/block-color since this is valid
        if (param.block.z < vector_length_z && param.block.z < (unsigned int)deviceProp.maxThreadsDim[2] &&
            param.block.x*param.block.y*(param.block.z+step_z) <= (unsigned int)deviceProp.maxThreadsPerBlock) {
          param.block.z += step_z;
          param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
          return true;
        } else { // we have run off the end so let's reset
          param.block.z = step_z;
          param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
          return false;
        }
      }
    }

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorY::initTuneParam(param);
      param.block.z = step_z;
      param.grid.z = (vector_length_z + step_z - 1) / step_z;
    }

    void defaultTuneParam(TuneParam &param) const
    {
      TunableVectorY::defaultTuneParam(param);
      param.block.z = step_z;
      param.grid.z = (vector_length_z + step_z - 1) / step_z;
    }

    void resizeVector(int y, int z) const { vector_length_z = z; TunableVectorY::resizeVector(y); }
    void resizeStep(int y, int z) const { step_z = z; TunableVectorY::resizeStep(y); }
  };

  /**
     @brief Query if tuning is in progress.
   */
  bool activeTuning();

  void loadTuneCache();
  void saveTuneCache(bool error = false);

  /**
     @brief Save profile to disk.
   */
  void saveProfile(const std::string label = "");

  /**
     @brief Flush profile contents, setting all counts to zero.
   */
  void flushProfile();

  TuneParam& tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);

  /**
     @brief Post an event in the trace, recording where it was posted.
   */
  void postTrace_(const char *func, const char *file, int line);

  /**
     @brief Returns a reference to the tunecache map.
   */
  const std::map<TuneKey, TuneParam> &getTuneCache();

  /**
     @brief Enable the profile kernel counting.
   */
  void enableProfileCount();

  /**
     @brief Disable the profile kernel counting.
   */
  void disableProfileCount();

  /**
     @brief Enable / disable whether we are tuning a policy.
   */
  void setPolicyTuning(bool);

} // namespace quda

#define postTrace() quda::postTrace_(__func__, quda::file_name(__FILE__), __LINE__)

#endif // _TUNE_QUDA_H
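Finally, a hedged host-side sketch of the cache-inspection entry points declared above: getTuneCache() exposes the map populated by loadTuneCache() and by tuneLaunch() as kernels are tuned, and saveProfile() writes the accumulated launch profile to disk. The function name below is hypothetical; TuneKey's volume/name/aux fields are defined in tune_key.h.

  // Hypothetical sketch: dump every tuned kernel and its optimal launch params.
  #include <iostream>

  void dumpTunedKernels() {
    for (const auto &entry : quda::getTuneCache()) {
      const quda::TuneKey &key = entry.first;
      const quda::TuneParam &tp = entry.second;
      std::cout << key.name << " [" << key.volume << "] -> " << tp << std::endl;
    }
    quda::saveProfile(); // label defaults to ""
  }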