QUDA v1.1.0
A library for QCD on GPUs
tune_quda.h
#pragma once

#include <string>
#include <iostream>
#include <iomanip>
#include <cstring>
#include <cfloat>
#include <stdarg.h>
#include <map>
#include <algorithm>
#include <typeinfo>

#include <tune_key.h>
#include <quda_internal.h>
#include <device.h>

// this file has some workarounds to allow kernels that include it to be compiled with NVRTC
#ifdef __CUDACC_RTC__
#define CUresult bool
#define CUDA_SUCCESS true
#endif

namespace quda {

  class TuneParam {

  public:
    dim3 block;
    dim3 grid;
    unsigned int shared_bytes;
    bool set_max_shared_bytes; // whether to opt in to max shared bytes per thread block
    int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters

    std::string comment;
    float time;
    long long n_calls;

    inline TuneParam() :
      block(32, 1, 1),
      grid(1, 1, 1),
      shared_bytes(0),
      set_max_shared_bytes(false),
      aux(),
      time(FLT_MAX),
      n_calls(0)
    {
      aux = make_int4(1,1,1,1);
    }

    inline TuneParam(const TuneParam &param) :
      block(param.block),
      grid(param.grid),
      shared_bytes(param.shared_bytes),
      set_max_shared_bytes(param.set_max_shared_bytes),
      aux(param.aux),
      comment(param.comment),
      time(param.time),
      n_calls(param.n_calls)
    {
    }

    inline TuneParam& operator=(const TuneParam &param) {
      if (&param != this) {
        block = param.block;
        grid = param.grid;
        shared_bytes = param.shared_bytes;
        set_max_shared_bytes = param.set_max_shared_bytes;
        aux = param.aux;
        comment = param.comment;
        time = param.time;
        n_calls = param.n_calls;
      }
      return *this;
    }

#ifndef __CUDACC_RTC__
    friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
      output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
      output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
      output << "shared_bytes=" << param.shared_bytes;
      output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
      return output;
    }
#endif
  };
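
  // Example (illustrative): with the defaults above, a default-constructed
  // TuneParam streams as
  //   block=(32,1,1), grid=(1,1,1), shared_bytes=0, aux=(1,1,1,1)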

#ifndef __CUDACC_RTC__
  /**
     @brief Returns a reference to the tunecache map
     @return tunecache reference
  */
  const std::map<TuneKey, TuneParam> &getTuneCache();
#endif

  class Tunable {

  protected:
    virtual long long flops() const = 0;
    virtual long long bytes() const { return 0; } // FIXME

    // the minimum number of shared bytes per thread
    virtual unsigned int sharedBytesPerThread() const = 0;

    // the minimum number of shared bytes per thread block
    virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const = 0;

    // override this if a specific thread count is required (e.g., if not grid-size tuning)
    virtual unsigned int minThreads() const { return 1; }
    virtual bool tuneGridDim() const { return true; }
    virtual bool tuneAuxDim() const { return false; }
    virtual bool tuneSharedBytes() const { return true; }

    virtual bool advanceGridDim(TuneParam &param) const
    {
      if (tuneGridDim()) {
        const int step = gridStep();
        param.grid.x += step;
        if (param.grid.x > maxGridSize()) {
          param.grid.x = minGridSize();
          return false;
        } else {
          return true;
        }
      } else {
        return false;
      }
    }

    virtual unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / (param.block.y*param.block.z); }
    virtual unsigned int maxGridSize() const { return 2*deviceProp.multiProcessorCount; }
    virtual unsigned int minGridSize() const { return 1; }

    /**
       @brief gridStep sets the step size when iterating the grid size
       in advanceGridDim.
       @return step size
    */
    virtual int gridStep() const { return 1; }

    virtual int blockStep() const { return deviceProp.warpSize; }
    virtual int blockMin() const { return deviceProp.warpSize; }

    virtual void resetBlockDim(TuneParam &param) const {
      if (tuneGridDim()) {
        param.block.x = blockMin();
      } else { // not tuning the grid dimension, so have to set a valid grid size
        const auto step = blockStep();
        const auto max_threads = maxBlockSize(param);
        const auto max_blocks = deviceProp.maxGridSize[0];

        // ensure the blockDim is large enough given the limit on gridDim
        param.block.x = (minThreads() + max_blocks - 1) / max_blocks;
        param.block.x = ((param.block.x+step-1)/step)*step; // round up to nearest step size
        if (param.block.x > max_threads && param.block.y == 1 && param.block.z == 1)
          errorQuda("Local lattice volume is too large for device");
      }
    }

    virtual bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_threads = maxBlockSize(param);
      const unsigned int max_shared = maxSharedBytesPerBlock();
      bool ret;

      param.block.x += blockStep();
      int nthreads = param.block.x*param.block.y*param.block.z;
      if (param.block.x > max_threads || sharedBytesPerThread() * nthreads > max_shared
          || sharedBytesPerBlock(param) > max_shared) {
        resetBlockDim(param);
        ret = false;
      } else {
        ret = true;
      }

      if (!tuneGridDim()) param.grid.x = (minThreads() + param.block.x - 1) / param.block.x;

      return ret;
    }

    /**
       @brief Returns the maximum number of simultaneously resident
       blocks per SM.  We can directly query this of CUDA 11, but for
       earlier versions we use the hardware limits tabulated in the
       CUDA programming guide.
    */
    unsigned int maxBlocksPerSM() const
    {
#if CUDA_VERSION >= 11000
      static int max_blocks_per_sm = 0;
      if (!max_blocks_per_sm)
        cudaDeviceGetAttribute(&max_blocks_per_sm, cudaDevAttrMaxBlocksPerMultiprocessor, comm_gpuid());
      return max_blocks_per_sm;
#else
      // these values are taken from Table 14 of the CUDA 10.2 programming guide
      switch (deviceProp.major) {
      case 2: return 8;
      case 3: return 16;
      case 5:
      case 6: return 32;
      case 7:
        switch (deviceProp.minor) {
        case 0: return 32;
        case 2: return 32;
        case 5: return 16;
        }
      default:
        warningQuda("Unknown SM architecture %d.%d - assuming limit of 32 blocks per SM\n",
                    deviceProp.major, deviceProp.minor);
        return 32;
      }
#endif
    }

    /**
       @brief Returns the maximum dynamic shared memory per block.
       @return The maximum dynamic shared memory available to a CUDA thread block
    */
    unsigned int maxDynamicSharedBytesPerBlock() const { return device::max_dynamic_shared_memory(); }

    /**
       @brief The maximum shared memory that a CUDA thread block can use
       in the autotuner.  This isn't necessarily the same as the maximum
       shared memory supported by the device, since by default we do not
       opt in to the maximum dynamic shared memory.
    */
    virtual unsigned int maxSharedBytesPerBlock() const { return deviceProp.sharedMemPerBlock; }

    /**
       @brief The goal here is to throttle the number of thread blocks
       per SM by over-allocating shared memory (e.g., to improve cache
       utilization).  We request the smallest amount of dynamic shared
       memory that guarantees throttling to a given number of blocks
       per SM.
    */
    virtual bool advanceSharedBytes(TuneParam &param) const
    {
      if (tuneSharedBytes()) {
        const int max_shared = maxSharedBytesPerBlock();
        const int max_blocks_per_sm = std::min(deviceProp.maxThreadsPerMultiProcessor / (param.block.x*param.block.y*param.block.z), maxBlocksPerSM());
        int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
        if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
        param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);

        if (param.shared_bytes > max_shared) {
          TuneParam next(param);
          advanceBlockDim(next); // to get next blockDim
          int nthreads = next.block.x * next.block.y * next.block.z;
          param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(next) ?
            sharedBytesPerThread() * nthreads :
            sharedBytesPerBlock(next);
          return false;
        } else {
          return true;
        }
      } else {
        return false;
      }
    }
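
    // Worked example (illustrative): with max_shared = 48 KiB = 49152 bytes
    // and an effective limit of 8 blocks per SM, the first call requests
    // 49152/8 + 1 = 6145 bytes (throttling residency to 7 blocks); the next
    // call computes blocks_per_sm = 49152/6145 = 7 and requests
    // 49152/7 + 1 = 7022 bytes (6 blocks), and so on until max_shared is
    // exceeded and the block dimension advances instead.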

    virtual bool advanceAux(TuneParam &param) const { return false; }

    char aux[TuneKey::aux_n];

    int writeAuxString(const char *format, ...) {
      int n = 0;
#ifndef __CUDACC_RTC__
      va_list arguments;
      va_start(arguments, format);
      n = vsnprintf(aux, TuneKey::aux_n, format, arguments);
      va_end(arguments);
      if (n < 0 || n >= TuneKey::aux_n) errorQuda("Error writing auxiliary string");
#endif
      return n;
    }
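
    // Typical usage (names illustrative): writeAuxString("vol=%d,prec=%d",
    // volume, precision) encodes problem metadata in aux, so that distinct
    // problems map to distinct tunecache entries.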

    /** This is the return result from kernels launched using jitify */
    CUresult jitify_error;

    /**
       @brief Whether the present instance has already been tuned or not
       @return True if tuned, false if not
    */
    bool tuned()
    {
#ifndef __CUDACC_RTC__
      // not tuning is equivalent to already tuned
      if (!getTuning()) return true;

      TuneKey key = tuneKey();
      if (use_managed_memory()) strcat(key.aux, ",managed");
      // if key is present in cache then already tuned
      return getTuneCache().find(key) != getTuneCache().end();
#else
      return true;
#endif
    }

  public:
    Tunable() : jitify_error(CUDA_SUCCESS) { aux[0] = '\0'; }
    virtual ~Tunable() { }
    virtual TuneKey tuneKey() const = 0;
    virtual void apply(const qudaStream_t &stream) = 0;
    virtual void preTune() { }
    virtual void postTune() { }
    virtual int tuningIter() const { return 1; }

#ifndef __CUDACC_RTC__
    virtual std::string paramString(const TuneParam &param) const
    {
      std::stringstream ps;
      ps << param;
      return ps.str();
    }

    virtual std::string perfString(float time) const
    {
      float gflops = flops() / (1e9 * time);
      float gbytes = bytes() / (1e9 * time);
      std::stringstream ss;
      ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
      ss << gbytes << " GB/s";
      return ss.str();
    }
#endif

    virtual void initTuneParam(TuneParam &param) const
    {
      const unsigned int max_threads = deviceProp.maxThreadsDim[0];
      const unsigned int max_blocks = deviceProp.maxGridSize[0];
      const int min_grid_size = minGridSize();
      const int min_block_size = blockMin();

      if (tuneGridDim()) {
        param.block = dim3(min_block_size,1,1);

        param.grid = dim3(min_grid_size,1,1);
      } else {
        // find the minimum valid blockDim
        param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
        param.block.x = ((param.block.x+min_block_size-1) / min_block_size) * min_block_size; // round up to the nearest multiple of the desired minimum block size
        if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");

        param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
      }
      int nthreads = param.block.x*param.block.y*param.block.z;
      param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
        sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    }
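
    // Worked example (illustrative) of the non-grid-tuned branch: with
    // minThreads() = 1000, a warp size of 32 and a huge grid limit, block.x
    // starts at 1, is rounded up to 32, and grid.x = (1000 + 31)/32 = 32.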

    /** sets default values for when tuning is disabled */
    virtual void defaultTuneParam(TuneParam &param) const
    {
      initTuneParam(param);
      if (tuneGridDim()) param.grid.x = maxGridSize(); // don't set y and z in case a derived initTuneParam has set them
    }

    virtual bool advanceTuneParam(TuneParam &param) const
    {
      return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param) || advanceAux(param);
    }

    /**
       @brief Check the launch parameters of the kernel to ensure that
       they are valid for the current device.
    */
    void checkLaunchParam(TuneParam &param)
    {
      if (param.block.x*param.block.y*param.block.z > (unsigned)deviceProp.maxThreadsPerBlock)
        errorQuda("Requested block size %dx%dx%d=%d greater than hardware limit %d",
                  param.block.x, param.block.y, param.block.z, param.block.x*param.block.y*param.block.z, deviceProp.maxThreadsPerBlock);

      if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0])
        errorQuda("Requested X-dimension block size %d greater than hardware limit %d", param.block.x,
                  deviceProp.maxThreadsDim[0]);

      if (param.block.y > (unsigned int)deviceProp.maxThreadsDim[1])
        errorQuda("Requested Y-dimension block size %d greater than hardware limit %d", param.block.y,
                  deviceProp.maxThreadsDim[1]);

      if (param.block.z > (unsigned int)deviceProp.maxThreadsDim[2])
        errorQuda("Requested Z-dimension block size %d greater than hardware limit %d", param.block.z,
                  deviceProp.maxThreadsDim[2]);

      if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0])
        errorQuda("Requested X-dimension grid size %d greater than hardware limit %d", param.grid.x,
                  deviceProp.maxGridSize[0]);

      if (param.grid.y > (unsigned int)deviceProp.maxGridSize[1])
        errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d", param.grid.y,
                  deviceProp.maxGridSize[1]);

      if (param.grid.z > (unsigned int)deviceProp.maxGridSize[2])
        errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d", param.grid.z,
                  deviceProp.maxGridSize[2]);
    }
402 
403  CUresult jitifyError() const { return jitify_error; }
404  CUresult& jitifyError() { return jitify_error; }
405  };
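
  /*
     Usage sketch (illustrative only, not part of this header): a minimal
     Tunable subclass.  The kernel, volume string and launch arguments are
     hypothetical placeholders; getVerbosity() is assumed from util_quda.h,
     and the three-argument TuneKey constructor from tune_key.h.

     class MyAxpyTunable : public Tunable {
       int n; // number of lattice sites to process (hypothetical)
       unsigned int sharedBytesPerThread() const { return 0; }
       unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
       unsigned int minThreads() const { return n; }
       bool tuneGridDim() const { return false; } // derive grid from minThreads()
       long long flops() const { return 2ll * n; }

     public:
       MyAxpyTunable(int n) : n(n) { writeAuxString("n=%d", n); }
       TuneKey tuneKey() const { return TuneKey("my_volume", typeid(*this).name(), aux); }
       void apply(const qudaStream_t &stream) {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         // launch the kernel here with tp.block, tp.grid, tp.shared_bytes and stream
       }
     };
  */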

  /**
     @brief Derived class for kernels where the y thread dimension is
     mapped to the parity, and hence the y block dimension is fixed to 2
  */
  class TunableLocalParity : public Tunable {

  protected:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    /**
       Grid-size tuning is always enabled here, and marked final so
       that derived classes cannot disable it
    */
    bool tuneGridDim() const final { return true; }

    unsigned int minGridSize() const { return maxGridSize() / 8; }
    int gridStep() const { return minGridSize(); }

    /**
       Since the y block dimension is fixed to 2 (parity), the maximum
       x block size is half the maximum threads per block
    */
    unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / 2; }

  public:
    bool advanceBlockDim(TuneParam &param) const
    {
      bool rtn = Tunable::advanceBlockDim(param);
      param.block.y = 2;
      return rtn;
    }

    void initTuneParam(TuneParam &param) const
    {
      Tunable::initTuneParam(param);
      param.block.y = 2;
    }

    void defaultTuneParam(TuneParam &param) const
    {
      Tunable::defaultTuneParam(param);
      param.block.y = 2;
    }
  };

  /**
     @brief This derived tunable class is for kernels that deploy a
     vector of computations across the y dimension of both the thread
     block and grid.  For example, this could be parity in the y
     dimension and checkerboarded volume in x.
  */
  class TunableVectorY : public Tunable {

  protected:
    virtual unsigned int sharedBytesPerThread() const { return 0; }
    virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    mutable unsigned int vector_length_y;
    mutable unsigned int step_y;
    bool tune_block_x;

  public:
    TunableVectorY(unsigned int vector_length_y) : vector_length_y(vector_length_y),
      step_y(1), tune_block_x(true) { }

    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool ret = tune_block_x ? Tunable::advanceBlockDim(param) : false;
      param.block.y = block.y;
      param.grid.y = grid.y;

      if (ret) {
        return true;
      } else { // block.x (spacetime) was reset

        // we can advance spin/block-color since this is valid
        if (param.block.y < vector_length_y && param.block.y < (unsigned int)deviceProp.maxThreadsDim[1] &&
            param.block.x*(param.block.y+step_y)*param.block.z <= (unsigned int)deviceProp.maxThreadsPerBlock) {
          param.block.y += step_y;
          param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
          return true;
        } else { // we have run off the end so let's reset
          param.block.y = step_y;
          param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
          return false;
        }
      }
    }

    void initTuneParam(TuneParam &param) const
    {
      Tunable::initTuneParam(param);
      param.block.y = step_y;
      param.grid.y = (vector_length_y + step_y - 1) / step_y;
    }

    /** sets default values for when tuning is disabled */
    void defaultTuneParam(TuneParam &param) const
    {
      Tunable::defaultTuneParam(param);
      param.block.y = step_y;
      param.grid.y = (vector_length_y + step_y - 1) / step_y;
    }

    void resizeVector(int y) const { vector_length_y = y; }
    void resizeStep(int y) const { step_y = y; }
  };
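
  // Sketch (illustrative): a kernel spanning the checkerboarded volume in x
  // and parity in y could construct TunableVectorY with vector_length_y = 2,
  // so that block.y and grid.y tile the parity dimension while block.x is
  // tuned as usual.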

  class TunableVectorYZ : public TunableVectorY {

  protected:
    mutable unsigned vector_length_z;
    mutable unsigned step_z;
    bool tune_block_y;

  public:
    TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
      : TunableVectorY(vector_length_y), vector_length_z(vector_length_z),
      step_z(1), tune_block_y(true) { }

    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 block = param.block;
      dim3 grid = param.grid;
      bool ret = tune_block_y ? TunableVectorY::advanceBlockDim(param) :
        tune_block_x ? Tunable::advanceBlockDim(param) : false;
      param.block.z = block.z;
      param.grid.z = grid.z;

      if (ret) {
        // we advanced the block.x / block.y so we're done
        return true;
      } else { // block.x/block.y (spacetime) was reset

        // we can advance spin/block-color since this is valid
        if (param.block.z < vector_length_z && param.block.z < (unsigned int)deviceProp.maxThreadsDim[2] &&
            param.block.x*param.block.y*(param.block.z+step_z) <= (unsigned int)deviceProp.maxThreadsPerBlock) {
          param.block.z += step_z;
          param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
          return true;
        } else { // we have run off the end so let's reset
          param.block.z = step_z;
          param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
          return false;
        }
      }
    }

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorY::initTuneParam(param);
      param.block.z = step_z;
      param.grid.z = (vector_length_z + step_z - 1) / step_z;
    }

    /** sets default values for when tuning is disabled */
    void defaultTuneParam(TuneParam &param) const
    {
      TunableVectorY::defaultTuneParam(param);
      param.block.z = step_z;
      param.grid.z = (vector_length_z + step_z - 1) / step_z;
    }

    void resizeVector(int y, int z) const { vector_length_z = z; TunableVectorY::resizeVector(y); }
    void resizeStep(int y, int z) const { step_z = z; TunableVectorY::resizeStep(y); }
  };
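
  // Sketch (illustrative): a derived kernel mapping spin to y and color to z
  // might call resizeVector(n_spin, n_color) and resizeStep(1, 1), where
  // n_spin and n_color are hypothetical extents; the y and z block/grid
  // dimensions are then retiled accordingly.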

  /**
     @brief query if tuning is in progress
     @return tuning in progress?
  */
  bool activeTuning();

  void loadTuneCache();
  void saveTuneCache(bool error = false);

  /**
     @brief Save profile to disk.
  */
  void saveProfile(const std::string label = "");

  /**
     @brief Flush profile contents, setting all counts to zero.
  */
  void flushProfile();

  TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);
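
  // Typical call pattern from a Tunable::apply() implementation (sketch;
  // my_kernel and arg are placeholders):
  //
  //   TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
  //   my_kernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);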

  /**
     @brief Post an event in the trace, recording where it was posted
  */
  void postTrace_(const char *func, const char *file, int line);

  /**
     @brief Enable the profile kernel counting
  */
  void enableProfileCount();

  /**
     @brief Disable the profile kernel counting
  */
  void disableProfileCount();

  /**
     @brief Enable / disable whether we are tuning a policy
  */
  void setPolicyTuning(bool);

  /**
     @brief Query whether we are currently tuning a policy
  */
  bool policyTuning();

  /**
     @brief Enable / disable whether we are tuning an uber kernel
  */
  void setUberTuning(bool);

  /**
     @brief Query whether we are tuning an uber kernel
  */
  bool uberTuning();

} // namespace quda

// undo jit-safe modifications
#ifdef __CUDACC_RTC__
#undef CUresult
#undef CUDA_SUCCESS
#endif

#define postTrace() quda::postTrace_(__func__, quda::file_name(__FILE__), __LINE__)