QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tune_quda.h
Go to the documentation of this file.
1 #ifndef _TUNE_QUDA_H
2 #define _TUNE_QUDA_H
3 
4 #include <quda_internal.h>
5 #include <dirac_quda.h>
6 
7 #include <string>
8 #include <iostream>
9 #include <iomanip>
10 #include <cstring>
11 #include <stdarg.h>
12 #include <tune_key.h>
13 
14 namespace quda {
15 
16  class TuneParam {
17 
18  public:
19  dim3 block;
20  dim3 grid;
23 
24  TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0) { }
26  : block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), comment(param.comment) { }
28  if (&param != this) {
29  block = param.block;
30  grid = param.grid;
31  shared_bytes = param.shared_bytes;
32  comment = param.comment;
33  }
34  return *this;
35  }
36 
37  };
38 
39 
40  class Tunable {
41 
42  protected:
43  virtual long long flops() const = 0;
44  virtual long long bytes() const { return 0; } // FIXME
45 
46  // the minimum number of shared bytes per thread
47  virtual unsigned int sharedBytesPerThread() const = 0;
48 
49  // the minimum number of shared bytes per thread block
50  virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const = 0;
51 
52  // override this if a specific thread count is required (e.g., if not grid size tuning)
53  virtual unsigned int minThreads() const { return 1; }
54  virtual bool tuneGridDim() const { return true; }
55  virtual bool tuneSharedBytes() const { return true; }
56 
57  virtual bool advanceGridDim(TuneParam &param) const
58  {
59  if (tuneGridDim()) {
60  const unsigned int max_blocks = 256; // FIXME: set a reasonable value for blas currently
61  const int step = 1;
62  param.grid.x += step;
63  if (param.grid.x > max_blocks) {
64  param.grid.x = step;
65  return false;
66  } else {
67  return true;
68  }
69  } else {
70  return false;
71  }
72  }
73 
74  virtual bool advanceBlockDim(TuneParam &param) const
75  {
76  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
77  const unsigned int max_blocks = deviceProp.maxGridSize[0];
78  const unsigned int max_shared = deviceProp.sharedMemPerBlock;
79  const int step = deviceProp.warpSize;
80  bool ret;
81 
82  param.block.x += step;
83  if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
84 
85  if (tuneGridDim()) {
86  param.block.x = step;
87  } else { // not tuning the grid dimension so have to set a valid grid size
88  // ensure the blockDim is large enough given the limit on gridDim
89  param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
90  param.block.x = ((param.block.x+step-1)/step)*step; // round up to nearest step size
91  if(param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
92  }
93 
94  ret = false;
95  } else {
96  ret = true;
97  }
98 
99  if (!tuneGridDim())
100  param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
101 
102  return ret;
103  }
104 
113  virtual bool advanceSharedBytes(TuneParam &param) const
114  {
115  if (tuneSharedBytes()) {
116  const int max_shared = deviceProp.sharedMemPerBlock;
117  const int max_blocks_per_sm = 8; // FIXME: derive from deviceProp
118  int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
119  if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
120  param.shared_bytes = max_shared / blocks_per_sm + 1;
121  if (param.shared_bytes > max_shared) {
122  TuneParam next(param);
123  advanceBlockDim(next); // to get next blockDim
124  int nthreads = next.block.x * next.block.y * next.block.z;
125  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
126  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
127  return false;
128  } else {
129  return true;
130  }
131  } else {
132  return false;
133  }
134  }
135 
137 
138  void writeAuxString(const char *format, ...) {
139  va_list arguments;
140  va_start(arguments, format);
141  int n = vsnprintf(aux, TuneKey::aux_n, format, arguments);
142  //int n = snprintf(aux, QUDA_TUNE_AUX_STR_LENGTH, "threads=%d,prec=%lu,stride=%d,geometery=%d",
143  // arg.volumeCB,sizeof(Complex)/2,arg.forceOffset);
144  if (n < 0 || n >= 512) errorQuda("Error writing auxiliary string");
145  }
146 
147  public:
148  Tunable() { }
149  virtual ~Tunable() { }
150  virtual TuneKey tuneKey() const = 0;
151  virtual void apply(const cudaStream_t &stream) = 0;
152  virtual void preTune() { }
153  virtual void postTune() { }
154  virtual int tuningIter() const { return 1; }
155 
156  virtual std::string paramString(const TuneParam &param) const
157  {
158  std::stringstream ps;
159  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
160  ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
161  ps << "shared=" << param.shared_bytes;
162  return ps.str();
163  }
164 
165  virtual std::string perfString(float time) const
166  {
167  float gflops = flops() / (1e9 * time);
168  float gbytes = bytes() / (1e9 * time);
169  std::stringstream ss;
170  ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
171  ss << gbytes << " GB/s";
172  return ss.str();
173  }
174 
175  virtual void initTuneParam(TuneParam &param) const
176  {
177  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
178  const unsigned int max_blocks = deviceProp.maxGridSize[0];
179  const int min_block_size = deviceProp.warpSize;
180 
181  if (tuneGridDim()) {
182  param.block = dim3(min_block_size,1,1);
183 
184  param.grid = dim3(1,1,1);
185  } else {
186  // find the minimum valid blockDim
187  const int warp = deviceProp.warpSize;
188  param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
189  param.block.x = ((param.block.x+warp-1) / warp) * warp; // round up to the nearest warp
190  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
191 
192  param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
193  }
194  param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
196  }
197 
199  virtual void defaultTuneParam(TuneParam &param) const
200  {
201  initTuneParam(param);
202  if (tuneGridDim()) param.grid = dim3(128,1,1);
203  }
204 
205  virtual bool advanceTuneParam(TuneParam &param) const
206  {
207  return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param);
208  }
209 
215 
216  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0])
217  errorQuda("Requested X-dimension block size %d greater than hardware limit %d",
218  param.block.x, deviceProp.maxThreadsDim[0]);
219 
220  if (param.block.y > (unsigned int)deviceProp.maxThreadsDim[1])
221  errorQuda("Requested Y-dimension block size %d greater than hardware limit %d",
222  param.block.y, deviceProp.maxThreadsDim[1]);
223 
224  if (param.block.z > (unsigned int)deviceProp.maxThreadsDim[2])
225  errorQuda("Requested Z-dimension block size %d greater than hardware limit %d",
226  param.block.z, deviceProp.maxThreadsDim[2]);
227 
228  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0]){
229  errorQuda("Requested X-dimension grid size %d greater than hardware limit %d",
230  param.grid.x, deviceProp.maxGridSize[0]);
231 
232  }
233  if (param.grid.y > (unsigned int)deviceProp.maxGridSize[1])
234  errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d",
235  param.grid.y, deviceProp.maxGridSize[1]);
236 
237  if (param.grid.z > (unsigned int)deviceProp.maxGridSize[2])
238  errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d",
239  param.grid.z, deviceProp.maxGridSize[2]);
240  }
241 
242  };
243 
244  void loadTuneCache(QudaVerbosity verbosity);
245  void saveTuneCache(QudaVerbosity verbosity);
246  TuneParam& tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);
247 
248 } // namespace quda
249 
250 #endif // _TUNE_QUDA_H
cudaDeviceProp deviceProp
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175
virtual TuneKey tuneKey() const =0
virtual bool tuneGridDim() const
Definition: tune_quda.h:54
virtual unsigned int minThreads() const
Definition: tune_quda.h:53
TuneParam(const TuneParam &param)
Definition: tune_quda.h:25
virtual bool advanceSharedBytes(TuneParam &param) const
Definition: tune_quda.h:113
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:156
QudaGaugeParam param
Definition: pack_test.cpp:17
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138
virtual unsigned int sharedBytesPerThread() const =0
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const =0
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:74
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
virtual std::string perfString(float time) const
Definition: tune_quda.h:165
TuneParam & operator=(const TuneParam &param)
Definition: tune_quda.h:27
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199
virtual int tuningIter() const
Definition: tune_quda.h:154
virtual void preTune()
Definition: tune_quda.h:152
virtual void postTune()
Definition: tune_quda.h:153
void loadTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:131
virtual ~Tunable()
Definition: tune_quda.h:149
static const int aux_n
Definition: tune_key.h:12
virtual long long bytes() const
Definition: tune_quda.h:44
enum QudaTune_s QudaTune
void checkLaunchParam(TuneParam &param)
Definition: tune_quda.h:214
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:205
enum QudaVerbosity_s QudaVerbosity
virtual bool tuneSharedBytes() const
Definition: tune_quda.h:55
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
void saveTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:205
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
std::string comment
Definition: tune_quda.h:22
virtual bool advanceGridDim(TuneParam &param) const
Definition: tune_quda.h:57