QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tune_quda.h
Go to the documentation of this file.
1 #ifndef _TUNE_QUDA_H
2 #define _TUNE_QUDA_H
3 
#include <quda_internal.h>
#include <dirac_quda.h>

#include <string>
#include <iostream>
#include <iomanip>
#include <sstream>
10 
11 namespace quda {
12 
/**
 * Key made of three strings (volume, name, aux).
 *
 * Keys are ordered lexicographically: first by volume, then by name, then by
 * aux, so a TuneKey can be used as the key of an ordered container.
 */
class TuneKey {

public:
  std::string volume;
  std::string name;
  std::string aux;

  TuneKey() { }

  /**
   * @param v value stored in volume
   * @param n value stored in name
   * @param a value stored in aux; defaults to "type=default"
   */
  TuneKey(const std::string &v, const std::string &n,
          const std::string &a = std::string("type=default"))
    : volume(v), name(n), aux(a) { }

  TuneKey(const TuneKey &key)
    : volume(key.volume), name(key.name), aux(key.aux) { }

  TuneKey& operator=(const TuneKey &key) {
    if (&key != this) {
      volume = key.volume;
      name = key.name;
      aux = key.aux;
    }
    return *this;
  }

  /**
   * Strict weak ordering on (volume, name, aux).
   * Each field is compared once; the first field that differs decides.
   */
  bool operator<(const TuneKey &other) const {
    int order = volume.compare(other.volume);
    if (order != 0) return order < 0;

    order = name.compare(other.name);
    if (order != 0) return order < 0;

    return aux < other.aux;
  }

};
42 
43 
44  class TuneParam {
45 
46  public:
47  dim3 block;
48  dim3 grid;
50  std::string comment;
51 
52  TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0) { }
54  : block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), comment(param.comment) { }
56  if (&param != this) {
57  block = param.block;
58  grid = param.grid;
59  shared_bytes = param.shared_bytes;
60  comment = param.comment;
61  }
62  return *this;
63  }
64 
65  };
66 
67 
68  class Tunable {
69 
70  protected:
71  virtual long long flops() const = 0;
72  virtual long long bytes() const { return 0; } // FIXME
73 
74  // the minimum number of shared bytes per thread
75  virtual int sharedBytesPerThread() const = 0;
76 
77  // the minimum number of shared bytes per thread block
78  virtual int sharedBytesPerBlock(const TuneParam &param) const = 0;
79 
80  virtual bool advanceGridDim(TuneParam &param) const
81  {
82  const unsigned int max_blocks = 256; // FIXME: set a reasonable value for blas currently
83  const int step = 1;
84  param.grid.x += step;
85  if (param.grid.x > max_blocks) {
86  param.grid.x = step;
87  return false;
88  } else {
89  return true;
90  }
91  }
92 
93  virtual bool advanceBlockDim(TuneParam &param) const
94  {
95  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
96  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
97  const int step = deviceProp.warpSize;
98  param.block.x += step;
99  if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
100  param.block.x = step;
101  return false;
102  } else {
103  return true;
104  }
105  }
106 
115  virtual bool advanceSharedBytes(TuneParam &param) const
116  {
117  const int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
118  const int max_blocks_per_sm = 8; // FIXME: derive from deviceProp
119  int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
120  if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
121  param.shared_bytes = max_shared / blocks_per_sm + 1;
122  if (param.shared_bytes > max_shared) {
123  TuneParam next(param);
124  advanceBlockDim(next); // to get next blockDim
125  int nthreads = next.block.x * next.block.y * next.block.z;
126  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
127  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
128  return false;
129  } else {
130  return true;
131  }
132  }
133 
134  public:
135  Tunable() { }
136  virtual ~Tunable() { }
137  virtual TuneKey tuneKey() const = 0;
138  virtual void apply(const cudaStream_t &stream) = 0;
139  virtual void preTune() { }
140  virtual void postTune() { }
141  virtual int tuningIter() const { return 1; }
142 
143  virtual std::string paramString(const TuneParam &param) const
144  {
145  std::stringstream ps;
146  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
147  ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
148  ps << "shared=" << param.shared_bytes;
149  return ps.str();
150  }
151 
152  virtual std::string perfString(float time) const
153  {
154  float gflops = flops() / (1e9 * time);
155  float gbytes = bytes() / (1e9 * time);
156  std::stringstream ss;
157  ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
158  ss << gbytes << " GB/s";
159  return ss.str();
160  }
161 
162  virtual void initTuneParam(TuneParam &param) const
163  {
164  const int min_block_size = deviceProp.warpSize;
165  param.block = dim3(min_block_size,1,1);
166  param.grid = dim3(1,1,1);
167  param.shared_bytes = sharedBytesPerThread()*min_block_size > sharedBytesPerBlock(param) ?
168  sharedBytesPerThread()*min_block_size : sharedBytesPerBlock(param);
169  }
170 
172  virtual void defaultTuneParam(TuneParam &param) const
173  {
174  initTuneParam(param);
175  param.grid = dim3(128,1,1);
176  }
177 
178  virtual bool advanceTuneParam(TuneParam &param) const
179  {
180  return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param);
181  }
182 
188 
189  if (param.block.x > (unsigned int)deviceProp.maxThreadsDim[0])
190  errorQuda("Requested X-dimension block size %d greater than hardware limit %d",
191  param.block.x, deviceProp.maxThreadsDim[0]);
192 
193  if (param.block.y > (unsigned int)deviceProp.maxThreadsDim[1])
194  errorQuda("Requested Y-dimension block size %d greater than hardware limit %d",
195  param.block.y, deviceProp.maxThreadsDim[1]);
196 
197  if (param.block.z > (unsigned int)deviceProp.maxThreadsDim[2])
198  errorQuda("Requested Z-dimension block size %d greater than hardware limit %d",
199  param.block.z, deviceProp.maxThreadsDim[2]);
200 
201  if (param.grid.x > (unsigned int)deviceProp.maxGridSize[0]){
202  errorQuda("Requested X-dimension grid size %d greater than hardware limit %d",
203  param.grid.x, deviceProp.maxGridSize[0]);
204 
205  }
206  if (param.grid.y > (unsigned int)deviceProp.maxGridSize[1])
207  errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d",
208  param.grid.y, deviceProp.maxGridSize[1]);
209 
210  if (param.grid.z > (unsigned int)deviceProp.maxGridSize[2])
211  errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d",
212  param.grid.z, deviceProp.maxGridSize[2]);
213  }
214 
215  };
216 
// Free functions of the tuning interface. Only the declarations are visible
// here, so the notes below are inferred from names and signatures - confirm
// against the definitions.
// loadTuneCache: presumably reads previously stored tuning results.
217  void loadTuneCache(QudaVerbosity verbosity);
// saveTuneCache: presumably stores the current tuning results.
218  void saveTuneCache(QudaVerbosity verbosity);
// tuneLaunch: returns the TuneParam for the given Tunable; `enabled`
// presumably selects whether a tuning search is performed - TODO confirm.
219  TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);
220 
221 } // namespace quda
222 
223 #endif // _TUNE_QUDA_H