QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tune.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <comm_quda.h>
3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <unistd.h>
12 #ifdef PTHREADS
13 #include <pthread.h>
14 #endif
15 
16 //#define LAUNCH_TIMER
17 
namespace quda { static TuneKey last_key; } // key of the most recently requested tuning (set in tuneLaunch, read by getLastTuneKey)
19 
20 // intentionally leave this outside of the namespace for now
21 quda::TuneKey getLastTuneKey() { return quda::last_key; }
22 
namespace quda {
  // Shorthand for the persistent cache mapping kernel identity -> best launch parameters.
  typedef std::map<TuneKey, TuneParam> map;

  static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
  static std::string resource_path;     // directory from QUDA_RESOURCE_PATH; empty disables disk caching
  static map tunecache;                 // in-memory tune cache
  static map::iterator it;              // NOTE(review): file-scope scratch iterator shared by tuneLaunch; presumably relies on serialized access — confirm under PTHREADS
  static size_t initial_cache_size = 0; // number of entries loaded from disk; lets saveTuneCache skip writing when nothing new was tuned


  // Stringify the numeric version macros into "major.minor.subminor".
#define STR_(x) #x
#define STR(x) STR_(x)
  static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_
38 
42  static void deserializeTuneCache(std::istream &in)
43  {
44  std::string line;
45  std::stringstream ls;
46 
47  TuneKey key;
48  TuneParam param;
49 
50  std::string v;
51  std::string n;
52  std::string a;
53 
54  int check;
55 
56  while (in.good()) {
57  getline(in, line);
58  if (!line.length()) continue; // skip blank lines (e.g., at end of file)
59  ls.clear();
60  ls.str(line);
61  ls >> v >> n >> a >> param.block.x >> param.block.y >> param.block.z;
62  check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
63  if (check < 0 || check >= key.volume_n) errorQuda("Error writing volume string");
64  check = snprintf(key.name, key.name_n, "%s", n.c_str());
65  if (check < 0 || check >= key.name_n) errorQuda("Error writing name string");
66  check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
67  if (check < 0 || check >= key.aux_n) errorQuda("Error writing aux string");
68  ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes;
69  ls.ignore(1); // throw away tab before comment
70  getline(ls, param.comment); // assume anything remaining on the line is a comment
71  param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this
72  tunecache[key] = param;
73  }
74  }
75 
76 
80  static void serializeTuneCache(std::ostream &out)
81  {
82  map::iterator entry;
83 
84  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
85  TuneKey key = entry->first;
86  TuneParam param = entry->second;
87 
88  out << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
89  out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
90  out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
91  out << param.shared_bytes << "\t" << param.comment; // param.comment ends with a newline
92  }
93  }
94 
95 
99  static void broadcastTuneCache()
100  {
101 #ifdef MULTI_GPU
102 
103  std::stringstream serialized;
104  size_t size;
105 
106  if (comm_rank() == 0) {
107  serializeTuneCache(serialized);
108  size = serialized.str().length();
109  }
110  comm_broadcast(&size, sizeof(size_t));
111 
112  if (size > 0) {
113  if (comm_rank() == 0) {
114  comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
115  } else {
116  char *serstr = new char[size+1];
117  comm_broadcast(serstr, size);
118  serstr[size] ='\0'; // null-terminate
119  serialized.str(serstr);
120  deserializeTuneCache(serialized);
121  delete[] serstr;
122  }
123  }
124 #endif
125  }
126 
127 
128  /*
129  * Read tunecache from disk.
130  */
131  void loadTuneCache(QudaVerbosity verbosity)
132  {
133  char *path;
134  struct stat pstat;
135  std::string cache_path, line, token;
136  std::ifstream cache_file;
137  std::stringstream ls;
138 
139  path = getenv("QUDA_RESOURCE_PATH");
140  if (!path) {
141  warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
142  warningQuda("Caching of tuned parameters will be disabled.");
143  return;
144  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
145  warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
146  warningQuda("Caching of tuned parameters will be disabled.");
147  return;
148  } else {
149  resource_path = path;
150  }
151 
152 #ifdef MULTI_GPU
153  if (comm_rank() == 0) {
154 #endif
155 
156  cache_path = resource_path;
157  cache_path += "/tunecache.tsv";
158  cache_file.open(cache_path.c_str());
159 
160  if (cache_file) {
161 
162  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
163  getline(cache_file, line);
164  ls.str(line);
165  ls >> token;
166  if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
167  ls >> token;
168  if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
169  ls >> token;
170  if (token.compare(quda_hash)) errorQuda("Cache file %s does not match current QUDA build. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
171 
172 
173  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
174  getline(cache_file, line); // eat the blank line
175 
176  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
177  getline(cache_file, line); // eat the description line
178 
179  deserializeTuneCache(cache_file);
180 
181  cache_file.close();
182  initial_cache_size = tunecache.size();
183 
184  if (verbosity >= QUDA_SUMMARIZE) {
185  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
186  }
187 
188 
189  } else {
190  warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
191  }
192 
193 #ifdef MULTI_GPU
194  }
195 #endif
196 
197 
198  broadcastTuneCache();
199  }
200 
201 
  /**
   * Write tunecache to disk (rank 0 only), guarded by a lock file so that
   * concurrent QUDA processes sharing a resource path don't clobber each other.
   * Does nothing if disk caching is disabled or no new kernels were tuned.
   */
  void saveTuneCache(QudaVerbosity verbosity)
  {
    time_t now;
    int lock_handle;
    std::string lock_path, cache_path;
    std::ofstream cache_file;

    if (resource_path.empty()) return; // loadTuneCache() found no usable QUDA_RESOURCE_PATH

    //FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
    // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
    // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    if (tunecache.size() == initial_cache_size) return; // nothing new since the last load/save

    // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
    // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
    // (The "lock" is implemented as an exclusive create of the lock file via O_CREAT|O_EXCL.)
    lock_path = resource_path + "/tunecache.lock";
    lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
    if (lock_handle == -1) {
      warningQuda("Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
                  "If you are certain that no other instances of QUDA are accessing this filesystem, "
                  "please manually remove %s", lock_path.c_str());
      return;
    }
    char msg[] = "If no instances of applications using QUDA are running,\n"
                 "this lock file shouldn't be here and is safe to delete.";
    // NOTE(review): local 'stat' shadows ::stat(2); harmless here but easy to misread.
    int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
    if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");

    cache_path = resource_path + "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (verbosity >= QUDA_SUMMARIZE) {
      printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
    }

    // Header: magic word, version, build hash, timestamp, then a column-description
    // line. ctime() supplies a newline of its own, so the first std::endl produces
    // the blank line that loadTuneCache() expects to eat.
    time(&now);
    cache_file << "tunecache\t" << quda_version << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
    cache_file << "volume\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\tcomment" << std::endl;
    serializeTuneCache(cache_file);
    cache_file.close();

    // Release lock.
    close(lock_handle);
    remove(lock_path.c_str());

    initial_cache_size = tunecache.size();

#ifdef MULTI_GPU
  }
#endif
  }
262 
  // Profiler for instrumenting tuneLaunch() overhead; only reported when LAUNCH_TIMER is defined.
  static TimeProfile launchTimer("tuneLaunch");

// static int tally = 0;
266 
  /**
   * Return the optimal launch parameters for the given kernel: from tunecache
   * when a cached entry exists, otherwise by autotuning on the spot (sweeping
   * candidate parameters via advanceTuneParam and timing each with CUDA events).
   * While a tuning sweep is in progress, recursive calls made from
   * tunable.apply() simply return the currently active trial parameters.
   */
  TuneParam& tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
  {
#ifdef PTHREADS // tuning should be performed serially
// pthread_mutex_lock(&pthread_mutex);
// tally++;
#endif

#ifdef LAUNCH_TIMER
    launchTimer.Start(QUDA_PROFILE_TOTAL);
    launchTimer.Start(QUDA_PROFILE_INIT);
#endif

    const TuneKey key = tunable.tuneKey();
    last_key = key; // record for getLastTuneKey()
    static TuneParam param; // persists across calls; holds the active trial parameters during a sweep

#ifdef LAUNCH_TIMER
    launchTimer.Stop(QUDA_PROFILE_INIT);
    launchTimer.Start(QUDA_PROFILE_PREAMBLE);
#endif

    // first check if we have the tuned value and return if we have it
    //if (enabled == QUDA_TUNE_YES && tunecache.count(key)) {

    it = tunecache.find(key);
    if (enabled == QUDA_TUNE_YES && it != tunecache.end()) {

#ifdef LAUNCH_TIMER
      launchTimer.Stop(QUDA_PROFILE_PREAMBLE);
      launchTimer.Start(QUDA_PROFILE_COMPUTE);
#endif

      //param = tunecache[key];
      // NOTE(review): this local shadows the static param and is never read;
      // it appears to exist only so the map-copy cost is what
      // QUDA_PROFILE_COMPUTE times under LAUNCH_TIMER — confirm before removing.
      TuneParam param = it->second;

#ifdef LAUNCH_TIMER
      launchTimer.Stop(QUDA_PROFILE_COMPUTE);
      launchTimer.Start(QUDA_PROFILE_EPILOGUE);
#endif

      tunable.checkLaunchParam(it->second);

#ifdef LAUNCH_TIMER
      launchTimer.Stop(QUDA_PROFILE_EPILOGUE);
      launchTimer.Stop(QUDA_PROFILE_TOTAL);
#endif

#ifdef PTHREADS
      //pthread_mutex_unlock(&pthread_mutex);
      //tally--;
      //printfQuda("pthread_mutex_unlock a complete %d\n",tally);
#endif
      return it->second; // cache hit: hand back the stored entry directly
    }

#ifdef LAUNCH_TIMER
    launchTimer.Stop(QUDA_PROFILE_PREAMBLE);
    launchTimer.Stop(QUDA_PROFILE_TOTAL);
#endif


    // We must switch off the global sum when tuning in case of process divergence
    bool reduceState = globalReduce;
    globalReduce = false;

    static bool tuning = false; // tuning in progress?
    static const Tunable *active_tunable; // for error checking

    if (enabled == QUDA_TUNE_NO) {
      // Tuning disabled: fall back to the kernel's default launch parameters.
      tunable.defaultTuneParam(param);
      tunable.checkLaunchParam(param);
    } else if (!tuning) {
      // Cache miss and no sweep active: run the autotuning sweep for this kernel.

      TuneParam best_param;
      cudaError_t error;
      cudaEvent_t start, end;
      float elapsed_time, best_time;
      time_t now;

      tuning = true;
      active_tunable = &tunable; // recursive calls from any other tunable are an error
      best_time = FLT_MAX;

      if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PreTune %s\n", key.name);
      tunable.preTune();

      cudaEventCreate(&start);
      cudaEventCreate(&end);

      if (verbosity >= QUDA_DEBUG_VERBOSE) {
        printfQuda("Tuning %s with %s at vol=%s\n", key.name, key.aux, key.volume);
      }

      tunable.initTuneParam(param);
      while (tuning) {
        cudaDeviceSynchronize();
        cudaGetLastError(); // clear error counter
        tunable.checkLaunchParam(param);
        cudaEventRecord(start, 0);
        for (int i=0; i<tunable.tuningIter(); i++) {
          if (verbosity >= QUDA_DEBUG_VERBOSE) {
            printfQuda("About to call tunable.apply\n");
          }
          tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
        }
        cudaEventRecord(end, 0);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&elapsed_time, start, end);
        cudaDeviceSynchronize();
        error = cudaGetLastError();

        { // check that error state is cleared
          cudaDeviceSynchronize();
          cudaError_t error = cudaGetLastError(); // intentionally shadows the outer error
          if (error != cudaSuccess) errorQuda("Failed to clear error state %s\n", cudaGetErrorString(error));
        }

        // Average over iterations; cudaEventElapsedTime reports milliseconds.
        elapsed_time /= (1e3 * tunable.tuningIter());
        if ((elapsed_time < best_time) && (error == cudaSuccess)) {
          best_time = elapsed_time;
          best_param = param;
        }
        if ((verbosity >= QUDA_DEBUG_VERBOSE)) {
          if (error == cudaSuccess)
            printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(),
                       tunable.perfString(elapsed_time).c_str());
          else
            printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
        }
        tuning = tunable.advanceTuneParam(param); // false once the candidate space is exhausted
      }

      if (best_time == FLT_MAX) {
        // Every candidate either failed to launch or produced a CUDA error.
        errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume);
      }
      if (verbosity >= QUDA_VERBOSE) {
        printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
                   tunable.perfString(best_time).c_str(), key.name, key.aux);
      }
      time(&now);
      best_param.comment = "# " + tunable.perfString(best_time) + ", tuned ";
      best_param.comment += ctime(&now); // includes a newline

      cudaEventDestroy(start);
      cudaEventDestroy(end);

      if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PostTune %s\n", key.name);
      tunable.postTune();
      param = best_param;
      tunecache[key] = best_param; // persisted to disk by the next saveTuneCache()

    } else if (&tunable != active_tunable) {
      errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
    }
    // (else: recursive call from active_tunable's own apply() — return the current trial param)

    // restore the original reduction state
    globalReduce = reduceState;

#ifdef PTHREADS
// pthread_mutex_unlock(&pthread_mutex);
// tally--;
// printfQuda("pthread_mutex_unlock b complete %d\n",tally);
#endif
    return param;
  }
436 
438 #ifdef LAUNCH_TIMER
439  launchTimer.Print();
440 #endif
441  }
442 } // namespace quda
int comm_rank(void)
Definition: comm_mpi.cpp:80
void Print()
Definition: timer.cpp:6
#define errorQuda(...)
Definition: util_quda.h:73
::std::string string
Definition: gtest.h:1979
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175
quda::TuneKey getLastTuneKey()
Definition: tune.cpp:21
virtual TuneKey tuneKey() const =0
std::map< TuneKey, TuneParam > map
Definition: tune.cpp:24
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:156
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
char aux[aux_n]
Definition: tune_key.h:15
#define warningQuda(...)
Definition: util_quda.h:84
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
virtual std::string perfString(float time) const
Definition: tune_quda.h:165
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199
virtual int tuningIter() const
Definition: tune_quda.h:154
virtual void preTune()
Definition: tune_quda.h:152
virtual void postTune()
Definition: tune_quda.h:153
void comm_broadcast(void *data, size_t nbytes)
Definition: comm_mpi.cpp:234
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
void loadTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:131
Main header file for the QUDA library.
#define STR(x)
Definition: tune.cpp:34
#define printfQuda(...)
Definition: util_quda.h:67
enum QudaTune_s QudaTune
char volume[volume_n]
Definition: tune_key.h:13
char name[name_n]
Definition: tune_key.h:14
void Start(QudaProfileType idx)
void checkLaunchParam(TuneParam &param)
Definition: tune_quda.h:214
void printLaunchTimer()
Definition: tune.cpp:437
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:205
enum QudaVerbosity_s QudaVerbosity
bool globalReduce
Definition: face_buffer.cpp:11
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
void saveTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:205
void end()
virtual void apply(const cudaStream_t &stream)=0
std::string comment
Definition: tune_quda.h:22