QUDA  v1.1.0
A library for QCD on GPUs
tune.cpp
1 #include <tune_quda.h>
2 #include <comm_quda.h>
3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <list>
12 #include <unistd.h>
13 #include <uint_to_char.h>
14 
15 #include <deque>
16 #include <queue>
17 #include <functional>
18 
19 #include <communicator_quda.h>
20 
21 //#define LAUNCH_TIMER
22 extern char *gitversion;
23 
24 namespace quda
25 {
26  static TuneKey last_key;
27 }
28 
29 // intentionally leave this outside of the namespace for now
30 quda::TuneKey getLastTuneKey() { return quda::last_key; }
31 
32 namespace quda
33 {
34  typedef std::map<TuneKey, TuneParam> map;
35 
36  struct TraceKey {
37 
38  TuneKey key;
39  float time;
40 
41  long device_bytes;
42  long pinned_bytes;
43  long mapped_bytes;
44  long host_bytes;
45 
46  TraceKey() {}
47 
48  TraceKey(const TuneKey &key, float time) :
49  key(key),
50  time(time),
51  device_bytes(device_allocated_peak()),
52  pinned_bytes(pinned_allocated_peak()),
53  mapped_bytes(mapped_allocated_peak()),
54  host_bytes(host_allocated_peak())
55  {
56  }
57 
58  TraceKey(const TraceKey &trace) :
59  key(trace.key),
60  time(trace.time),
61  device_bytes(trace.device_bytes),
62  pinned_bytes(trace.pinned_bytes),
63  mapped_bytes(trace.mapped_bytes),
64  host_bytes(trace.host_bytes)
65  {
66  }
67 
68  TraceKey &operator=(const TraceKey &trace)
69  {
70  if (&trace != this) {
71  key = trace.key;
72  time = trace.time;
73  device_bytes = trace.device_bytes;
74  pinned_bytes = trace.pinned_bytes;
75  mapped_bytes = trace.mapped_bytes;
76  host_bytes = trace.host_bytes;
77  }
78  return *this;
79  }
80  };
81 
82  // linked list that is augmented each time we call a kernel
83  static std::list<TraceKey> trace_list;
84  static int enable_trace = 0;
85 
86  int traceEnabled()
87  {
88  static bool init = false;
89 
90  if (!init) {
91  char *enable_trace_env = getenv("QUDA_ENABLE_TRACE");
92  if (enable_trace_env) {
93  if (strcmp(enable_trace_env, "1") == 0) {
94  // only explicitly posted trace events are included
95  enable_trace = 1;
96  } else if (strcmp(enable_trace_env, "2") == 0) {
97  // enable full kernel trace and posted trace events
98  enable_trace = 2;
99  }
100  }
101  init = true;
102  }
103  return enable_trace;
104  }
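// [Editor's illustration -- not part of tune.cpp] A minimal sketch of how an application
// might select the trace level consumed by traceEnabled() above: "1" records only
// explicitly posted events (see postTrace_ below), while "2" additionally records every
// tuned kernel launch. setenv() is plain POSIX; the main() shown is a hypothetical host program.

#include <cstdlib>

int main()
{
  setenv("QUDA_ENABLE_TRACE", "2", 1); // must be set before the library first queries the variable
  // ... initialize QUDA and run as usual; the trace is written out later by saveProfile() ...
  return 0;
}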
105 
106  void postTrace_(const char *func, const char *file, int line)
107  {
108  if (traceEnabled() >= 1) {
109  char aux[TuneKey::aux_n];
110  strcpy(aux, file);
111  strcat(aux, ":");
112  char tmp[TuneKey::aux_n];
113  i32toa(tmp, line);
114  strcat(aux, tmp);
115  TuneKey key("", func, aux);
116  TraceKey trace_entry(key, 0.0);
117  trace_list.push_back(trace_entry);
118  }
119  }
120 
121  static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
122  static std::string resource_path;
123  static map tunecache;
124  static map::iterator it;
125  static size_t initial_cache_size = 0;
126 
127 #define STR_(x) #x
128 #define STR(x) STR_(x)
129  static const std::string quda_version
130     = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
131 #undef STR
132 #undef STR_
133 
135  static bool tuning = false;
136 
137  bool activeTuning() { return tuning; }
138 
139  static bool profile_count = true;
140 
141  void disableProfileCount() { profile_count = false; }
142  void enableProfileCount() { profile_count = true; }
143 
144  const map &getTuneCache() { return tunecache; }
145 
149  static void deserializeTuneCache(std::istream &in)
150  {
151  std::string line;
152  std::stringstream ls;
153 
154  TuneKey key;
155  TuneParam param;
156 
157  std::string v;
158  std::string n;
159  std::string a;
160 
161  int check;
162 
163  while (in.good()) {
164  getline(in, line);
165  if (!line.length()) continue; // skip blank lines (e.g., at end of file)
166  ls.clear();
167  ls.str(line);
168  ls >> v >> n >> a >> param.block.x >> param.block.y >> param.block.z;
169  check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
170  if (check < 0 || check >= key.volume_n) errorQuda("Error writing volume string (check = %d)", check);
171  check = snprintf(key.name, key.name_n, "%s", n.c_str());
172  if (check < 0 || check >= key.name_n) errorQuda("Error writing name string (check=%d)", check);
173  check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
174  if (check < 0 || check >= key.aux_n) errorQuda("Error writing aux string (check=%d)", check);
175  ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes >> param.aux.x >> param.aux.y
176  >> param.aux.z >> param.aux.w >> param.time;
177  ls.ignore(1); // throw away tab before comment
178  getline(ls, param.comment); // assume anything remaining on the line is a comment
179  param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this
180  tunecache[key] = param;
181  }
182  }
183 
187  static void serializeTuneCache(std::ostream &out)
188  {
189  map::iterator entry;
190 
191  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
192  TuneKey key = entry->first;
193  TuneParam param = entry->second;
194 
195  out << std::setw(16) << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
196  out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
197  out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
198  out << param.shared_bytes << "\t" << param.aux.x << "\t" << param.aux.y << "\t" << param.aux.z << "\t"
199  << param.aux.w << "\t";
200  out << param.time << "\t" << param.comment; // param.comment ends with a newline
201  }
202  }
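// [Editor's note -- not part of tune.cpp] Each record written by serializeTuneCache()
// (and read back by deserializeTuneCache() above) is a single tab-separated line with
// the fields, in order:
//
//   volume  name  aux  block.x  block.y  block.z  grid.x  grid.y  grid.z  shared_bytes
//   aux.x  aux.y  aux.z  aux.w  time  comment
//
// where the comment field runs to the end of the line and keeps its trailing newline.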
203 
204  template <class T> struct less_significant : std::binary_function<T, T, bool> {
205  inline bool operator()(const T &lhs, const T &rhs)
206  {
207  return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
208  }
209  };
210 
214  static void serializeProfile(std::ostream &out, std::ostream &async_out)
215  {
216  map::iterator entry;
217  double total_time = 0.0;
218  double async_total_time = 0.0;
219 
220  // first let's sort the entries in decreasing order of significance
221  typedef std::pair<TuneKey, TuneParam> profile_t;
222  typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t>> queue_t;
223  queue_t q(tunecache.begin(), tunecache.end());
224 
225  // now compute total time spent in kernels so we can give each kernel a significance
226  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
227  TuneKey key = entry->first;
228  TuneParam param = entry->second;
229 
230  char tmp[TuneKey::aux_n] = {};
231  strncpy(tmp, key.aux, TuneKey::aux_n);
232  bool is_policy_kernel = strncmp(tmp, "policy_kernel", 13) == 0 ? true : false;
233  bool is_policy = (strncmp(tmp, "policy", 6) == 0 && !is_policy_kernel) ? true : false;
234  if (param.n_calls > 0 && !is_policy) total_time += param.n_calls * param.time;
235  if (param.n_calls > 0 && is_policy) async_total_time += param.n_calls * param.time;
236  }
237 
238  while (!q.empty()) {
239  TuneKey key = q.top().first;
240  TuneParam param = q.top().second;
241 
242  char tmp[TuneKey::aux_n] = {};
243  strncpy(tmp, key.aux, TuneKey::aux_n);
244  bool is_policy_kernel = strncmp(tmp, "policy_kernel", 13) == 0 ? true : false;
245  bool is_policy = (strncmp(tmp, "policy", 6) == 0 && !is_policy_kernel) ? true : false;
246  bool is_nested_policy = (strncmp(tmp, "nested_policy", 13) == 0) ? true : false; // nested policies not included
247 
248  // synchronous profile
249  if (param.n_calls > 0 && !is_policy && !is_nested_policy) {
250  double time = param.n_calls * param.time;
251 
252  out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12) << (time / total_time) * 100 << "\t";
253  out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16)
254  << key.volume << "\t";
255  out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
256  }
257 
258  // async policy profile
259  if (param.n_calls > 0 && is_policy) {
260  double time = param.n_calls * param.time;
261 
262  async_out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12)
263  << (time / async_total_time) * 100 << "\t";
264  async_out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16)
265  << key.volume << "\t";
266  async_out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
267  }
268 
269  q.pop();
270  }
271 
272  out << std::endl << "# Total time spent in kernels = " << total_time << " seconds" << std::endl;
273  async_out << std::endl
274  << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
275  }
276 
280  static void serializeTrace(std::ostream &out)
281  {
282  for (auto it = trace_list.begin(); it != trace_list.end(); it++) {
283 
284  TuneKey &key = it->key;
285 
286  // special case kernel members of a policy
287  char tmp[TuneKey::aux_n] = {};
288  strncpy(tmp, key.aux, TuneKey::aux_n);
289  bool is_policy_kernel = strcmp(tmp, "policy_kernel") == 0 ? true : false;
290 
291  out << std::setw(12) << it->time << "\t";
292  out << std::setw(12) << it->device_bytes << "\t";
293  out << std::setw(12) << it->pinned_bytes << "\t";
294  out << std::setw(12) << it->mapped_bytes << "\t";
295  out << std::setw(12) << it->host_bytes << "\t";
296  out << std::setw(16) << key.volume << "\t";
297  if (is_policy_kernel) out << "\t";
298  out << key.name << "\t";
299  if (!is_policy_kernel) out << "\t";
300  out << key.aux << std::endl;
301  }
302  }
303 
307  static void broadcastTuneCache()
308  {
309 #ifdef MULTI_GPU
310  std::stringstream serialized;
311  size_t size;
312 
313  if (comm_rank_global() == 0) {
314  serializeTuneCache(serialized);
315  size = serialized.str().length();
316  }
317  comm_broadcast_global(&size, sizeof(size_t));
318 
319  if (size > 0) {
320  if (comm_rank_global() == 0) {
321  comm_broadcast_global(const_cast<char *>(serialized.str().c_str()), size);
322  } else {
323  char *serstr = new char[size + 1];
324  comm_broadcast_global(serstr, size);
325  serstr[size] = '\0'; // null-terminate
326  serialized.str(serstr);
327  deserializeTuneCache(serialized);
328  delete[] serstr;
329  }
330  }
331 #endif
332  }
333 
334  /*
335  * Read tunecache from disk.
336  */
337  void loadTuneCache()
338  {
339  if (getTuning() == QUDA_TUNE_NO) {
340  warningQuda("Autotuning disabled");
341  return;
342  }
343 
344  char *path;
345  struct stat pstat;
346  std::string cache_path, line, token;
347  std::ifstream cache_file;
348  std::stringstream ls;
349 
350  path = getenv("QUDA_RESOURCE_PATH");
351 
352  if (!path) {
353  warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
354  warningQuda("Caching of tuned parameters will be disabled.");
355  return;
356  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
357  warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
358  warningQuda("Caching of tuned parameters will be disabled.");
359  return;
360  } else {
361  resource_path = path;
362  }
363 
364  bool version_check = true;
365  char *override_version_env = getenv("QUDA_TUNE_VERSION_CHECK");
366  if (override_version_env && strcmp(override_version_env, "0") == 0) {
367  version_check = false;
368  warningQuda("Disabling QUDA tunecache version check");
369  }
370 
371 #ifdef MULTI_GPU
372  if (comm_rank_global() == 0) {
373 #endif
374 
375  cache_path = resource_path;
376  cache_path += "/tunecache.tsv";
377  cache_file.open(cache_path.c_str());
378 
379  if (cache_file) {
380 
381  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
382  getline(cache_file, line);
383  ls.str(line);
384  ls >> token;
385  if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
386  ls >> token;
387  if (version_check && token.compare(quda_version))
388  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
389  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
390  cache_path.c_str());
391  ls >> token;
392 #ifdef GITVERSION
393  if (version_check && token.compare(gitversion))
394  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
395  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
396  cache_path.c_str());
397 #else
398  if (version_check && token.compare(quda_version))
399  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
400  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
401  cache_path.c_str());
402 #endif
403  ls >> token;
404  if (version_check && token.compare(quda_hash))
405  errorQuda("Cache file %s does not match current QUDA build. \nPlease delete this file or set the "
406  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
407  cache_path.c_str());
408 
409  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
410  getline(cache_file, line); // eat the blank line
411 
412  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
413  getline(cache_file, line); // eat the description line
414 
415  deserializeTuneCache(cache_file);
416 
417  cache_file.close();
418  initial_cache_size = tunecache.size();
419 
420  if (getVerbosity() >= QUDA_SUMMARIZE) {
421  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size),
422  cache_path.c_str());
423  }
424 
425  } else {
426  warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
427  }
428 
429 #ifdef MULTI_GPU
430  }
431 #endif
432 
433  broadcastTuneCache();
434  }
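// [Editor's illustration -- not part of tune.cpp] Enabling the on-disk cache that
// loadTuneCache() reads: QUDA_RESOURCE_PATH must name an existing directory, and
// QUDA_TUNE_VERSION_CHECK=0 skips the version/hash comparison performed above. The
// directory path below is a placeholder; setenv() is POSIX.

#include <cstdlib>

int main()
{
  setenv("QUDA_RESOURCE_PATH", "/path/to/quda/resource/dir", 1); // placeholder path to an existing directory
  // setenv("QUDA_TUNE_VERSION_CHECK", "0", 1);                  // optional: accept a cache written by another build
  // ... initialize QUDA; tunecache.tsv is read from, and later written back to, this directory ...
  return 0;
}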
435 
439  void saveTuneCache(bool error)
440  {
441  time_t now;
442  int lock_handle;
443  std::string lock_path, cache_path;
444  std::ofstream cache_file;
445 
446  if (resource_path.empty()) return;
447 
448  // FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
449  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
450  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
451 
452 #ifdef MULTI_GPU
453  if (comm_rank_global() == 0) {
454 #endif
455 
456  if (tunecache.size() == initial_cache_size && !error) return;
457 
458  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
459  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
460  lock_path = resource_path + "/tunecache.lock";
461  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
462  if (lock_handle == -1) {
463  warningQuda("Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
464  "If you are certain that no other instances of QUDA are accessing this filesystem, "
465  "please manually remove %s",
466  lock_path.c_str());
467  return;
468  }
469  char msg[] = "If no instances of applications using QUDA are running,\n"
470  "this lock file shouldn't be here and is safe to delete.";
471  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
472  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
473 
474  cache_path = resource_path + (error ? "/tunecache_error.tsv" : "/tunecache.tsv");
475  cache_file.open(cache_path.c_str());
476 
477  if (getVerbosity() >= QUDA_SUMMARIZE) {
478  printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
479  }
480 
481  time(&now);
482  cache_file << "tunecache\t" << quda_version;
483 #ifdef GITVERSION
484  cache_file << "\t" << gitversion;
485 #else
486  cache_file << "\t" << quda_version;
487 #endif
488  cache_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
489  cache_file << std::setw(16) << "volume"
490  << "\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux."
491  "z\taux.w\ttime\tcomment"
492  << std::endl;
493  serializeTuneCache(cache_file);
494  cache_file.close();
495 
496  // Release lock.
497  close(lock_handle);
498  remove(lock_path.c_str());
499 
500  initial_cache_size = tunecache.size();
501 
502 #ifdef MULTI_GPU
503  } else {
504  // give process 0 time to write out its tunecache if needed, but
505  // don't cause a hang if the error was not triggered on process 0
506  if (error) sleep(10);
507  }
508 #endif
509  }
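// [Editor's illustration -- not part of tune.cpp] The exclusive-create idiom used above
// for tunecache.lock: open() with O_CREAT | O_EXCL fails if the file already exists, so
// whichever process creates the file first holds the lock until it remove()s it. The
// lock-file path below is a placeholder.

#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main()
{
  int fd = open("/tmp/example.lock", O_WRONLY | O_CREAT | O_EXCL, 0666);
  if (fd == -1) { std::perror("lock is already held"); return 1; }
  // ... critical section: write the shared file ...
  close(fd);
  std::remove("/tmp/example.lock"); // release the lock
  return 0;
}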
510 
511  static bool policy_tuning = false;
512  bool policyTuning() { return policy_tuning; }
513 
514  void setPolicyTuning(bool policy_tuning_) { policy_tuning = policy_tuning_; }
515 
516  static bool uber_tuning = false;
517  bool uberTuning() { return uber_tuning; }
518 
519  void setUberTuning(bool uber_tuning_) { uber_tuning = uber_tuning_; }
520 
521  // flush profile, setting counts to zero
522  void flushProfile()
523  {
524  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
525  // set all n_calls = 0
526  TuneParam &param = entry->second;
527  param.n_calls = 0;
528  }
529  }
530 
531  // save profile
532  void saveProfile(const std::string label)
533  {
534  time_t now;
535  int lock_handle;
536  std::string lock_path, profile_path, async_profile_path, trace_path;
537  std::ofstream profile_file, async_profile_file, trace_file;
538 
539  if (resource_path.empty()) return;
540 
541 #ifdef MULTI_GPU
542  if (comm_rank_global() == 0) { // Make sure only one rank is writing to disk
543 #endif
544 
545  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
546  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
547  lock_path = resource_path + "/profile.lock";
548  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
549  if (lock_handle == -1) {
550  warningQuda("Unable to lock profile file. Profile will not be saved to disk. "
551  "If you are certain that no other instances of QUDA are accessing this filesystem, "
552  "please manually remove %s",
553  lock_path.c_str());
554  return;
555  }
556  char msg[] = "If no instances of applications using QUDA are running,\n"
557  "this lock file shouldn't be here and is safe to delete.";
558  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
559  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
560 
561  // profile counter for writing out unique profiles
562  static int count = 0;
563 
564  char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");
565 
566  if (!profile_fname) {
567  warningQuda(
568  "Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
569  profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
570  async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
571  if (traceEnabled()) trace_path = resource_path + "/trace_" + std::to_string(count) + ".tsv";
572  } else {
573  profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
574  async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
575  if (traceEnabled())
576  trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
577  }
578 
579  count++;
580 
581  profile_file.open(profile_path.c_str());
582  async_profile_file.open(async_profile_path.c_str());
583  if (traceEnabled()) trace_file.open(trace_path.c_str());
584 
585  if (getVerbosity() >= QUDA_SUMMARIZE) {
586  // compute number of non-zero entries that will be output in the profile
587  int n_entry = 0;
588  int n_policy = 0;
589  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
590  // if a policy entry, then we can ignore
591  char tmp[TuneKey::aux_n] = {};
592  strncpy(tmp, entry->first.aux, TuneKey::aux_n);
593  TuneParam param = entry->second;
594  bool is_policy = strcmp(tmp, "policy") == 0 ? true : false;
595  if (param.n_calls > 0 && !is_policy) n_entry++;
596  if (param.n_calls > 0 && is_policy) n_policy++;
597  }
598 
599  printfQuda("Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
600  printfQuda("Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
601  if (traceEnabled())
602  printfQuda("Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
603  }
604 
605  time(&now);
606 
607  std::string Label = label.empty() ? "profile" : label;
608 
609  profile_file << Label << "\t" << quda_version;
610 #ifdef GITVERSION
611  profile_file << "\t" << gitversion;
612 #else
613  profile_file << "\t" << quda_version;
614 #endif
615  profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
616  profile_file << std::setw(12) << "total time"
617  << "\t" << std::setw(12) << "percentage"
618  << "\t" << std::setw(12) << "calls"
619  << "\t" << std::setw(12) << "time / call"
620  << "\t" << std::setw(16) << "volume"
621  << "\tname\taux\tcomment" << std::endl;
622 
623  async_profile_file << Label << "\t" << quda_version;
624 #ifdef GITVERSION
625  async_profile_file << "\t" << gitversion;
626 #else
627  async_profile_file << "\t" << quda_version;
628 #endif
629  async_profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
630  async_profile_file << std::setw(12) << "total time"
631  << "\t" << std::setw(12) << "percentage"
632  << "\t" << std::setw(12) << "calls"
633  << "\t" << std::setw(12) << "time / call"
634  << "\t" << std::setw(16) << "volume"
635  << "\tname\taux\tcomment" << std::endl;
636 
637  serializeProfile(profile_file, async_profile_file);
638 
639  profile_file.close();
640  async_profile_file.close();
641 
642  if (traceEnabled()) {
643  trace_file << "trace"
644  << "\t" << quda_version;
645 #ifdef GITVERSION
646  trace_file << "\t" << gitversion;
647 #else
648  trace_file << "\t" << quda_version;
649 #endif
650  trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
651 
652  trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
653  trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
654  trace_file << std::setw(16) << "volume"
655  << "\tname\taux" << std::endl;
656 
657  serializeTrace(trace_file);
658 
659  trace_file.close();
660  }
661 
662  // Release lock.
663  close(lock_handle);
664  remove(lock_path.c_str());
665 
666 #ifdef MULTI_GPU
667  }
668 #endif
669  }
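// [Editor's note -- not part of tune.cpp] With QUDA_PROFILE_OUTPUT_BASE unset, the code
// above writes profile_<n>.tsv and profile_async_<n>.tsv (plus trace_<n>.tsv when tracing
// is enabled) into QUDA_RESOURCE_PATH, where <n> is the static counter incremented on each
// saveProfile() call. With QUDA_PROFILE_OUTPUT_BASE=myrun (a placeholder name) the files
// become myrun_<n>.tsv, myrun_<n>_async.tsv and myrun_trace_<n>.tsv.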
670 
671  static TimeProfile launchTimer("tuneLaunch");
672 
677  TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
678  {
679 #ifdef LAUNCH_TIMER
680  launchTimer.TPSTART(QUDA_PROFILE_TOTAL);
681  launchTimer.TPSTART(QUDA_PROFILE_INIT);
682 #endif
683 
684  TuneKey key = tunable.tuneKey();
685  if (use_managed_memory()) strcat(key.aux, ",managed");
686  last_key = key;
687 
688 #ifdef LAUNCH_TIMER
689  launchTimer.TPSTOP(QUDA_PROFILE_INIT);
690  launchTimer.TPSTART(QUDA_PROFILE_PREAMBLE);
691 #endif
692 
693  static const Tunable *active_tunable; // for error checking
694  it = tunecache.find(key);
695 
696  // first check if we have the tuned value and return if we have it
697  if (enabled == QUDA_TUNE_YES && it != tunecache.end()) {
698 
699 #ifdef LAUNCH_TIMER
700  launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE);
701  launchTimer.TPSTART(QUDA_PROFILE_COMPUTE);
702 #endif
703 
704  TuneParam &param_tuned = it->second;
705 
706  if (verbosity >= QUDA_DEBUG_VERBOSE) {
707  printfQuda("Launching %s with %s at vol=%s with %s\n", key.name, key.aux, key.volume,
708  tunable.paramString(param_tuned).c_str());
709  }
710 
711 #ifdef LAUNCH_TIMER
712  launchTimer.TPSTOP(QUDA_PROFILE_COMPUTE);
713  launchTimer.TPSTART(QUDA_PROFILE_EPILOGUE);
714 #endif
715 
716  tunable.checkLaunchParam(param_tuned);
717 
718  // we could be tuning outside of the current scope
719  if (!tuning && profile_count) param_tuned.n_calls++;
720 
721 #ifdef LAUNCH_TIMER
722  launchTimer.TPSTOP(QUDA_PROFILE_EPILOGUE);
723  launchTimer.TPSTOP(QUDA_PROFILE_TOTAL);
724 #endif
725 
726  if (traceEnabled() >= 2) {
727  TraceKey trace_entry(key, param_tuned.time);
728  trace_list.push_back(trace_entry);
729  }
730 
731  return param_tuned;
732  }
733 
734 #ifdef LAUNCH_TIMER
735  launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE);
736  launchTimer.TPSTOP(QUDA_PROFILE_TOTAL);
737 #endif
738 
739  static TuneParam param;
740 
741  if (enabled == QUDA_TUNE_NO) {
742  TuneParam param_default;
743  tunable.defaultTuneParam(param_default);
744  tunable.checkLaunchParam(param_default);
745  if (verbosity >= QUDA_DEBUG_VERBOSE) {
746  printfQuda("Launching %s with %s at vol=%s with %s (untuned)\n", key.name, key.aux, key.volume,
747  tunable.paramString(param_default).c_str());
748  }
749 
750  return param_default;
751  } else if (!tuning) {
752 
753  /* As long as global reductions are not disabled, only do the
754  tuning on node 0, else do the tuning on all nodes since we
755  can't guarantee that all nodes are partaking */
756  if (comm_rank_global() == 0 || !commGlobalReduction() || policyTuning() || uberTuning()) {
757  TuneParam best_param;
758  cudaError_t error = cudaSuccess;
759  cudaEvent_t start, end;
760  float elapsed_time, best_time;
761  time_t now;
762 
763  tuning = true;
764  active_tunable = &tunable;
765  best_time = FLT_MAX;
766 
767  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PreTune %s\n", key.name);
768  tunable.preTune();
769 
770  cudaEventCreate(&start);
771  cudaEventCreate(&end);
772 
773  if (verbosity >= QUDA_DEBUG_VERBOSE) {
774  printfQuda("Tuning %s with %s at vol=%s\n", key.name, key.aux, key.volume);
775  }
776 
777  Timer tune_timer;
778  tune_timer.Start(__func__, __FILE__, __LINE__);
779 
780  tunable.initTuneParam(param);
781  while (tuning) {
782  cudaDeviceSynchronize();
783  cudaGetLastError(); // clear error counter
784  tunable.checkLaunchParam(param);
785  if (verbosity >= QUDA_DEBUG_VERBOSE) {
786  printfQuda("About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
787  param.block.x, param.block.y, param.block.z, param.grid.x, param.grid.y, param.grid.z,
788  param.shared_bytes, param.aux.x, param.aux.y, param.aux.z);
789  }
790  tunable.apply(0); // do initial call in case we need to jit compile for these parameters or if policy tuning
791 
792  cudaEventRecord(start, 0);
793  for (int i = 0; i < tunable.tuningIter(); i++) {
794  tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
795  }
796  cudaEventRecord(end, 0);
797  cudaEventSynchronize(end);
798  cudaEventElapsedTime(&elapsed_time, start, end);
799  cudaDeviceSynchronize();
800  error = cudaGetLastError();
801 
802  { // check that error state is cleared
803  cudaDeviceSynchronize();
804  cudaError_t error = cudaGetLastError();
805  if (error != cudaSuccess) errorQuda("Failed to clear error state %s\n", cudaGetErrorString(error));
806  }
807 
808  elapsed_time /= (1e3 * tunable.tuningIter());
809  if ((elapsed_time < best_time) && (error == cudaSuccess) && (tunable.jitifyError() == CUDA_SUCCESS)) {
810  best_time = elapsed_time;
811  best_param = param;
812  }
813  if ((verbosity >= QUDA_DEBUG_VERBOSE)) {
814  if (error == cudaSuccess && tunable.jitifyError() == CUDA_SUCCESS) {
815  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(),
816  tunable.perfString(elapsed_time).c_str());
817  } else {
818  if (tunable.jitifyError() == CUDA_SUCCESS) {
819  // if not jitify error must be regular error
820  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
821  } else {
822  // else is a jitify error
823  const char *str;
824  cuGetErrorString(tunable.jitifyError(), &str);
825  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), str);
826  }
827  }
828  }
829  tuning = tunable.advanceTuneParam(param);
830  tunable.jitifyError() = CUDA_SUCCESS;
831  }
832 
833  tune_timer.Stop(__func__, __FILE__, __LINE__);
834 
835  if (best_time == FLT_MAX) {
836  errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume);
837  }
838  if (verbosity >= QUDA_VERBOSE) {
839  printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
840  tunable.perfString(best_time).c_str(), key.name, key.aux);
841  }
842  time(&now);
843  best_param.comment = "# " + tunable.perfString(best_time);
844  best_param.comment += ", tuning took " + std::to_string(tune_timer.Last()) + " seconds at ";
845  best_param.comment += ctime(&now); // includes a newline
846  best_param.time = best_time;
847 
848  cudaEventDestroy(start);
849  cudaEventDestroy(end);
850 
851  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PostTune %s\n", key.name);
852  tuning = true;
853  tunable.postTune();
854  tuning = false;
855  param = best_param;
856  tunecache[key] = best_param;
857  }
858  if (commGlobalReduction() || policyTuning() || uberTuning()) { broadcastTuneCache(); }
859 
860  // check this process is getting the key that is expected
861  if (tunecache.find(key) == tunecache.end()) {
862  errorQuda("Failed to find key entry (%s:%s:%s)", key.name, key.volume, key.aux);
863  }
864  param = tunecache[key]; // read this now for all processes
865 
866  if (traceEnabled() >= 2) {
867  TraceKey trace_entry(key, param.time);
868  trace_list.push_back(trace_entry);
869  }
870 
871  } else if (&tunable != active_tunable) {
872  errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
873  }
874 
875  param.n_calls = profile_count ? 1 : 0;
876 
877  return param;
878  }
879 
880  void printLaunchTimer()
881  {
882 #ifdef LAUNCH_TIMER
883  launchTimer.Print();
884 #endif
885  }
886 } // namespace quda
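For orientation, here is an editor's sketch of the control flow at the heart of tuneLaunch() above: start from an initial candidate, time each candidate over tuningIter() repetitions, keep the fastest, and stop when advanceTuneParam() reports that the search space is exhausted. The sketch is deliberately independent of the QUDA API: ToyParam and ToyTunable are hypothetical stand-ins, and a host-side sleep replaces the CUDA-event timing of real kernel launches.

// Toy model of the benchmarking loop inside tuneLaunch(): iterate over candidate
// launch parameters, time each one, and keep the fastest. The real code times with
// CUDA events and stores the winner in the tunecache map keyed by TuneKey.
#include <chrono>
#include <cfloat>
#include <cstdio>
#include <thread>

struct ToyParam { int block = 32; };

struct ToyTunable {
  void initTuneParam(ToyParam &p) const { p.block = 32; }
  bool advanceTuneParam(ToyParam &p) const { p.block *= 2; return p.block <= 1024; }
  void apply(const ToyParam &p) const {            // pretend "kernel": larger blocks run faster here
    std::this_thread::sleep_for(std::chrono::microseconds(2048 / p.block));
  }
  int tuningIter() const { return 10; }
};

int main()
{
  ToyTunable t;
  ToyParam param, best_param;
  float best_time = FLT_MAX;

  t.initTuneParam(param);
  bool tuning = true;
  while (tuning) {
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < t.tuningIter(); i++) t.apply(param);
    auto end = std::chrono::steady_clock::now();
    float elapsed = std::chrono::duration<float>(end - start).count() / t.tuningIter();
    if (elapsed < best_time) { best_time = elapsed; best_param = param; }
    tuning = t.advanceTuneParam(param);            // returns false once all candidates are exhausted
  }
  std::printf("best block = %d (%g s/call)\n", best_param.block, best_time);
  return 0;
}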