QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
tune.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <comm_quda.h>
3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <list>
12 #include <unistd.h>
13 #include <uint_to_char.h>
14 
15 #include <deque>
16 #include <queue>
17 #include <functional>
18 
19 //#define LAUNCH_TIMER
20 extern char* gitversion;
21 
22 namespace quda { static TuneKey last_key; }
23 
24 // intentionally leave this outside of the namespace for now
26 
27 namespace quda {
28  typedef std::map<TuneKey, TuneParam> map;
29 
30  struct TraceKey {
31 
33  float time;
34 
38  long host_bytes;
39 
40  TraceKey() { }
41 
42  TraceKey(const TuneKey &key, float time)
43  : key(key), time(time),
44  device_bytes(device_allocated_peak()),
45  pinned_bytes(pinned_allocated_peak()),
46  mapped_bytes(mapped_allocated_peak()),
47  host_bytes(host_allocated_peak()) { }
48 
49  TraceKey(const TraceKey &trace)
50  : key(trace.key), time(trace.time),
51  device_bytes(trace.device_bytes),
52  pinned_bytes(trace.pinned_bytes),
53  mapped_bytes(trace.mapped_bytes),
54  host_bytes(trace.host_bytes) { }
55 
56  TraceKey& operator=(const TraceKey &trace) {
57  if (&trace != this) {
58  key = trace.key;
59  time = trace.time;
60  device_bytes = trace.device_bytes;
61  pinned_bytes = trace.pinned_bytes;
62  mapped_bytes = trace.mapped_bytes;
63  host_bytes = trace.host_bytes;
64  }
65  return *this;
66  }
67  };
68 
69  // linked list that is augmented each time we call a kernel
70  static std::list<TraceKey> trace_list;
71  static int enable_trace = 0;
72 
73  int traceEnabled() {
74  static bool init = false;
75 
76  if (!init) {
77  char *enable_trace_env = getenv("QUDA_ENABLE_TRACE");
78  if (enable_trace_env) {
79  if (strcmp(enable_trace_env, "1") == 0) {
80  // only explicitly posted trace events are included
81  enable_trace = 1;
82  } else if (strcmp(enable_trace_env, "2") == 0) {
83  // enable full kernel trace and posted trace events
84  enable_trace = 2;
85  }
86  }
87  init = true;
88  }
89  return enable_trace;
90  }
91 
92  void postTrace_(const char *func, const char *file, int line) {
93  if (traceEnabled() >= 1) {
94  char aux[TuneKey::aux_n];
95  strcpy(aux,file);
96  strcat(aux,":");
97  char tmp[TuneKey::aux_n];
98  i32toa(tmp,line);
99  strcat(aux,tmp);
100  TuneKey key("", func, aux);
101  TraceKey trace_entry(key, 0.0);
102  trace_list.push_back(trace_entry);
103  }
104  }
105 
106  static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
107  static std::string resource_path;
108  static map tunecache;
109  static map::iterator it;
110  static size_t initial_cache_size = 0;
111 
112 #define STR_(x) #x
113 #define STR(x) STR_(x)
115 #undef STR
116 #undef STR_
117 
119  static bool tuning = false;
120 
121  bool activeTuning() { return tuning; }
122 
123  static bool profile_count = true;
124 
125  void disableProfileCount() { profile_count = false; }
126  void enableProfileCount() { profile_count = true; }
127 
128  const map& getTuneCache() { return tunecache; }
129 
130 
134  static void deserializeTuneCache(std::istream &in)
135  {
136  std::string line;
137  std::stringstream ls;
138 
139  TuneKey key;
141 
142  std::string v;
143  std::string n;
144  std::string a;
145 
146  int check;
147 
148  while (in.good()) {
149  getline(in, line);
150  if (!line.length()) continue; // skip blank lines (e.g., at end of file)
151  ls.clear();
152  ls.str(line);
153  ls >> v >> n >> a >> param.block.x >> param.block.y >> param.block.z;
154  check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
155  if (check < 0 || check >= key.volume_n) errorQuda("Error writing volume string (check = %d)", check);
156  check = snprintf(key.name, key.name_n, "%s", n.c_str());
157  if (check < 0 || check >= key.name_n) errorQuda("Error writing name string (check=%d)", check);
158  check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
159  if (check < 0 || check >= key.aux_n) errorQuda("Error writing aux string (check=%d)", check);
160  ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes >> param.aux.x >> param.aux.y >> param.aux.z >> param.aux.w >> param.time;
161  ls.ignore(1); // throw away tab before comment
162  getline(ls, param.comment); // assume anything remaining on the line is a comment
163  param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this
164  tunecache[key] = param;
165  }
166  }
167 
168 
172  static void serializeTuneCache(std::ostream &out)
173  {
174  map::iterator entry;
175 
176  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
177  TuneKey key = entry->first;
178  TuneParam param = entry->second;
179 
180  out << std::setw(16) << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
181  out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
182  out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
183  out << param.shared_bytes << "\t" << param.aux.x << "\t" << param.aux.y << "\t" << param.aux.z << "\t" << param.aux.w << "\t";
184  out << param.time << "\t" << param.comment; // param.comment ends with a newline
185  }
186  }
187 
188 
189  template <class T>
190  struct less_significant : std::binary_function<T,T,bool> {
191  inline bool operator()(const T &lhs, const T &rhs) {
192  return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
193  }
194  };
195 
199  static void serializeProfile(std::ostream &out, std::ostream &async_out)
200  {
201  map::iterator entry;
202  double total_time = 0.0;
203  double async_total_time = 0.0;
204 
205  // first let's sort the entries in decreasing order of significance
206  typedef std::pair<TuneKey, TuneParam> profile_t;
207  typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t> > queue_t;
208  queue_t q(tunecache.begin(), tunecache.end());
209 
210  // now compute total time spent in kernels so we can give each kernel a significance
211  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
212  TuneKey key = entry->first;
213  TuneParam param = entry->second;
214 
215  char tmp[14] = { };
216  strncpy(tmp, key.aux, 13);
217  bool is_policy_kernel = strncmp(tmp, "policy_kernel", 13) == 0 ? true : false;
218  bool is_policy = (strncmp(tmp, "policy", 6) == 0 && !is_policy_kernel) ? true : false;
219  if (param.n_calls > 0 && !is_policy) total_time += param.n_calls * param.time;
220  if (param.n_calls > 0 && is_policy) async_total_time += param.n_calls * param.time;
221  }
222 
223 
224  while ( !q.empty() ) {
225  TuneKey key = q.top().first;
226  TuneParam param = q.top().second;
227 
228  char tmp[TuneKey::aux_n] = { };
229  strncpy(tmp, key.aux, TuneKey::aux_n);
230  bool is_policy_kernel = strncmp(tmp, "policy_kernel", 13) == 0 ? true : false;
231  bool is_policy = (strncmp(tmp, "policy", 6) == 0 && !is_policy_kernel) ? true : false;
232 
233  // synchronous profile
234  if (param.n_calls > 0 && !is_policy) {
235  double time = param.n_calls * param.time;
236 
237  out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12) << (time / total_time) * 100 << "\t";
238  out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16) << key.volume << "\t";
239  out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
240  }
241 
242  // async policy profile
243  if (param.n_calls > 0 && is_policy) {
244  double time = param.n_calls * param.time;
245 
246  async_out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12) << (time / async_total_time) * 100 << "\t";
247  async_out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16) << key.volume << "\t";
248  async_out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
249  }
250 
251  q.pop();
252  }
253 
254  out << std::endl << "# Total time spent in kernels = " << total_time << " seconds" << std::endl;
255  async_out << std::endl << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
256  }
257 
261  static void serializeTrace(std::ostream &out)
262  {
263  for (auto it = trace_list.begin(); it != trace_list.end(); it++) {
264 
265  TuneKey &key = it->key;
266 
267  // special case kernel members of a policy
268  char tmp[TuneKey::aux_n] = { };
269  strncpy(tmp, key.aux, TuneKey::aux_n);
270  bool is_policy_kernel = strcmp(tmp, "policy_kernel") == 0 ? true : false;
271 
272  out << std::setw(12) << it->time << "\t";
273  out << std::setw(12) << it->device_bytes << "\t";
274  out << std::setw(12) << it->pinned_bytes << "\t";
275  out << std::setw(12) << it->mapped_bytes << "\t";
276  out << std::setw(12) << it->host_bytes << "\t";
277  out << std::setw(16) << key.volume << "\t";
278  if (is_policy_kernel) out << "\t";
279  out << key.name << "\t";
280  if (!is_policy_kernel) out << "\t";
281  out << key.aux << std::endl;
282 
283  }
284  }
285 
286 
290  static void broadcastTuneCache()
291  {
292 #ifdef MULTI_GPU
293 
294  std::stringstream serialized;
295  size_t size;
296 
297  if (comm_rank() == 0) {
298  serializeTuneCache(serialized);
299  size = serialized.str().length();
300  }
301  comm_broadcast(&size, sizeof(size_t));
302 
303  if (size > 0) {
304  if (comm_rank() == 0) {
305  comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
306  } else {
307  char *serstr = new char[size+1];
308  comm_broadcast(serstr, size);
309  serstr[size] ='\0'; // null-terminate
310  serialized.str(serstr);
311  deserializeTuneCache(serialized);
312  delete[] serstr;
313  }
314  }
315 #endif
316  }
317 
318 
319  /*
320  * Read tunecache from disk.
321  */
323  {
324  if (getTuning() == QUDA_TUNE_NO) {
325  warningQuda("Autotuning disabled");
326  return;
327  }
328 
329  char *path;
330  struct stat pstat;
331  std::string cache_path, line, token;
332  std::ifstream cache_file;
333  std::stringstream ls;
334 
335  path = getenv("QUDA_RESOURCE_PATH");
336 
337  if (!path) {
338  warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
339  warningQuda("Caching of tuned parameters will be disabled.");
340  return;
341  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
342  warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
343  warningQuda("Caching of tuned parameters will be disabled.");
344  return;
345  } else {
346  resource_path = path;
347  }
348 
349  bool version_check = true;
350  char *override_version_env = getenv("QUDA_TUNE_VERSION_CHECK");
351  if (override_version_env && strcmp(override_version_env, "0") == 0) {
352  version_check = false;
353  warningQuda("Disabling QUDA tunecache version check");
354  }
355 
356 #ifdef MULTI_GPU
357  if (comm_rank() == 0) {
358 #endif
359 
360  cache_path = resource_path;
361  cache_path += "/tunecache.tsv";
362  cache_file.open(cache_path.c_str());
363 
364  if (cache_file) {
365 
366  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
367  getline(cache_file, line);
368  ls.str(line);
369  ls >> token;
370  if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
371  ls >> token;
372  if (version_check && token.compare(quda_version))
373  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
374  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
375  cache_path.c_str());
376  ls >> token;
377 #ifdef GITVERSION
378  if (version_check && token.compare(gitversion))
379  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
380  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
381  cache_path.c_str());
382 #else
383  if (version_check && token.compare(quda_version))
384  errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
385  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
386  cache_path.c_str());
387 #endif
388  ls >> token;
389  if (version_check && token.compare(quda_hash))
390  errorQuda("Cache file %s does not match current QUDA build. \nPlease delete this file or set the "
391  "QUDA_RESOURCE_PATH environment variable to point to a new path.",
392  cache_path.c_str());
393 
394  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
395  getline(cache_file, line); // eat the blank line
396 
397  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
398  getline(cache_file, line); // eat the description line
399 
400  deserializeTuneCache(cache_file);
401 
402  cache_file.close();
403  initial_cache_size = tunecache.size();
404 
405  if (getVerbosity() >= QUDA_SUMMARIZE) {
406  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
407  }
408 
409 
410  } else {
411  warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
412  }
413 
414 #ifdef MULTI_GPU
415  }
416 #endif
417 
418 
420  }
421 
422 
426  void saveTuneCache(bool error)
427  {
428  time_t now;
429  int lock_handle;
430  std::string lock_path, cache_path;
431  std::ofstream cache_file;
432 
433  if (resource_path.empty()) return;
434 
435  //FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
436  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
437  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
438 
439 #ifdef MULTI_GPU
440  if (comm_rank() == 0) {
441 #endif
442 
443  if (tunecache.size() == initial_cache_size && !error) return;
444 
445  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
446  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
447  lock_path = resource_path + "/tunecache.lock";
448  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
449  if (lock_handle == -1) {
450  warningQuda("Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
451  "If you are certain that no other instances of QUDA are accessing this filesystem, "
452  "please manually remove %s", lock_path.c_str());
453  return;
454  }
455  char msg[] = "If no instances of applications using QUDA are running,\n"
456  "this lock file shouldn't be here and is safe to delete.";
457  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
458  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
459 
460  cache_path = resource_path + (error ? "/tunecache_error.tsv" : "/tunecache.tsv");
461  cache_file.open(cache_path.c_str());
462 
463  if (getVerbosity() >= QUDA_SUMMARIZE) {
464  printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
465  }
466 
467  time(&now);
468  cache_file << "tunecache\t" << quda_version;
469 #ifdef GITVERSION
470  cache_file << "\t" << gitversion;
471 #else
472  cache_file << "\t" << quda_version;
473 #endif
474  cache_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
475  cache_file << std::setw(16) << "volume" << "\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux.z\taux.w\ttime\tcomment" << std::endl;
476  serializeTuneCache(cache_file);
477  cache_file.close();
478 
479  // Release lock.
480  close(lock_handle);
481  remove(lock_path.c_str());
482 
483  initial_cache_size = tunecache.size();
484 
485 #ifdef MULTI_GPU
486  } else {
487  // give process 0 time to write out its tunecache if needed, but
488  // doesn't cause a hang if error is not triggered on process 0
489  if (error) sleep(10);
490  }
491 #endif
492  }
493 
494  static bool policy_tuning = false;
495  bool policyTuning() {
496  return policy_tuning;
497  }
498 
499  void setPolicyTuning(bool policy_tuning_) {
500  policy_tuning = policy_tuning_;
501  }
502 
503  // flush profile, setting counts to zero
505  {
506  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
507  // set all n_calls = 0
508  TuneParam &param = entry->second;
509  param.n_calls = 0;
510  }
511  }
512 
513  // save profile
514  void saveProfile(const std::string label)
515  {
516  time_t now;
517  int lock_handle;
518  std::string lock_path, profile_path, async_profile_path, trace_path;
519  std::ofstream profile_file, async_profile_file, trace_file;
520 
521  if (resource_path.empty()) return;
522 
523 #ifdef MULTI_GPU
524  if (comm_rank() == 0) {
525 #endif
526 
527  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
528  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
529  lock_path = resource_path + "/profile.lock";
530  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
531  if (lock_handle == -1) {
532  warningQuda("Unable to lock profile file. Profile will not be saved to disk. "
533  "If you are certain that no other instances of QUDA are accessing this filesystem, "
534  "please manually remove %s", lock_path.c_str());
535  return;
536  }
537  char msg[] = "If no instances of applications using QUDA are running,\n"
538  "this lock file shouldn't be here and is safe to delete.";
539  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
540  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
541 
542  // profile counter for writing out unique profiles
543  static int count = 0;
544 
545  char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");
546 
547  if (!profile_fname) {
548  warningQuda("Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
549  profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
550  async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
551  if (traceEnabled()) trace_path = resource_path + "/trace_" + std::to_string(count) + ".tsv";
552  } else {
553  profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
554  async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
555  if (traceEnabled()) trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
556  }
557 
558  count++;
559 
560  profile_file.open(profile_path.c_str());
561  async_profile_file.open(async_profile_path.c_str());
562  if (traceEnabled()) trace_file.open(trace_path.c_str());
563 
564  if (getVerbosity() >= QUDA_SUMMARIZE) {
565  // compute number of non-zero entries that will be output in the profile
566  int n_entry = 0;
567  int n_policy = 0;
568  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
569  // if a policy entry, then we can ignore
570  char tmp[TuneKey::aux_n] = { };
571  strncpy(tmp, entry->first.aux, TuneKey::aux_n);
572  TuneParam param = entry->second;
573  bool is_policy = strcmp(tmp, "policy") == 0 ? true : false;
574  if (param.n_calls > 0 && !is_policy) n_entry++;
575  if (param.n_calls > 0 && is_policy) n_policy++;
576  }
577 
578  printfQuda("Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
579  printfQuda("Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
580  if (traceEnabled()) printfQuda("Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
581  }
582 
583  time(&now);
584 
585  std::string Label = label.empty() ? "profile" : label;
586 
587  profile_file << Label << "\t" << quda_version;
588 #ifdef GITVERSION
589  profile_file << "\t" << gitversion;
590 #else
591  profile_file << "\t" << quda_version;
592 #endif
593  profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
594  profile_file << std::setw(12) << "total time" << "\t" << std::setw(12) << "percentage" << "\t" << std::setw(12) << "calls" << "\t" << std::setw(12) << "time / call" << "\t" << std::setw(16) << "volume" << "\tname\taux\tcomment" << std::endl;
595 
596  async_profile_file << Label << "\t" << quda_version;
597 #ifdef GITVERSION
598  async_profile_file << "\t" << gitversion;
599 #else
600  async_profile_file << "\t" << quda_version;
601 #endif
602  async_profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
603  async_profile_file << std::setw(12) << "total time" << "\t" << std::setw(12) << "percentage" << "\t" << std::setw(12) << "calls" << "\t" << std::setw(12) << "time / call" << "\t" << std::setw(16) << "volume" << "\tname\taux\tcomment" << std::endl;
604 
605  serializeProfile(profile_file, async_profile_file);
606 
607  profile_file.close();
608  async_profile_file.close();
609 
610  if (traceEnabled()) {
611  trace_file << "trace" << "\t" << quda_version;
612 #ifdef GITVERSION
613  trace_file << "\t" << gitversion;
614 #else
615  trace_file << "\t" << quda_version;
616 #endif
617  trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
618 
619  trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
620  trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
621  trace_file << std::setw(16) << "volume" << "\tname\taux" << std::endl;
622 
623  serializeTrace(trace_file);
624 
625  trace_file.close();
626  }
627 
628  // Release lock.
629  close(lock_handle);
630  remove(lock_path.c_str());
631 
632 #ifdef MULTI_GPU
633  }
634 #endif
635  }
636 
637  static TimeProfile launchTimer("tuneLaunch");
638 
644  {
645 
646 #ifdef LAUNCH_TIMER
647  launchTimer.TPSTART(QUDA_PROFILE_TOTAL);
648  launchTimer.TPSTART(QUDA_PROFILE_INIT);
649 #endif
650 
651  const TuneKey key = tunable.tuneKey();
652  last_key = key;
653  static TuneParam param;
654 
655 #ifdef LAUNCH_TIMER
656  launchTimer.TPSTOP(QUDA_PROFILE_INIT);
657  launchTimer.TPSTART(QUDA_PROFILE_PREAMBLE);
658 #endif
659 
660  static const Tunable *active_tunable; // for error checking
661  it = tunecache.find(key);
662 
663  // first check if we have the tuned value and return if we have it
664  if (enabled == QUDA_TUNE_YES && it != tunecache.end()) {
665 
666 #ifdef LAUNCH_TIMER
667  launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE);
668  launchTimer.TPSTART(QUDA_PROFILE_COMPUTE);
669 #endif
670 
671  TuneParam &param = it->second;
672 
673  if (verbosity >= QUDA_DEBUG_VERBOSE) {
674  printfQuda("Launching %s with %s at vol=%s with %s\n",
675  key.name, key.aux, key.volume, tunable.paramString(param).c_str());
676  }
677 
678 #ifdef LAUNCH_TIMER
679  launchTimer.TPSTOP(QUDA_PROFILE_COMPUTE);
680  launchTimer.TPSTART(QUDA_PROFILE_EPILOGUE);
681 #endif
682 
683  tunable.checkLaunchParam(param);
684 
685  // we could be tuning outside of the current scope
686  if (!tuning && profile_count) param.n_calls++;
687 
688 #ifdef LAUNCH_TIMER
689  launchTimer.TPSTOP(QUDA_PROFILE_EPILOGUE);
690  launchTimer.TPSTOP(QUDA_PROFILE_TOTAL);
691 #endif
692 
693  if (traceEnabled() >= 2) {
694  TraceKey trace_entry(key, param.time);
695  trace_list.push_back(trace_entry);
696  }
697 
698  return param;
699  }
700 
701 #ifdef LAUNCH_TIMER
702  launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE);
703  launchTimer.TPSTOP(QUDA_PROFILE_TOTAL);
704 #endif
705 
706  if (enabled == QUDA_TUNE_NO) {
707  tunable.defaultTuneParam(param);
708  tunable.checkLaunchParam(param);
709  if (verbosity >= QUDA_DEBUG_VERBOSE) {
710  printfQuda("Launching %s with %s at vol=%s with %s (untuned)\n",
711  key.name, key.aux, key.volume, tunable.paramString(param).c_str());
712  }
713  } else if (!tuning) {
714 
715  /* As long as global reductions are not disabled, only do the
716  tuning on node 0, else do the tuning on all nodes since we
717  can't guarantee that all nodes are partaking */
718  if (comm_rank() == 0 || !commGlobalReduction() || policyTuning()) {
719  TuneParam best_param;
720  cudaError_t error = cudaSuccess;
721  cudaEvent_t start, end;
722  float elapsed_time, best_time;
723  time_t now;
724 
725  tuning = true;
726  active_tunable = &tunable;
727  best_time = FLT_MAX;
728 
729  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PreTune %s\n", key.name);
730  tunable.preTune();
731 
732  cudaEventCreate(&start);
733  cudaEventCreate(&end);
734 
735  if (verbosity >= QUDA_DEBUG_VERBOSE) {
736  printfQuda("Tuning %s with %s at vol=%s\n", key.name, key.aux, key.volume);
737  }
738 
739  Timer tune_timer;
740  tune_timer.Start(__func__, __FILE__, __LINE__);
741 
742  tunable.initTuneParam(param);
743  while (tuning) {
744  cudaDeviceSynchronize();
745  cudaGetLastError(); // clear error counter
746  tunable.checkLaunchParam(param);
747  tunable.apply(0); // do initial call in case we need to jit compile for these parameters or if policy tuning
748  if (verbosity >= QUDA_DEBUG_VERBOSE) {
749  printfQuda("About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
750  param.block.x, param.block.y, param.block.z,
751  param.grid.x, param.grid.y, param.grid.z,
752  param.shared_bytes,
753  param.aux.x, param.aux.y, param.aux.z);
754  }
755 
756  cudaEventRecord(start, 0);
757  for (int i=0; i<tunable.tuningIter(); i++) {
758  tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
759  }
760  cudaEventRecord(end, 0);
761  cudaEventSynchronize(end);
762  cudaEventElapsedTime(&elapsed_time, start, end);
763  cudaDeviceSynchronize();
764  error = cudaGetLastError();
765 
766  { // check that error state is cleared
767  cudaDeviceSynchronize();
768  cudaError_t error = cudaGetLastError();
769  if (error != cudaSuccess) errorQuda("Failed to clear error state %s\n", cudaGetErrorString(error));
770  }
771 
772  elapsed_time /= (1e3 * tunable.tuningIter());
773  if ( (elapsed_time < best_time) && (error == cudaSuccess) && (tunable.jitifyError() == CUDA_SUCCESS) ) {
774  best_time = elapsed_time;
775  best_param = param;
776  }
777  if ((verbosity >= QUDA_DEBUG_VERBOSE)) {
778  if (error == cudaSuccess && tunable.jitifyError() == CUDA_SUCCESS) {
779  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(),
780  tunable.perfString(elapsed_time).c_str());
781  } else {
782  if (tunable.jitifyError() == CUDA_SUCCESS) {
783  // if not jitify error must be regular error
784  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
785  } else {
786  // else is a jitify error
787  const char *str;
788  cuGetErrorString(tunable.jitifyError(), &str);
789  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), str);
790  }
791  }
792  }
793  tuning = tunable.advanceTuneParam(param);
794  tunable.jitifyError() = CUDA_SUCCESS;
795  }
796 
797  tune_timer.Stop(__func__, __FILE__, __LINE__);
798 
799  if (best_time == FLT_MAX) {
800  errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume);
801  }
802  if (verbosity >= QUDA_VERBOSE) {
803  printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
804  tunable.perfString(best_time).c_str(), key.name, key.aux);
805  }
806  time(&now);
807  best_param.comment = "# " + tunable.perfString(best_time);
808  best_param.comment += ", tuning took " + std::to_string(tune_timer.Last()) + " seconds at ";
809  best_param.comment += ctime(&now); // includes a newline
810  best_param.time = best_time;
811 
812  cudaEventDestroy(start);
813  cudaEventDestroy(end);
814 
815  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PostTune %s\n", key.name);
816  tunable.postTune();
817  param = best_param;
818  tunecache[key] = best_param;
819 
820  }
822 
823  // check this process is getting the key that is expected
824  if (tunecache.find(key) == tunecache.end()) {
825  errorQuda("Failed to find key entry (%s:%s:%s)", key.name, key.volume, key.aux);
826  }
827  param = tunecache[key]; // read this now for all processes
828 
829  if (traceEnabled() >= 2) {
830  TraceKey trace_entry(key, param.time);
831  trace_list.push_back(trace_entry);
832  }
833 
834  } else if (&tunable != active_tunable) {
835  errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
836  }
837 
838  param.n_calls = profile_count ? 1 : 0;
839 
840  return param;
841  }
842 
844 #ifdef LAUNCH_TIMER
845  launchTimer.Print();
846 #endif
847  }
848 } // namespace quda
static int enable_trace
Definition: tune.cpp:71
int comm_rank(void)
Definition: comm_mpi.cpp:82
CUresult jitifyError() const
Definition: tune_quda.h:375
long device_allocated_peak()
Definition: malloc.cpp:59
float time
Definition: tune.cpp:33
long mapped_bytes
Definition: tune.cpp:37
virtual int tuningIter() const
Definition: tune_quda.h:285
void disableProfileCount()
Disable the profile kernel counting.
Definition: tune.cpp:125
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
void end(void)
Definition: blas_quda.cu:489
#define errorQuda(...)
Definition: util_quda.h:121
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
Definition: tune.cpp:92
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
void saveTuneCache(bool error=false)
Definition: tune.cpp:426
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
long pinned_bytes
Definition: tune.cpp:36
static map tunecache
Definition: tune.cpp:108
static void broadcastTuneCache()
Definition: tune.cpp:290
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:287
quda::TuneKey getLastTuneKey()
Definition: tune.cpp:25
static std::list< TraceKey > trace_list
Definition: tune.cpp:70
virtual TuneKey tuneKey() const =0
char * gitversion
Definition: version.cpp:4
static size_t initial_cache_size
Definition: tune.cpp:110
std::map< TuneKey, TuneParam > map
Definition: tune.cpp:28
void enableProfileCount()
Enable the profile kernel counting.
Definition: tune.cpp:126
void i32toa(char *buffer, int32_t value)
Definition: uint_to_char.h:117
QudaGaugeParam param
Definition: pack_test.cpp:17
static const std::string quda_hash
Definition: tune.cpp:106
TraceKey(const TuneKey &key, float time)
Definition: tune.cpp:42
static bool profile_count
Definition: tune.cpp:123
long device_bytes
Definition: tune.cpp:35
int traceEnabled()
Definition: tune.cpp:73
static void deserializeTuneCache(std::istream &in)
Definition: tune.cpp:134
bool policyTuning()
Definition: tune.cpp:495
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:504
static bool policy_tuning
Definition: tune.cpp:494
cpuColorSpinorField * in
static const std::string quda_version
Definition: tune.cpp:114
static map::iterator it
Definition: tune.cpp:109
bool operator()(const T &lhs, const T &rhs)
Definition: tune.cpp:191
void Print()
Definition: timer.cpp:7
constexpr int size
static bool tuning
Definition: tune.cpp:119
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
char aux[aux_n]
Definition: tune_key.h:15
#define warningQuda(...)
Definition: util_quda.h:133
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
bool commGlobalReduction()
void Start(const char *func, const char *file, int line)
Definition: timer.h:63
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:514
long host_allocated_peak()
Definition: malloc.cpp:65
static void serializeTuneCache(std::ostream &out)
Definition: tune.cpp:172
long mapped_allocated_peak()
Definition: malloc.cpp:63
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31
static std::string resource_path
Definition: tune.cpp:107
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
Definition: tune.cpp:499
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:121
TraceKey(const TraceKey &trace)
Definition: tune.cpp:49
double Last()
Definition: timer.h:88
static TuneKey last_key
Definition: tune.cpp:22
TraceKey & operator=(const TraceKey &trace)
Definition: tune.cpp:56
static void serializeTrace(std::ostream &out)
Definition: tune.cpp:261
virtual void preTune()
Definition: tune_quda.h:283
virtual void postTune()
Definition: tune_quda.h:284
void comm_broadcast(void *data, size_t nbytes)
Definition: comm_mpi.cpp:321
cpuColorSpinorField * out
long pinned_allocated_peak()
Definition: malloc.cpp:61
Main header file for the QUDA library.
#define STR(x)
Definition: tune.cpp:113
void loadTuneCache()
Definition: tune.cpp:322
static const int aux_n
Definition: tune_key.h:12
#define printfQuda(...)
Definition: util_quda.h:115
enum QudaTune_s QudaTune
char volume[volume_n]
Definition: tune_key.h:13
char name[name_n]
Definition: tune_key.h:14
long long n_calls
Definition: tune_quda.h:27
void checkLaunchParam(TuneParam &param)
Definition: tune_quda.h:344
long host_bytes
Definition: tune.cpp:38
void printLaunchTimer()
Definition: tune.cpp:843
TuneKey key
Definition: tune.cpp:32
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:304
enum QudaVerbosity_s QudaVerbosity
static void serializeProfile(std::ostream &out, std::ostream &async_out)
Definition: tune.cpp:199
static TimeProfile launchTimer("tuneLaunch")
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
Definition: tune.cpp:128
void Stop(const char *func, const char *file, int line)
Definition: timer.h:72
static const int volume_n
Definition: tune_key.h:10
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:90
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
virtual std::string perfString(float time) const
Definition: tune_quda.h:294
static const int name_n
Definition: tune_key.h:11
virtual void apply(const cudaStream_t &stream)=0
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:329
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:335
QudaVerbosity verbosity
Definition: test_util.cpp:1614
std::string comment
Definition: tune_quda.h:25