QUDA  0.9.0
tune.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <comm_quda.h>
3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 #include <list>
12 #include <unistd.h>
13 
14 #include <deque>
15 #include <queue>
16 #include <functional>
17 #ifdef PTHREADS
18 #include <pthread.h>
19 #endif
20 
21 //#define LAUNCH_TIMER
22 extern char* gitversion;
23 
24 namespace quda { static TuneKey last_key; }
25 
26 // intentionally leave this outside of the namespace for now
28 
29 namespace quda {
30  typedef std::map<TuneKey, TuneParam> map;
31 
32  struct TraceKey {
33 
35  float time;
36 
40  long host_bytes;
41 
42  TraceKey() { }
43 
44  TraceKey(const TuneKey &key, float time)
45  : key(key), time(time),
50 
51  TraceKey(const TraceKey &trace)
52  : key(trace.key), time(trace.time),
56  host_bytes(trace.host_bytes) { }
57 
58  TraceKey& operator=(const TraceKey &trace) {
59  if (&trace != this) {
60  key = trace.key;
61  time = trace.time;
62  device_bytes = trace.device_bytes;
63  pinned_bytes = trace.pinned_bytes;
64  mapped_bytes = trace.mapped_bytes;
65  host_bytes = trace.host_bytes;
66  }
67  return *this;
68  }
69  };
70 
71  // linked list that is augmented each time we call a kernel
72  static std::list<TraceKey> trace_list;
73  static bool enable_trace = false;
74 
75  bool traceEnabled() {
76  static bool init = false;
77 
78  if (!init) {
79  char *enable_trace_env = getenv("QUDA_ENABLE_TRACE");
80  if (enable_trace_env && strcmp(enable_trace_env, "1") == 0) {
81  enable_trace = true;
82  }
83  init = true;
84  }
85  return enable_trace;
86  }
87 
88  static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
89  static std::string resource_path;
90  static map tunecache;
91  static map::iterator it;
92  static size_t initial_cache_size = 0;
93 
94 #define STR_(x) #x
95 #define STR(x) STR_(x)
97 #undef STR
98 #undef STR_
99 
101  static bool tuning = false;
102 
103  bool activeTuning() { return tuning; }
104 
105  static bool profile_count = true;
106 
107  void disableProfileCount() { profile_count = false; }
109 
110  const map& getTuneCache() { return tunecache; }
111 
112 
116  static void deserializeTuneCache(std::istream &in)
117  {
118  std::string line;
119  std::stringstream ls;
120 
121  TuneKey key;
123 
124  std::string v;
125  std::string n;
126  std::string a;
127 
128  int check;
129 
130  while (in.good()) {
131  getline(in, line);
132  if (!line.length()) continue; // skip blank lines (e.g., at end of file)
133  ls.clear();
134  ls.str(line);
135  ls >> v >> n >> a >> param.block.x >> param.block.y >> param.block.z;
136  check = snprintf(key.volume, key.volume_n, "%s", v.c_str());
137  if (check < 0 || check >= key.volume_n) errorQuda("Error writing volume string (check = %d)", check);
138  check = snprintf(key.name, key.name_n, "%s", n.c_str());
139  if (check < 0 || check >= key.name_n) errorQuda("Error writing name string (check=%d)", check);
140  check = snprintf(key.aux, key.aux_n, "%s", a.c_str());
141  if (check < 0 || check >= key.aux_n) errorQuda("Error writing aux string (check=%d)", check);
142  ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes >> param.aux.x >> param.aux.y >> param.aux.z >> param.aux.w >> param.time;
143  ls.ignore(1); // throw away tab before comment
144  getline(ls, param.comment); // assume anything remaining on the line is a comment
145  param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this
146  tunecache[key] = param;
147  }
148  }
149 
150 
154  static void serializeTuneCache(std::ostream &out)
155  {
156  map::iterator entry;
157 
158  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
159  TuneKey key = entry->first;
160  TuneParam param = entry->second;
161 
162  out << std::setw(16) << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
163  out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
164  out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
165  out << param.shared_bytes << "\t" << param.aux.x << "\t" << param.aux.y << "\t" << param.aux.z << "\t" << param.aux.w << "\t";
166  out << param.time << "\t" << param.comment; // param.comment ends with a newline
167  }
168  }
169 
170 
171  template <class T>
172  struct less_significant : std::binary_function<T,T,bool> {
173  inline bool operator()(const T &lhs, const T &rhs) {
174  return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
175  }
176  };
177 
181  static void serializeProfile(std::ostream &out, std::ostream &async_out)
182  {
183  map::iterator entry;
184  double total_time = 0.0;
185  double async_total_time = 0.0;
186 
187  // first let's sort the entries in decreasing order of significance
188  typedef std::pair<TuneKey, TuneParam> profile_t;
189  typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t> > queue_t;
190  queue_t q(tunecache.begin(), tunecache.end());
191 
192  // now compute total time spent in kernels so we can give each kernel a significance
193  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
194  TuneKey key = entry->first;
195  TuneParam param = entry->second;
196 
197  char tmp[7] = { };
198  strncpy(tmp, key.aux, 6);
199  bool is_policy = strcmp(tmp, "policy") == 0 ? true : false;
200  if (param.n_calls > 0 && !is_policy) total_time += param.n_calls * param.time;
201  if (param.n_calls > 0 && is_policy) async_total_time += param.n_calls * param.time;
202  }
203 
204 
205  while ( !q.empty() ) {
206  TuneKey key = q.top().first;
207  TuneParam param = q.top().second;
208 
209  char tmp[7] = { };
210  strncpy(tmp, key.aux, 6);
211  bool is_policy = strcmp(tmp, "policy") == 0 ? true : false;
212 
213  // synchronous profile
214  if (param.n_calls > 0 && !is_policy) {
215  double time = param.n_calls * param.time;
216 
217  out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12) << (time / total_time) * 100 << "\t";
218  out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16) << key.volume << "\t";
219  out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
220  }
221 
222  // async policy profile
223  if (param.n_calls > 0 && is_policy) {
224  double time = param.n_calls * param.time;
225 
226  async_out << std::setw(12) << param.n_calls * param.time << "\t" << std::setw(12) << (time / async_total_time) * 100 << "\t";
227  async_out << std::setw(12) << param.n_calls << "\t" << std::setw(12) << param.time << "\t" << std::setw(16) << key.volume << "\t";
228  async_out << key.name << "\t" << key.aux << "\t" << param.comment; // param.comment ends with a newline
229  }
230 
231  q.pop();
232  }
233 
234  out << std::endl << "# Total time spent in kernels = " << total_time << " seconds" << std::endl;
235  async_out << std::endl << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
236  }
237 
241  static void serializeTrace(std::ostream &out)
242  {
243  for (auto it = trace_list.begin(); it != trace_list.end(); it++) {
244 
245  TuneKey &key = it->key;
246 
247  // special case kernel members of a policy
248  char tmp[14] = { };
249  strncpy(tmp, key.aux, 13);
250  bool is_policy_kernel = strcmp(tmp, "policy_kernel") == 0 ? true : false;
251 
252  out << std::setw(12) << it->time << "\t";
253  out << std::setw(12) << it->device_bytes << "\t";
254  out << std::setw(12) << it->pinned_bytes << "\t";
255  out << std::setw(12) << it->mapped_bytes << "\t";
256  out << std::setw(12) << it->host_bytes << "\t";
257  out << std::setw(16) << key.volume << "\t";
258  if (is_policy_kernel) out << "\t";
259  out << key.name << "\t";
260  if (!is_policy_kernel) out << "\t";
261  out << key.aux << std::endl;
262 
263  }
264  }
265 
266 
270  static void broadcastTuneCache()
271  {
272 #ifdef MULTI_GPU
273 
274  std::stringstream serialized;
275  size_t size;
276 
277  if (comm_rank() == 0) {
278  serializeTuneCache(serialized);
279  size = serialized.str().length();
280  }
281  comm_broadcast(&size, sizeof(size_t));
282 
283  if (size > 0) {
284  if (comm_rank() == 0) {
285  comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
286  } else {
287  char *serstr = new char[size+1];
288  comm_broadcast(serstr, size);
289  serstr[size] ='\0'; // null-terminate
290  serialized.str(serstr);
291  deserializeTuneCache(serialized);
292  delete[] serstr;
293  }
294  }
295 #endif
296  }
297 
298 
299  /*
300  * Read tunecache from disk.
301  */
303  {
304  if (getTuning() == QUDA_TUNE_NO) {
305  warningQuda("Autotuning disabled");
306  return;
307  }
308 
309  char *path;
310  struct stat pstat;
311  std::string cache_path, line, token;
312  std::ifstream cache_file;
313  std::stringstream ls;
314 
315  path = getenv("QUDA_RESOURCE_PATH");
316 
317  if (!path) {
318  warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
319  warningQuda("Caching of tuned parameters will be disabled.");
320  return;
321  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
322  warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
323  warningQuda("Caching of tuned parameters will be disabled.");
324  return;
325  } else {
326  resource_path = path;
327  }
328 
329 #ifdef MULTI_GPU
330  if (comm_rank() == 0) {
331 #endif
332 
333  cache_path = resource_path;
334  cache_path += "/tunecache.tsv";
335  cache_file.open(cache_path.c_str());
336 
337  if (cache_file) {
338 
339  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
340  getline(cache_file, line);
341  ls.str(line);
342  ls >> token;
343  if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
344  ls >> token;
345  if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
346  ls >> token;
347 #ifdef GITVERSION
348  if (token.compare(gitversion)) errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
349 #else
350  if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
351 #endif
352  ls >> token;
353  if (token.compare(quda_hash)) errorQuda("Cache file %s does not match current QUDA build. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
354 
355 
356  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
357  getline(cache_file, line); // eat the blank line
358 
359  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
360  getline(cache_file, line); // eat the description line
361 
362  deserializeTuneCache(cache_file);
363 
364  cache_file.close();
365  initial_cache_size = tunecache.size();
366 
367  if (getVerbosity() >= QUDA_SUMMARIZE) {
368  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
369  }
370 
371 
372  } else {
373  warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
374  }
375 
376 #ifdef MULTI_GPU
377  }
378 #endif
379 
380 
382  }
383 
384 
389  {
390  time_t now;
391  int lock_handle;
392  std::string lock_path, cache_path;
393  std::ofstream cache_file;
394 
395  if (resource_path.empty()) return;
396 
397  //FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
398  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
399  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
400 
401 #ifdef MULTI_GPU
402  if (comm_rank() == 0) {
403 #endif
404 
405  if (tunecache.size() == initial_cache_size) return;
406 
407  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
408  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
409  lock_path = resource_path + "/tunecache.lock";
410  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
411  if (lock_handle == -1) {
412  warningQuda("Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
413  "If you are certain that no other instances of QUDA are accessing this filesystem, "
414  "please manually remove %s", lock_path.c_str());
415  return;
416  }
417  char msg[] = "If no instances of applications using QUDA are running,\n"
418  "this lock file shouldn't be here and is safe to delete.";
419  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
420  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
421 
422  cache_path = resource_path + "/tunecache.tsv";
423  cache_file.open(cache_path.c_str());
424 
425  if (getVerbosity() >= QUDA_SUMMARIZE) {
426  printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
427  }
428 
429  time(&now);
430  cache_file << "tunecache\t" << quda_version;
431 #ifdef GITVERSION
432  cache_file << "\t" << gitversion;
433 #else
434  cache_file << "\t" << quda_version;
435 #endif
436  cache_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
437  cache_file << std::setw(16) << "volume" << "\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux.z\taux.w\ttime\tcomment" << std::endl;
438  serializeTuneCache(cache_file);
439  cache_file.close();
440 
441  // Release lock.
442  close(lock_handle);
443  remove(lock_path.c_str());
444 
445  initial_cache_size = tunecache.size();
446 
447 #ifdef MULTI_GPU
448  }
449 #endif
450  }
451 
452  static bool policy_tuning = false;
453  bool policyTuning() {
454  return policy_tuning;
455  }
456 
457  void setPolicyTuning(bool policy_tuning_) {
458  policy_tuning = policy_tuning_;
459  }
460 
461  // flush profile, setting counts to zero
463  {
464  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
465  // set all n_calls = 0
466  TuneParam &param = entry->second;
467  param.n_calls = 0;
468  }
469  }
470 
471  // save profile
472  void saveProfile(const std::string label)
473  {
474  time_t now;
475  int lock_handle;
476  std::string lock_path, profile_path, async_profile_path, trace_path;
477  std::ofstream profile_file, async_profile_file, trace_file;
478 
479  if (resource_path.empty()) return;
480 
481 #ifdef MULTI_GPU
482  if (comm_rank() == 0) {
483 #endif
484 
485  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
486  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
487  lock_path = resource_path + "/profile.lock";
488  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
489  if (lock_handle == -1) {
490  warningQuda("Unable to lock profile file. Profile will not be saved to disk. "
491  "If you are certain that no other instances of QUDA are accessing this filesystem, "
492  "please manually remove %s", lock_path.c_str());
493  return;
494  }
495  char msg[] = "If no instances of applications using QUDA are running,\n"
496  "this lock file shouldn't be here and is safe to delete.";
497  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
498  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
499 
500  // profile counter for writing out unique profiles
501  static int count = 0;
502 
503  char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");
504 
505  if (!profile_fname) {
506  warningQuda("Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
507  profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
508  async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
509  if (traceEnabled()) trace_path = resource_path + "/trace_" + std::to_string(count) + ".tsv";
510  } else {
511  profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
512  async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
513  if (traceEnabled()) trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
514  }
515 
516  count++;
517 
518  profile_file.open(profile_path.c_str());
519  async_profile_file.open(async_profile_path.c_str());
520  if (traceEnabled()) trace_file.open(trace_path.c_str());
521 
522  if (getVerbosity() >= QUDA_SUMMARIZE) {
523  // compute number of non-zero entries that will be output in the profile
524  int n_entry = 0;
525  int n_policy = 0;
526  for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
527  // if a policy entry, then we can ignore
528  char tmp[7] = { };
529  strncpy(tmp, entry->first.aux, 6);
530  TuneParam param = entry->second;
531  bool is_policy = strcmp(tmp, "policy") == 0 ? true : false;
532  if (param.n_calls > 0 && !is_policy) n_entry++;
533  if (param.n_calls > 0 && is_policy) n_policy++;
534  }
535 
536  printfQuda("Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
537  printfQuda("Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
538  if (traceEnabled()) printfQuda("Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
539  }
540 
541  time(&now);
542 
543  std::string Label = label.empty() ? "profile" : label;
544 
545  profile_file << Label << "\t" << quda_version;
546 #ifdef GITVERSION
547  profile_file << "\t" << gitversion;
548 #else
549  profile_file << "\t" << quda_version;
550 #endif
551  profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
552  profile_file << std::setw(12) << "total time" << "\t" << std::setw(12) << "percentage" << "\t" << std::setw(12) << "calls" << "\t" << std::setw(12) << "time / call" << "\t" << std::setw(16) << "volume" << "\tname\taux\tcomment" << std::endl;
553 
554  async_profile_file << Label << "\t" << quda_version;
555 #ifdef GITVERSION
556  async_profile_file << "\t" << gitversion;
557 #else
558  async_profile_file << "\t" << quda_version;
559 #endif
560  async_profile_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
561  async_profile_file << std::setw(12) << "total time" << "\t" << std::setw(12) << "percentage" << "\t" << std::setw(12) << "calls" << "\t" << std::setw(12) << "time / call" << "\t" << std::setw(16) << "volume" << "\tname\taux\tcomment" << std::endl;
562 
563  serializeProfile(profile_file, async_profile_file);
564 
565  profile_file.close();
566  async_profile_file.close();
567 
568  if (traceEnabled()) {
569  trace_file << "trace" << "\t" << quda_version;
570 #ifdef GITVERSION
571  trace_file << "\t" << gitversion;
572 #else
573  trace_file << "\t" << quda_version;
574 #endif
575  trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
576 
577  trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
578  trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
579  trace_file << std::setw(16) << "volume" << "\tname\taux" << std::endl;
580 
581  serializeTrace(trace_file);
582 
583  trace_file.close();
584  }
585 
586  // Release lock.
587  close(lock_handle);
588  remove(lock_path.c_str());
589 
590 #ifdef MULTI_GPU
591  }
592 #endif
593  }
594 
595  static TimeProfile launchTimer("tuneLaunch");
596 
597 // static int tally = 0;
598 
604  {
605 #ifdef PTHREADS // tuning should be performed serially
606 // pthread_mutex_lock(&pthread_mutex);
607 // tally++;
608 #endif
609 
610 #ifdef LAUNCH_TIMER
613 #endif
614 
615  const TuneKey key = tunable.tuneKey();
616  last_key = key;
617  static TuneParam param;
618 
619 #ifdef LAUNCH_TIMER
622 #endif
623 
624  static const Tunable *active_tunable; // for error checking
625 
626  // first check if we have the tuned value and return if we have it
627  //if (enabled == QUDA_TUNE_YES && tunecache.count(key)) {
628 
629  it = tunecache.find(key);
630  if (enabled == QUDA_TUNE_YES && it != tunecache.end()) {
631 
632 #ifdef LAUNCH_TIMER
635 #endif
636 
637  TuneParam &param = it->second;
638 
639 #ifdef LAUNCH_TIMER
642 #endif
643 
644  tunable.checkLaunchParam(param);
645 
646 #ifdef PTHREADS
647  //pthread_mutex_unlock(&pthread_mutex);
648  //tally--;
649  //printfQuda("pthread_mutex_unlock a complete %d\n",tally);
650 #endif
651  // we could be tuning outside of the current scope
652  if (!tuning && profile_count) param.n_calls++;
653 
654 #ifdef LAUNCH_TIMER
657 #endif
658 
659  if (traceEnabled()) {
660  TraceKey trace_entry(key, param.time);
661  trace_list.push_back(trace_entry);
662  }
663 
664  return param;
665  }
666 
667 #ifdef LAUNCH_TIMER
670 #endif
671 
672 
673  if (enabled == QUDA_TUNE_NO) {
674  tunable.defaultTuneParam(param);
675  tunable.checkLaunchParam(param);
676  } else if (!tuning) {
677 
678  /* As long as global reductions are not disabled, only do the
679  tuning on node 0, else do the tuning on all nodes since we
680  can't guarantee that all nodes are partaking */
681  if (comm_rank() == 0 || !commGlobalReduction() || policyTuning()) {
682  TuneParam best_param;
683  cudaError_t error = cudaSuccess;
684  cudaEvent_t start, end;
685  float elapsed_time, best_time;
686  time_t now;
687 
688  tuning = true;
689  active_tunable = &tunable;
690  best_time = FLT_MAX;
691 
692  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PreTune %s\n", key.name);
693  tunable.preTune();
694 
695  cudaEventCreate(&start);
696  cudaEventCreate(&end);
697 
698  if (verbosity >= QUDA_DEBUG_VERBOSE) {
699  printfQuda("Tuning %s with %s at vol=%s\n", key.name, key.aux, key.volume);
700  }
701 
702  tunable.initTuneParam(param);
703  while (tuning) {
704  cudaDeviceSynchronize();
705  cudaGetLastError(); // clear error counter
706  tunable.checkLaunchParam(param);
707  if (policyTuning()) tunable.apply(0); // do a pre call if doing policy tuning
708 
709  if (verbosity >= QUDA_DEBUG_VERBOSE) {
710  printfQuda("About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
711  param.block.x, param.block.y, param.block.z,
712  param.grid.x, param.grid.y, param.grid.z,
713  param.shared_bytes,
714  param.aux.x, param.aux.y, param.aux.z);
715  }
716 
717  cudaEventRecord(start, 0);
718  for (int i=0; i<tunable.tuningIter(); i++) {
719  tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
720  }
721  cudaEventRecord(end, 0);
722  cudaEventSynchronize(end);
723  cudaEventElapsedTime(&elapsed_time, start, end);
724  cudaDeviceSynchronize();
725  error = cudaGetLastError();
726 
727  { // check that error state is cleared
728  cudaDeviceSynchronize();
729  cudaError_t error = cudaGetLastError();
730  if (error != cudaSuccess) errorQuda("Failed to clear error state %s\n", cudaGetErrorString(error));
731  }
732 
733  elapsed_time /= (1e3 * tunable.tuningIter());
734  if ((elapsed_time < best_time) && (error == cudaSuccess)) {
735  best_time = elapsed_time;
736  best_param = param;
737  }
738  if ((verbosity >= QUDA_DEBUG_VERBOSE)) {
739  if (error == cudaSuccess)
740  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(),
741  tunable.perfString(elapsed_time).c_str());
742  else
743  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
744  }
745  tuning = tunable.advanceTuneParam(param);
746  }
747 
748  if (best_time == FLT_MAX) {
749  errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume);
750  }
751  if (verbosity >= QUDA_VERBOSE) {
752  printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
753  tunable.perfString(best_time).c_str(), key.name, key.aux);
754  }
755  time(&now);
756  best_param.comment = "# " + tunable.perfString(best_time) + ", tuned ";
757  best_param.comment += ctime(&now); // includes a newline
758  best_param.time = best_time;
759 
760  cudaEventDestroy(start);
761  cudaEventDestroy(end);
762 
763  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PostTune %s\n", key.name);
764  tunable.postTune();
765  param = best_param;
766  tunecache[key] = best_param;
767 
768  }
770 
771  // check this process is getting the key that is expected
772  if (tunecache.find(key) == tunecache.end()) {
773  errorQuda("Failed to find key entry (%s:%s:%s)", key.name, key.volume, key.aux);
774  }
775  param = tunecache[key]; // read this now for all processes
776 
777  if (traceEnabled()) {
778  TraceKey trace_entry(key, param.time);
779  trace_list.push_back(trace_entry);
780  }
781 
782  } else if (&tunable != active_tunable) {
783  errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
784  }
785 
786 #ifdef PTHREADS
787 // pthread_mutex_unlock(&pthread_mutex);
788 // tally--;
789 // printfQuda("pthread_mutex_unlock b complete %d\n",tally);
790 #endif
791 
792  param.n_calls = profile_count ? 1 : 0;
793 
794  return param;
795  }
796 
798 #ifdef LAUNCH_TIMER
799  launchTimer.Print();
800 #endif
801  }
802 } // namespace quda
int comm_rank(void)
Definition: comm_mpi.cpp:120
long device_allocated_peak()
Definition: malloc.cpp:57
bool traceEnabled()
Definition: tune.cpp:75
QudaVerbosity verbosity
float time
Definition: tune.cpp:35
int snprintf(char *__str, size_t __size, const char *__format,...) __attribute__((__format__(__printf__
long mapped_bytes
Definition: tune.cpp:39
std::map< TuneKey, TuneParam > map
virtual int tuningIter() const
Definition: tune_quda.h:206
void disableProfileCount()
Definition: tune.cpp:107
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void init()
Definition: blas_quda.cu:64
cudaEvent_t start
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
long pinned_bytes
Definition: tune.cpp:38
char * ctime(const time_t *)
static map tunecache
Definition: tune.cpp:90
static void broadcastTuneCache()
Definition: tune.cpp:270
virtual std::string paramString(const TuneParam &param) const
Definition: tune_quda.h:208
quda::TuneKey getLastTuneKey()
Definition: tune.cpp:27
static std::list< TraceKey > trace_list
Definition: tune.cpp:72
virtual TuneKey tuneKey() const =0
char * gitversion
Definition: version.cpp:4
static size_t initial_cache_size
Definition: tune.cpp:92
static bool enable_trace
Definition: tune.cpp:73
void enableProfileCount()
Definition: tune.cpp:108
QudaGaugeParam param
Definition: pack_test.cpp:17
static const std::string quda_hash
Definition: tune.cpp:88
TraceKey(const TuneKey &key, float time)
Definition: tune.cpp:44
time_t time(time_t *)
static bool profile_count
Definition: tune.cpp:105
int strcmp(const char *__s1, const char *__s2)
long device_bytes
Definition: tune.cpp:37
static __inline__ T * entry
static void deserializeTuneCache(std::istream &in)
Definition: tune.cpp:116
bool policyTuning()
Definition: tune.cpp:453
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:462
static bool policy_tuning
Definition: tune.cpp:452
cpuColorSpinorField * in
static const std::string quda_version
Definition: tune.cpp:96
static map::iterator it
Definition: tune.cpp:91
bool operator()(const T &lhs, const T &rhs)
Definition: tune.cpp:173
void Print()
Definition: timer.cpp:6
static bool tuning
Definition: tune.cpp:101
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
char aux[aux_n]
Definition: tune_key.h:15
#define warningQuda(...)
Definition: util_quda.h:101
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
bool commGlobalReduction()
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:472
long host_allocated_peak()
Definition: malloc.cpp:63
static void serializeTuneCache(std::ostream &out)
Definition: tune.cpp:154
long mapped_allocated_peak()
Definition: malloc.cpp:61
static std::string resource_path
Definition: tune.cpp:89
void setPolicyTuning(bool)
Definition: tune.cpp:457
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:103
TraceKey(const TraceKey &trace)
Definition: tune.cpp:51
static TuneKey last_key
Definition: tune.cpp:24
TraceKey & operator=(const TraceKey &trace)
Definition: tune.cpp:58
std::map< TuneKey, TuneParam > map
char * strncpy(char *__dst, const char *__src, size_t __n)
static void serializeTrace(std::ostream &out)
Definition: tune.cpp:241
virtual void preTune()
Definition: tune_quda.h:204
virtual void postTune()
Definition: tune_quda.h:205
void comm_broadcast(void *data, size_t nbytes)
Definition: comm_mpi.cpp:322
cpuColorSpinorField * out
long pinned_allocated_peak()
Definition: malloc.cpp:59
Main header file for the QUDA library.
#define STR(x)
Definition: tune.cpp:95
void loadTuneCache()
Definition: tune.cpp:302
static const int aux_n
Definition: tune_key.h:12
#define printfQuda(...)
Definition: util_quda.h:84
enum QudaTune_s QudaTune
__darwin_time_t time_t
char volume[volume_n]
Definition: tune_key.h:13
char name[name_n]
Definition: tune_key.h:14
void checkLaunchParam(TuneParam &param)
Definition: tune_quda.h:269
long host_bytes
Definition: tune.cpp:40
void printLaunchTimer()
Definition: tune.cpp:797
TuneKey key
Definition: tune.cpp:34
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
enum QudaVerbosity_s QudaVerbosity
static void serializeProfile(std::ostream &out, std::ostream &async_out)
Definition: tune.cpp:181
static TimeProfile launchTimer("tuneLaunch")
const map & getTuneCache()
Definition: tune.cpp:110
static const int volume_n
Definition: tune_key.h:10
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
void saveTuneCache()
Definition: tune.cpp:388
char * getenv(const char *)
#define a
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
virtual std::string perfString(float time) const
Definition: tune_quda.h:220
static const int name_n
Definition: tune_key.h:11
cudaEvent_t cudaEvent_t end
virtual void apply(const cudaStream_t &stream)=0
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:260
ssize_t getline(char **__linep, size_t *__linecapp, FILE *__stream) __attribute__((availability(macosx
std::string comment
Definition: tune_quda.h:25