QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tune.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <comm_quda.h>
3 #include <quda.h> // for QUDA_VERSION_STRING
4 #include <sys/stat.h> // for stat()
5 #include <fcntl.h>
6 #include <cfloat> // for FLT_MAX
7 #include <ctime>
8 #include <fstream>
9 #include <typeinfo>
10 #include <map>
11 
12 namespace quda {
13 
14 static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
15 static std::string resource_path;
16 static std::map<TuneKey, TuneParam> tunecache;
17 static size_t initial_cache_size = 0;
18 
19 #define STR_(x) #x
20 #define STR(x) STR_(x)
21 static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
22 #undef STR
23 #undef STR_
24 
28  static void deserializeTuneCache(std::istream &in)
29  {
30  std::string line;
31  std::stringstream ls;
32  TuneKey key;
33  TuneParam param;
34 
35  while (in.good()) {
36  getline(in, line);
37  if (!line.length()) continue; // skip blank lines (e.g., at end of file)
38  ls.clear();
39  ls.str(line);
40  ls >> key.volume >> key.name >> key.aux >> param.block.x >> param.block.y >> param.block.z;
41  ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes;
42  ls.ignore(1); // throw away tab before comment
43  getline(ls, param.comment); // assume anything remaining on the line is a comment
44  param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this
45  tunecache[key] = param;
46  }
47  }
48 
49 
53  static void serializeTuneCache(std::ostream &out)
54  {
55  std::map<TuneKey, TuneParam>::iterator entry;
56 
57  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
58  TuneKey key = entry->first;
59  TuneParam param = entry->second;
60 
61  out << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
62  out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
63  out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
64  out << param.shared_bytes << "\t" << param.comment; // param.comment ends with a newline
65  }
66  }
67 
68 
72  static void broadcastTuneCache()
73  {
74 #ifdef MULTI_GPU
75 
76  std::stringstream serialized;
77  size_t size;
78 
79  if (comm_rank() == 0) {
80  serializeTuneCache(serialized);
81  size = serialized.str().length();
82  }
83  comm_broadcast(&size, sizeof(size_t));
84 
85  if (size > 0) {
86  if (comm_rank() == 0) {
87  comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
88  } else {
89  char *serstr = new char[size+1];
90  comm_broadcast(serstr, size);
91  serstr[size] ='\0'; // null-terminate
92  serialized.str(serstr);
93  deserializeTuneCache(serialized);
94  delete[] serstr;
95  }
96  }
97 #endif
98  }
99 
100 
101  /*
102  * Read tunecache from disk.
103  */
104  void loadTuneCache(QudaVerbosity verbosity)
105  {
106  char *path;
107  struct stat pstat;
108  std::string cache_path, line, token;
109  std::ifstream cache_file;
110  std::stringstream ls;
111 
112  path = getenv("QUDA_RESOURCE_PATH");
113  if (!path) {
114  warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
115  warningQuda("Caching of tuned parameters will be disabled.");
116  return;
117  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
118  warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
119  warningQuda("Caching of tuned parameters will be disabled.");
120  return;
121  } else {
122  resource_path = path;
123  }
124 
125 #ifdef MULTI_GPU
126  if (comm_rank() == 0) {
127 #endif
128 
129  cache_path = resource_path;
130  cache_path += "/tunecache.tsv";
131  cache_file.open(cache_path.c_str());
132 
133  if (cache_file) {
134 
135  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
136  getline(cache_file, line);
137  ls.str(line);
138  ls >> token;
139  if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
140  ls >> token;
141  if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
142  ls >> token;
143  if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());
144 
145  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
146  getline(cache_file, line); // eat the blank line
147 
148  if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
149  getline(cache_file, line); // eat the description line
150 
151  deserializeTuneCache(cache_file);
152  cache_file.close();
153  initial_cache_size = tunecache.size();
154 
155  if (verbosity >= QUDA_SUMMARIZE) {
156  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
157  }
158 
159  } else {
160  warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
161  }
162 
163 #ifdef MULTI_GPU
164  }
165 #endif
166 
167  broadcastTuneCache();
168  }
169 
170 
174  void saveTuneCache(QudaVerbosity verbosity)
175  {
176  time_t now;
177  int lock_handle;
178  std::string lock_path, cache_path;
179  std::ofstream cache_file;
180 
181  if (resource_path.empty()) return;
182 
183  //FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
184  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
185  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).
186 
187 #ifdef MULTI_GPU
188  if (comm_rank() == 0) {
189 #endif
190 
191  if (tunecache.size() == initial_cache_size) return;
192 
193  // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
194  // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
195  lock_path = resource_path + "/tunecache.lock";
196  lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
197  if (lock_handle == -1) {
198  warningQuda("Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
199  "If you are certain that no other instances of QUDA are accessing this filesystem, "
200  "please manually remove %s", lock_path.c_str());
201  return;
202  }
203  char msg[] = "If no instances of applications using QUDA are running,\n"
204  "this lock file shouldn't be here and is safe to delete.";
205  int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
206  if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
207 
208  cache_path = resource_path + "/tunecache.tsv";
209  cache_file.open(cache_path.c_str());
210 
211  if (verbosity >= QUDA_SUMMARIZE) {
212  printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
213  }
214 
215  time(&now);
216  cache_file << "tunecache\t" << quda_version << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
217  cache_file << "volume\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\tcomment" << std::endl;
218  serializeTuneCache(cache_file);
219  cache_file.close();
220 
221  // Release lock.
222  close(lock_handle);
223  remove(lock_path.c_str());
224 
225  initial_cache_size = tunecache.size();
226 
227 #ifdef MULTI_GPU
228  }
229 #endif
230  }
231 
236  TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
237  {
238  // We must switch off the global sum when tuning in case of process divergence
239  bool reduceState = globalReduce;
240  globalReduce = false;
241 
242  static bool tuning = false; // tuning in progress?
243  static const Tunable *active_tunable; // for error checking
244  static TuneParam param;
245 
246  TuneParam best_param;
247  cudaError_t error;
248  cudaEvent_t start, end;
249  float elapsed_time, best_time;
250  time_t now;
251 
252  const TuneKey key = tunable.tuneKey();
253 
254  if (enabled == QUDA_TUNE_NO) {
255  tunable.defaultTuneParam(param);
256  tunable.checkLaunchParam(param);
257  } else if (tunecache.count(key)) {
258  param = tunecache[key];
259  tunable.checkLaunchParam(param);
260  } else if (!tuning) {
261 
262  tuning = true;
263  active_tunable = &tunable;
264  best_time = FLT_MAX;
265 
266  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PreTune %s\n", key.name.c_str());
267  tunable.preTune();
268 
269  cudaEventCreate(&start);
270  cudaEventCreate(&end);
271 
272  if (verbosity >= QUDA_DEBUG_VERBOSE) {
273  printfQuda("Tuning %s with %s at vol=%s\n", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
274  }
275 
276  tunable.initTuneParam(param);
277  while (tuning) {
278  cudaDeviceSynchronize();
279  cudaGetLastError(); // clear error counter
280  tunable.checkLaunchParam(param);
281  cudaEventRecord(start, 0);
282  for (int i=0; i<tunable.tuningIter(); i++) {
283  tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
284  }
285  cudaEventRecord(end, 0);
286  cudaEventSynchronize(end);
287  cudaEventElapsedTime(&elapsed_time, start, end);
288  cudaDeviceSynchronize();
289  error = cudaGetLastError();
290  elapsed_time /= (1e3 * tunable.tuningIter());
291  if ((elapsed_time < best_time) && (error == cudaSuccess)) {
292  best_time = elapsed_time;
293  best_param = param;
294  }
295  if ((verbosity >= QUDA_DEBUG_VERBOSE)) {
296  if (error == cudaSuccess)
297  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(),
298  tunable.perfString(elapsed_time).c_str());
299  else
300  printfQuda(" %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
301  }
302  tuning = tunable.advanceTuneParam(param);
303  }
304 
305  if (best_time == FLT_MAX) {
306  errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
307  }
308  if (verbosity >= QUDA_VERBOSE) {
309  printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
310  tunable.perfString(best_time).c_str(), key.name.c_str(), key.aux.c_str());
311  }
312  time(&now);
313  best_param.comment = "# " + tunable.perfString(best_time) + ", tuned ";
314  best_param.comment += ctime(&now); // includes a newline
315 
316  cudaEventDestroy(start);
317  cudaEventDestroy(end);
318 
319  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("PostTune %s\n", key.name.c_str());
320  tunable.postTune();
321  param = best_param;
322  tunecache[key] = best_param;
323 
324  } else if (&tunable != active_tunable) {
325  errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
326  }
327 
328  // restore the original reduction state
329  globalReduce = reduceState;
330 
331  return param;
332  }
333 
334 } // namespace quda