QUDA v0.4.0
A library for QCD on GPUs
#include <tune_quda.h>
#include <comm_quda.h>
#include <quda.h>     // for QUDA_VERSION_STRING
#include <sys/stat.h> // for stat()
#include <fcntl.h>    // for open()
#include <unistd.h>   // for write() and close()
#include <cfloat>     // for FLT_MAX
#include <ctime>
#include <fstream>
#include <sstream>    // for std::stringstream
#include <typeinfo>
#include <map>

static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
static std::string resource_path;
static std::map<TuneKey, TuneParam> tunecache;
static size_t initial_cache_size = 0;

// Two-level macro expansion, so that the version-number macros are expanded
// before being stringified (e.g., STR(QUDA_VERSION_MAJOR) yields "0" rather
// than "QUDA_VERSION_MAJOR").
#define STR_(x) #x
#define STR(x) STR_(x)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_

/**
 * Deserialize tunecache from an istream, useful for reading from a file or receiving from other nodes.
 */
static void deserializeTuneCache(std::istream &in)
{
  std::string line;
  std::stringstream ls;
  TuneKey key;
  TuneParam param;

  while (in.good()) {
    getline(in, line);
    if (!line.length()) continue; // skip blank lines (e.g., at end of file)
    ls.clear();
    ls.str(line);
    ls >> key.volume >> key.name >> key.aux >> param.block.x >> param.block.y >> param.block.z;
    ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes;
    ls.ignore(1);               // throw away tab before comment
    getline(ls, param.comment); // assume anything remaining on the line is a comment
    param.comment += "\n";      // our convention is to include the newline, since ctime() likes to do this
    tunecache[key] = param;
  }
}

/**
 * Serialize tunecache to an ostream, useful for writing to a file or sending to other nodes.
 */
static void serializeTuneCache(std::ostream &out)
{
  std::map<TuneKey, TuneParam>::iterator entry;

  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
    TuneKey key = entry->first;
    TuneParam param = entry->second;

    out << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
    out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
    out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
    out << param.shared_bytes << "\t" << param.comment; // param.comment ends with a newline
  }
}

/**
 * Distribute the tunecache from node 0 to all other nodes.
 */
static void broadcastTuneCache()
{
#ifdef MULTI_GPU

  std::stringstream serialized;
  size_t size;

  if (comm_rank() == 0) {
    serializeTuneCache(serialized);
    size = serialized.str().length();
  }
  comm_broadcast(&size, sizeof(size_t));

  if (size > 0) {
    if (comm_rank() == 0) {
      comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
    } else {
      char *serstr = new char[size+1];
      comm_broadcast(serstr, size);
      serstr[size] = '\0'; // null-terminate
      serialized.str(serstr);
      deserializeTuneCache(serialized);
      delete[] serstr;
    }
  }
#endif
}
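/*
 * For illustration: the size-then-payload protocol above assumes broadcast
 * semantics like those of MPI_Bcast. comm_broadcast() belongs to QUDA's own
 * communication layer (comm_quda.h); the sketch below is a hypothetical
 * MPI-based equivalent, shown only to document the behavior assumed here,
 * not QUDA's actual implementation.
 */
#if 0
#include <mpi.h>

// Broadcast nbytes of raw data from rank 0 to every other rank.
static void example_comm_broadcast(void *data, size_t nbytes)
{
  MPI_Bcast(data, static_cast<int>(nbytes), MPI_BYTE, 0, MPI_COMM_WORLD);
}
#endif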
/*
 * Read tunecache from disk.
 */
void loadTuneCache(QudaVerbosity verbosity)
{
  char *path;
  struct stat pstat;
  std::string cache_path, line, token;
  std::ifstream cache_file;
  std::stringstream ls;

  path = getenv("QUDA_RESOURCE_PATH");
  if (!path) {
    warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
    warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else {
    resource_path = path;
  }

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    cache_path = resource_path;
    cache_path += "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (cache_file) {

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line);
      ls.str(line);
      ls >> token;
      if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the blank line

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the description line

      deserializeTuneCache(cache_file);
      cache_file.close();
      initial_cache_size = tunecache.size();

      if (verbosity >= QUDA_SUMMARIZE) {
        printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
      }

    } else {
      warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
    }

#ifdef MULTI_GPU
  }
#endif

  broadcastTuneCache();
}
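/*
 * For reference, a tunecache.tsv file as parsed above (and written by
 * saveTuneCache() below) consists of a header line, a blank line (from the
 * newline that ctime() appends plus std::endl), a column-description line,
 * and one tab-separated record per tuned kernel. The field values in this
 * example are hypothetical; tabs are shown as runs of spaces:
 *
 *   tunecache   0.4.0   <git hash>   # Last updated Tue Jan 10 12:00:00 2012
 *
 *   volume        name        aux            block.x block.y block.z grid.x grid.y grid.z shared_bytes comment
 *   16x16x16x64   dslashCuda  type=interior  64      1       1       4096   1      1      0            # 150.0 Gflop/s, tuned Tue Jan 10 12:00:00 2012
 */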
/**
 * Write tunecache to disk.
 */
void saveTuneCache(QudaVerbosity verbosity)
{
  time_t now;
  int lock_handle;
  std::string lock_path, cache_path;
  std::ofstream cache_file;

  if (resource_path.empty()) return;

  // FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    if (tunecache.size() == initial_cache_size) return;

    // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
    // NFS on recent versions of Linux but not for Lustre by default (unless the filesystem was mounted with "-o flock").
    lock_path = resource_path + "/tunecache.lock";
    lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
    if (lock_handle == -1) {
      warningQuda("Unable to lock cache file.  Tuned launch parameters will not be cached to disk.  "
                  "If you are certain that no other instances of QUDA are accessing this filesystem, "
                  "please manually remove %s", lock_path.c_str());
      return;
    }
    char msg[] = "If no instances of applications using QUDA are running,\n"
                 "this lock file shouldn't be here and is safe to delete.";
    int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
    if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");

    cache_path = resource_path + "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (verbosity >= QUDA_SUMMARIZE) {
      printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
    }

    time(&now);
    cache_file << "tunecache\t" << quda_version << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
    cache_file << "volume\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\tcomment" << std::endl;
    serializeTuneCache(cache_file);
    cache_file.close();

    // Release lock.
    close(lock_handle);
    remove(lock_path.c_str());

#ifdef MULTI_GPU
  }
#endif
}
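/*
 * The locking above relies on open() with O_CREAT|O_EXCL, which atomically
 * fails if the lock file already exists. A minimal standalone sketch of this
 * pattern (function and path names hypothetical, not part of QUDA):
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

static bool update_shared_file(const char *lock_path)
{
  int fd = open(lock_path, O_WRONLY | O_CREAT | O_EXCL, 0666);
  if (fd == -1) return false; // another process holds the lock (or a stale lock file remains)
  // ... critical section: read/modify/write the shared file ...
  close(fd);
  remove(lock_path); // release the lock
  return true;
}
#endif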
/**
 * Return the launch parameters for a given kernel, either by retrieving them
 * from the tunecache or by autotuning on the spot.
 */
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
{
  static bool tuning = false;           // tuning in progress?
  static const Tunable *active_tunable; // for error checking
  static TuneParam param;

  TuneParam best_param;
  cudaError_t error;
  cudaEvent_t start, end;
  float elapsed_time, best_time;
  time_t now;

  const TuneKey key = tunable.tuneKey();

  if (enabled == QUDA_TUNE_NO) {
    tunable.defaultTuneParam(param);
  } else if (tunecache.count(key)) {
    param = tunecache[key];
  } else if (!tuning) {

    tuning = true;
    active_tunable = &tunable;
    best_time = FLT_MAX;
    tunable.preTune();

    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (verbosity >= QUDA_DEBUG_VERBOSE) {
      printfQuda("Tuning %s with %s at vol=%s\n", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
    }

    tunable.initTuneParam(param);
    while (tuning) {
      cudaThreadSynchronize();
      cudaGetLastError(); // clear error counter
      cudaEventRecord(start, 0);
      for (int i=0; i<tunable.tuningIter(); i++) {
        tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
      }
      cudaEventRecord(end, 0);
      cudaEventSynchronize(end);
      cudaEventElapsedTime(&elapsed_time, start, end);
      cudaThreadSynchronize();
      error = cudaGetLastError();
      elapsed_time /= (1e3 * tunable.tuningIter()); // convert ms to seconds and average over iterations
      if ((elapsed_time < best_time) && (error == cudaSuccess)) {
        best_time = elapsed_time;
        best_param = param;
      }
      if (verbosity >= QUDA_DEBUG_VERBOSE) {
        if (error == cudaSuccess) {
          printfQuda("    %s gives %s\n", tunable.paramString(param).c_str(),
                     tunable.perfString(elapsed_time).c_str());
        } else {
          printfQuda("    %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
        }
      }
      tuning = tunable.advanceTuneParam(param);
    }

    if (best_time == FLT_MAX) {
      errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
    }
    if (verbosity >= QUDA_VERBOSE) {
      printfQuda("Tuned %s giving %s", tunable.paramString(best_param).c_str(), tunable.perfString(best_time).c_str());
      printfQuda(" for %s with %s\n", key.name.c_str(), key.aux.c_str());
    }
    time(&now);
    best_param.comment = "# " + tunable.perfString(best_time) + ", tuned ";
    best_param.comment += ctime(&now); // includes a newline

    cudaEventDestroy(start);
    cudaEventDestroy(end);

    tunable.postTune();
    param = best_param;
    tunecache[key] = best_param;

  } else if (&tunable != active_tunable) {
    errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
  }

  return param;
}
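/*
 * For illustration, a Tunable subclass is expected to call tuneLaunch() from
 * its apply() method; during autotuning the recursive call simply returns the
 * currently active trial parameters, which are then used to launch the kernel.
 * The sketch below shows the assumed usage pattern; MyKernelTunable, myKernel,
 * and the hard-coded tuning/verbosity arguments are hypothetical, not QUDA code.
 */
#if 0
class MyKernelTunable : public Tunable {
  // ... tuneKey(), initTuneParam(), advanceTuneParam(), preTune(), postTune(), etc. ...

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, QUDA_TUNE_YES, QUDA_VERBOSE);
    myKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(/* kernel arguments */);
  }
};
#endif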