QUDA v0.4.0
A library for QCD on GPUs
#include <tune_quda.h>
#include <comm_quda.h>
#include <quda.h>     // for QUDA_VERSION_STRING
#include <sys/stat.h> // for stat()
#include <fcntl.h>    // for open()
#include <unistd.h>   // for write() and close()
#include <cfloat>     // for FLT_MAX
#include <ctime>
#include <fstream>
#include <sstream>    // for std::stringstream
#include <typeinfo>
#include <map>

static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
static std::string resource_path;
static std::map<TuneKey, TuneParam> tunecache;
static size_t initial_cache_size = 0;

// Two-level macro expansion, so that the version-number macros are expanded
// before being stringified (e.g., STR(QUDA_VERSION_MAJOR) yields "0" rather
// than "QUDA_VERSION_MAJOR").
#define STR_(x) #x
#define STR(x) STR_(x)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_

/**
 * Deserialize tunecache from an istream, useful for reading from a file or receiving from other nodes.
 */
static void deserializeTuneCache(std::istream &in)
{
  std::string line;
  std::stringstream ls;
  TuneKey key;
  TuneParam param;

  while (in.good()) {
    getline(in, line);
    if (!line.length()) continue; // skip blank lines (e.g., at end of file)
    ls.clear();
    ls.str(line);
    ls >> key.volume >> key.name >> key.aux >> param.block.x >> param.block.y >> param.block.z;
    ls >> param.grid.x >> param.grid.y >> param.grid.z >> param.shared_bytes;
    ls.ignore(1);               // throw away tab before comment
    getline(ls, param.comment); // assume anything remaining on the line is a comment
    param.comment += "\n";      // our convention is to include the newline, since ctime() likes to do this
    tunecache[key] = param;
  }
}

/**
 * Serialize tunecache to an ostream, useful for writing to a file or sending to other nodes.
 */
static void serializeTuneCache(std::ostream &out)
{
  std::map<TuneKey, TuneParam>::iterator entry;

  for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
    TuneKey key = entry->first;
    TuneParam param = entry->second;

    out << key.volume << "\t" << key.name << "\t" << key.aux << "\t";
    out << param.block.x << "\t" << param.block.y << "\t" << param.block.z << "\t";
    out << param.grid.x << "\t" << param.grid.y << "\t" << param.grid.z << "\t";
    out << param.shared_bytes << "\t" << param.comment; // param.comment ends with a newline
  }
}

/**
 * Distribute the tunecache from node 0 to all other nodes.
 */
static void broadcastTuneCache()
{
#ifdef MULTI_GPU

  std::stringstream serialized;
  size_t size;

  if (comm_rank() == 0) {
    serializeTuneCache(serialized);
    size = serialized.str().length();
  }
  comm_broadcast(&size, sizeof(size_t));

  if (size > 0) {
    if (comm_rank() == 0) {
      comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
    } else {
      char *serstr = new char[size+1];
      comm_broadcast(serstr, size);
      serstr[size] = '\0'; // null-terminate
      serialized.str(serstr);
      deserializeTuneCache(serialized);
      delete[] serstr;
    }
  }
#endif
}
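/*
 * For illustration: the size-then-payload protocol above assumes broadcast
 * semantics like those of MPI_Bcast. comm_broadcast() belongs to QUDA's own
 * communication layer (comm_quda.h); the sketch below is a hypothetical
 * MPI-based equivalent, shown only to document the behavior assumed here,
 * not QUDA's actual implementation.
 */
#if 0
#include <mpi.h>

// Broadcast nbytes of raw data from rank 0 to every other rank.
static void example_comm_broadcast(void *data, size_t nbytes)
{
  MPI_Bcast(data, static_cast<int>(nbytes), MPI_BYTE, 0, MPI_COMM_WORLD);
}
#endif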
/*
 * Read tunecache from disk.
 */
void loadTuneCache(QudaVerbosity verbosity)
{
  char *path;
  struct stat pstat;
  std::string cache_path, line, token;
  std::ifstream cache_file;
  std::stringstream ls;

  path = getenv("QUDA_RESOURCE_PATH");
  if (!path) {
    warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
    warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else {
    resource_path = path;
  }

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    cache_path = resource_path;
    cache_path += "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (cache_file) {

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line);
      ls.str(line);
      ls >> token;
      if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the blank line

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the description line

      deserializeTuneCache(cache_file);
      cache_file.close();
      initial_cache_size = tunecache.size();

      if (verbosity >= QUDA_SUMMARIZE) {
        printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
      }

    } else {
      warningQuda("Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
    }

#ifdef MULTI_GPU
  }
#endif

  broadcastTuneCache();
}
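/*
 * For reference, a tunecache.tsv file as parsed above (and written by
 * saveTuneCache() below) consists of a header line, a blank line (from the
 * newline that ctime() appends plus std::endl), a column-description line,
 * and one tab-separated record per tuned kernel. The field values in this
 * example are hypothetical; tabs are shown as runs of spaces:
 *
 *   tunecache   0.4.0   <git hash>   # Last updated Tue Jan 10 12:00:00 2012
 *
 *   volume        name        aux            block.x block.y block.z grid.x grid.y grid.z shared_bytes comment
 *   16x16x16x64   dslashCuda  type=interior  64      1       1       4096   1      1      0            # 150.0 Gflop/s, tuned Tue Jan 10 12:00:00 2012
 */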
/**
 * Write tunecache to disk.
 */
void saveTuneCache(QudaVerbosity verbosity)
{
  time_t now;
  int lock_handle;
  std::string lock_path, cache_path;
  std::ofstream cache_file;

  if (resource_path.empty()) return;

  // FIXME: We should really check to see if any nodes have tuned a kernel that was not also tuned on node 0, since as things
  // stand, the corresponding launch parameters would never get cached to disk in this situation. This will come up if we
  // ever support different subvolumes per GPU (as might be convenient for lattice volumes that don't divide evenly).

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    if (tunecache.size() == initial_cache_size) return;

    // Acquire lock. Note that this is only robust if the filesystem supports flock() semantics, which is true for
    // NFS on recent versions of Linux but not for Lustre by default (unless the filesystem was mounted with "-o flock").
    lock_path = resource_path + "/tunecache.lock";
    lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
    if (lock_handle == -1) {
      warningQuda("Unable to lock cache file.  Tuned launch parameters will not be cached to disk.  "
                  "If you are certain that no other instances of QUDA are accessing this filesystem, "
                  "please manually remove %s", lock_path.c_str());
      return;
    }
    char msg[] = "If no instances of applications using QUDA are running,\n"
                 "this lock file shouldn't be here and is safe to delete.";
    int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
    if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");

    cache_path = resource_path + "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (verbosity >= QUDA_SUMMARIZE) {
      printfQuda("Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
    }

    time(&now);
    cache_file << "tunecache\t" << quda_version << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;
    cache_file << "volume\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\tcomment" << std::endl;
    serializeTuneCache(cache_file);
    cache_file.close();

    // Release lock.
    close(lock_handle);
    remove(lock_path.c_str());

#ifdef MULTI_GPU
  }
#endif
}
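/*
 * The locking above relies on open() with O_CREAT|O_EXCL, which atomically
 * fails if the lock file already exists. A minimal standalone sketch of this
 * pattern (function and path names hypothetical, not part of QUDA):
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

static bool update_shared_file(const char *lock_path)
{
  int fd = open(lock_path, O_WRONLY | O_CREAT | O_EXCL, 0666);
  if (fd == -1) return false; // another process holds the lock (or a stale lock file remains)
  // ... critical section: read/modify/write the shared file ...
  close(fd);
  remove(lock_path); // release the lock
  return true;
}
#endif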
/**
 * Return the launch parameters for a given kernel, either by retrieving them
 * from the tunecache or by autotuning on the spot.
 */
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
{
  static bool tuning = false;           // tuning in progress?
  static const Tunable *active_tunable; // for error checking
  static TuneParam param;

  TuneParam best_param;
  cudaError_t error;
  cudaEvent_t start, end;
  float elapsed_time, best_time;
  time_t now;

  const TuneKey key = tunable.tuneKey();

  if (enabled == QUDA_TUNE_NO) {
    tunable.defaultTuneParam(param);
  } else if (tunecache.count(key)) {
    param = tunecache[key];
  } else if (!tuning) {

    tuning = true;
    active_tunable = &tunable;
    best_time = FLT_MAX;
    tunable.preTune();

    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (verbosity >= QUDA_DEBUG_VERBOSE) {
      printfQuda("Tuning %s with %s at vol=%s\n", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
    }

    tunable.initTuneParam(param);
    while (tuning) {
      cudaThreadSynchronize();
      cudaGetLastError(); // clear error counter
      cudaEventRecord(start, 0);
      for (int i=0; i<tunable.tuningIter(); i++) {
        tunable.apply(0); // calls tuneLaunch() again, which simply returns the currently active param
      }
      cudaEventRecord(end, 0);
      cudaEventSynchronize(end);
      cudaEventElapsedTime(&elapsed_time, start, end);
      cudaThreadSynchronize();
      error = cudaGetLastError();
      elapsed_time /= (1e3 * tunable.tuningIter()); // convert ms to seconds and average over iterations
      if ((elapsed_time < best_time) && (error == cudaSuccess)) {
        best_time = elapsed_time;
        best_param = param;
      }
      if (verbosity >= QUDA_DEBUG_VERBOSE) {
        if (error == cudaSuccess) {
          printfQuda("    %s gives %s\n", tunable.paramString(param).c_str(),
                     tunable.perfString(elapsed_time).c_str());
        } else {
          printfQuda("    %s gives %s\n", tunable.paramString(param).c_str(), cudaGetErrorString(error));
        }
      }
      tuning = tunable.advanceTuneParam(param);
    }

    if (best_time == FLT_MAX) {
      errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name.c_str(), key.aux.c_str(), key.volume.c_str());
    }
    if (verbosity >= QUDA_VERBOSE) {
      printfQuda("Tuned %s giving %s", tunable.paramString(best_param).c_str(), tunable.perfString(best_time).c_str());
      printfQuda(" for %s with %s\n", key.name.c_str(), key.aux.c_str());
    }
    time(&now);
    best_param.comment = "# " + tunable.perfString(best_time) + ", tuned ";
    best_param.comment += ctime(&now); // includes a newline

    cudaEventDestroy(start);
    cudaEventDestroy(end);

    tunable.postTune();
    param = best_param;
    tunecache[key] = best_param;

  } else if (&tunable != active_tunable) {
    errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
  }

  return param;
}
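/*
 * For illustration, a Tunable subclass is expected to call tuneLaunch() from
 * its apply() method; during autotuning the recursive call simply returns the
 * currently active trial parameters, which are then used to launch the kernel.
 * The sketch below shows the assumed usage pattern; MyKernelTunable, myKernel,
 * and the hard-coded tuning/verbosity arguments are hypothetical, not QUDA code.
 */
#if 0
class MyKernelTunable : public Tunable {
  // ... tuneKey(), initTuneParam(), advanceTuneParam(), preTune(), postTune(), etc. ...

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, QUDA_TUNE_YES, QUDA_VERBOSE);
    myKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(/* kernel arguments */);
  }
};
#endif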