28 typedef std::map<TuneKey, TuneParam>
map;
43 : key(key), time(time),
50 : key(trace.key), time(trace.time),
51 device_bytes(trace.device_bytes),
52 pinned_bytes(trace.pinned_bytes),
53 mapped_bytes(trace.mapped_bytes),
54 host_bytes(trace.host_bytes) { }
74 static bool init =
false;
77 char *enable_trace_env = getenv(
"QUDA_ENABLE_TRACE");
78 if (enable_trace_env) {
79 if (strcmp(enable_trace_env,
"1") == 0) {
82 }
else if (strcmp(enable_trace_env,
"2") == 0) {
92 void postTrace_(
const char *func,
const char *file,
int line) {
102 trace_list.push_back(trace_entry);
109 static map::iterator
it;
113 #define STR(x) STR_(x) 137 std::stringstream ls;
150 if (!line.length())
continue;
155 if (check < 0 || check >= key.
volume_n)
errorQuda(
"Error writing volume string (check = %d)", check);
156 check = snprintf(key.
name, key.
name_n,
"%s", n.c_str());
157 if (check < 0 || check >= key.
name_n)
errorQuda(
"Error writing name string (check=%d)", check);
158 check = snprintf(key.
aux, key.
aux_n,
"%s", a.c_str());
159 if (check < 0 || check >= key.
aux_n)
errorQuda(
"Error writing aux string (check=%d)", check);
176 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
180 out << std::setw(16) << key.
volume <<
"\t" << key.
name <<
"\t" << key.
aux <<
"\t";
181 out << param.
block.x <<
"\t" << param.
block.y <<
"\t" << param.
block.z <<
"\t";
182 out << param.
grid.x <<
"\t" << param.
grid.y <<
"\t" << param.
grid.z <<
"\t";
183 out << param.
shared_bytes <<
"\t" << param.
aux.x <<
"\t" << param.
aux.y <<
"\t" << param.
aux.z <<
"\t" << param.
aux.w <<
"\t";
192 return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
202 double total_time = 0.0;
203 double async_total_time = 0.0;
206 typedef std::pair<TuneKey, TuneParam> profile_t;
208 queue_t q(tunecache.begin(), tunecache.end());
211 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
216 strncpy(tmp, key.
aux, 13);
217 bool is_policy_kernel = strncmp(tmp,
"policy_kernel", 13) == 0 ? true :
false;
218 bool is_policy = (strncmp(tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
224 while ( !q.empty() ) {
230 bool is_policy_kernel = strncmp(tmp,
"policy_kernel", 13) == 0 ? true :
false;
231 bool is_policy = (strncmp(tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
234 if (param.
n_calls > 0 && !is_policy) {
237 out << std::setw(12) << param.
n_calls * param.
time <<
"\t" << std::setw(12) << (time / total_time) * 100 <<
"\t";
238 out << std::setw(12) << param.
n_calls <<
"\t" << std::setw(12) << param.
time <<
"\t" << std::setw(16) << key.
volume <<
"\t";
243 if (param.
n_calls > 0 && is_policy) {
246 async_out << std::setw(12) << param.
n_calls * param.
time <<
"\t" << std::setw(12) << (time / async_total_time) * 100 <<
"\t";
247 async_out << std::setw(12) << param.
n_calls <<
"\t" << std::setw(12) << param.
time <<
"\t" << std::setw(16) << key.
volume <<
"\t";
248 async_out << key.
name <<
"\t" << key.
aux <<
"\t" << param.
comment;
254 out << std::endl <<
"# Total time spent in kernels = " << total_time <<
" seconds" << std::endl;
255 async_out << std::endl <<
"# Total time spent in asynchronous execution = " << async_total_time <<
" seconds" << std::endl;
263 for (
auto it = trace_list.begin(); it != trace_list.end(); it++) {
270 bool is_policy_kernel = strcmp(tmp,
"policy_kernel") == 0 ? true :
false;
272 out << std::setw(12) << it->time <<
"\t";
273 out << std::setw(12) << it->device_bytes <<
"\t";
274 out << std::setw(12) << it->pinned_bytes <<
"\t";
275 out << std::setw(12) << it->mapped_bytes <<
"\t";
276 out << std::setw(12) << it->host_bytes <<
"\t";
277 out << std::setw(16) << key.
volume <<
"\t";
278 if (is_policy_kernel) out <<
"\t";
279 out << key.
name <<
"\t";
280 if (!is_policy_kernel) out <<
"\t";
281 out << key.
aux << std::endl;
294 std::stringstream serialized;
299 size = serialized.str().length();
305 comm_broadcast(const_cast<char *>(serialized.str().c_str()), size);
307 char *serstr =
new char[size+1];
310 serialized.str(serstr);
331 std::string cache_path, line, token;
332 std::ifstream cache_file;
333 std::stringstream ls;
335 path = getenv(
"QUDA_RESOURCE_PATH");
338 warningQuda(
"Environment variable QUDA_RESOURCE_PATH is not set.");
339 warningQuda(
"Caching of tuned parameters will be disabled.");
341 }
else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
342 warningQuda(
"The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
343 warningQuda(
"Caching of tuned parameters will be disabled.");
346 resource_path = path;
349 bool version_check =
true;
350 char *override_version_env = getenv(
"QUDA_TUNE_VERSION_CHECK");
351 if (override_version_env && strcmp(override_version_env,
"0") == 0) {
352 version_check =
false;
353 warningQuda(
"Disabling QUDA tunecache version check");
361 cache_path +=
"/tunecache.tsv";
362 cache_file.open(cache_path.c_str());
366 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
367 getline(cache_file, line);
370 if (token.compare(
"tunecache"))
errorQuda(
"Bad format in %s", cache_path.c_str());
372 if (version_check && token.compare(quda_version))
373 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the " 374 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
378 if (version_check && token.compare(
gitversion))
379 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the " 380 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
383 if (version_check && token.compare(quda_version))
384 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the " 385 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
389 if (version_check && token.compare(quda_hash))
390 errorQuda(
"Cache file %s does not match current QUDA build. \nPlease delete this file or set the " 391 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
394 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
395 getline(cache_file, line);
397 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
398 getline(cache_file, line);
403 initial_cache_size = tunecache.size();
406 printfQuda(
"Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
411 warningQuda(
"Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
430 std::string lock_path, cache_path;
431 std::ofstream cache_file;
433 if (resource_path.empty())
return;
443 if (tunecache.size() == initial_cache_size && !error)
return;
447 lock_path = resource_path +
"/tunecache.lock";
448 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
449 if (lock_handle == -1) {
450 warningQuda(
"Unable to lock cache file. Tuned launch parameters will not be cached to disk. " 451 "If you are certain that no other instances of QUDA are accessing this filesystem, " 452 "please manually remove %s", lock_path.c_str());
455 char msg[] =
"If no instances of applications using QUDA are running,\n" 456 "this lock file shouldn't be here and is safe to delete.";
457 int stat =
write(lock_handle, msg,
sizeof(msg));
458 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
460 cache_path = resource_path + (error ?
"/tunecache_error.tsv" :
"/tunecache.tsv");
461 cache_file.open(cache_path.c_str());
464 printfQuda(
"Saving %d sets of cached parameters to %s\n", static_cast<int>(tunecache.size()), cache_path.c_str());
474 cache_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
475 cache_file << std::setw(16) <<
"volume" <<
"\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux.z\taux.w\ttime\tcomment" << std::endl;
481 remove(lock_path.c_str());
483 initial_cache_size = tunecache.size();
489 if (error) sleep(10);
500 policy_tuning = policy_tuning_;
506 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
518 std::string lock_path, profile_path, async_profile_path, trace_path;
519 std::ofstream profile_file, async_profile_file, trace_file;
521 if (resource_path.empty())
return;
529 lock_path = resource_path +
"/profile.lock";
530 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
531 if (lock_handle == -1) {
532 warningQuda(
"Unable to lock profile file. Profile will not be saved to disk. " 533 "If you are certain that no other instances of QUDA are accessing this filesystem, " 534 "please manually remove %s", lock_path.c_str());
537 char msg[] =
"If no instances of applications using QUDA are running,\n" 538 "this lock file shouldn't be here and is safe to delete.";
539 int stat =
write(lock_handle, msg,
sizeof(msg));
540 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
543 static int count = 0;
545 char *profile_fname = getenv(
"QUDA_PROFILE_OUTPUT_BASE");
547 if (!profile_fname) {
548 warningQuda(
"Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
549 profile_path = resource_path +
"/profile_" + std::to_string(count) +
".tsv";
550 async_profile_path = resource_path +
"/profile_async_" + std::to_string(count) +
".tsv";
551 if (
traceEnabled()) trace_path = resource_path +
"/trace_" + std::to_string(count) +
".tsv";
553 profile_path = resource_path +
"/" + profile_fname +
"_" + std::to_string(count) +
".tsv";
554 async_profile_path = resource_path +
"/" + profile_fname +
"_" + std::to_string(count) +
"_async.tsv";
555 if (
traceEnabled()) trace_path = resource_path +
"/" + profile_fname +
"_trace_" + std::to_string(count) +
".tsv";
560 profile_file.open(profile_path.c_str());
561 async_profile_file.open(async_profile_path.c_str());
562 if (
traceEnabled()) trace_file.open(trace_path.c_str());
568 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
573 bool is_policy = strcmp(tmp,
"policy") == 0 ? true :
false;
574 if (param.
n_calls > 0 && !is_policy) n_entry++;
575 if (param.
n_calls > 0 && is_policy) n_policy++;
578 printfQuda(
"Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
579 printfQuda(
"Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
580 if (
traceEnabled())
printfQuda(
"Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
585 std::string Label = label.empty() ?
"profile" : label;
593 profile_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
594 profile_file << std::setw(12) <<
"total time" <<
"\t" << std::setw(12) <<
"percentage" <<
"\t" << std::setw(12) <<
"calls" <<
"\t" << std::setw(12) <<
"time / call" <<
"\t" << std::setw(16) <<
"volume" <<
"\tname\taux\tcomment" << std::endl;
602 async_profile_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
603 async_profile_file << std::setw(12) <<
"total time" <<
"\t" << std::setw(12) <<
"percentage" <<
"\t" << std::setw(12) <<
"calls" <<
"\t" << std::setw(12) <<
"time / call" <<
"\t" << std::setw(16) <<
"volume" <<
"\tname\taux\tcomment" << std::endl;
607 profile_file.close();
608 async_profile_file.close();
617 trace_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
619 trace_file << std::setw(12) <<
"time\t" << std::setw(12) <<
"device-mem\t" << std::setw(12) <<
"pinned-mem\t";
620 trace_file << std::setw(12) <<
"mapped-mem\t" << std::setw(12) <<
"host-mem\t";
621 trace_file << std::setw(16) <<
"volume" <<
"\tname\taux" << std::endl;
630 remove(lock_path.c_str());
660 static const Tunable *active_tunable;
661 it = tunecache.find(key);
674 printfQuda(
"Launching %s with %s at vol=%s with %s\n",
686 if (!tuning && profile_count) param.
n_calls++;
695 trace_list.push_back(trace_entry);
710 printfQuda(
"Launching %s with %s at vol=%s with %s (untuned)\n",
713 }
else if (!tuning) {
720 cudaError_t error = cudaSuccess;
721 cudaEvent_t start,
end;
722 float elapsed_time, best_time;
726 active_tunable = &tunable;
732 cudaEventCreate(&start);
733 cudaEventCreate(&end);
740 tune_timer.
Start(__func__, __FILE__, __LINE__);
744 cudaDeviceSynchronize();
749 printfQuda(
"About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
756 cudaEventRecord(start, 0);
760 cudaEventRecord(end, 0);
761 cudaEventSynchronize(end);
762 cudaEventElapsedTime(&elapsed_time, start, end);
763 cudaDeviceSynchronize();
764 error = cudaGetLastError();
767 cudaDeviceSynchronize();
768 cudaError_t error = cudaGetLastError();
769 if (error != cudaSuccess)
errorQuda(
"Failed to clear error state %s\n", cudaGetErrorString(error));
773 if ( (elapsed_time < best_time) && (error == cudaSuccess) && (tunable.
jitifyError() == CUDA_SUCCESS) ) {
774 best_time = elapsed_time;
778 if (error == cudaSuccess && tunable.
jitifyError() == CUDA_SUCCESS) {
797 tune_timer.
Stop(__func__, __FILE__, __LINE__);
799 if (best_time == FLT_MAX) {
808 best_param.
comment +=
", tuning took " + std::to_string(tune_timer.
Last()) +
" seconds at ";
809 best_param.
comment += ctime(&now);
810 best_param.
time = best_time;
812 cudaEventDestroy(start);
813 cudaEventDestroy(end);
818 tunecache[
key] = best_param;
824 if (tunecache.find(key) == tunecache.end()) {
827 param = tunecache[
key];
831 trace_list.push_back(trace_entry);
834 }
else if (&tunable != active_tunable) {
835 errorQuda(
"Unexpected call to tuneLaunch() in %s::apply()",
typeid(tunable).name());
838 param.
n_calls = profile_count ? 1 : 0;
CUresult jitifyError() const
long device_allocated_peak()
virtual int tuningIter() const
void disableProfileCount()
Disable the profile kernel counting.
QudaVerbosity getVerbosity()
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
cudaColorSpinorField * tmp
void saveTuneCache(bool error=false)
#define QUDA_VERSION_MINOR
static void broadcastTuneCache()
virtual std::string paramString(const TuneParam ¶m) const
quda::TuneKey getLastTuneKey()
static std::list< TraceKey > trace_list
virtual TuneKey tuneKey() const =0
static size_t initial_cache_size
std::map< TuneKey, TuneParam > map
void enableProfileCount()
Enable the profile kernel counting.
void i32toa(char *buffer, int32_t value)
static const std::string quda_hash
TraceKey(const TuneKey &key, float time)
static bool profile_count
static void deserializeTuneCache(std::istream &in)
void flushProfile()
Flush profile contents, setting all counts to zero.
static bool policy_tuning
static const std::string quda_version
bool operator()(const T &lhs, const T &rhs)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define QUDA_VERSION_SUBMINOR
bool commGlobalReduction()
void Start(const char *func, const char *file, int line)
void saveProfile(const std::string label="")
Save profile to disk.
long host_allocated_peak()
static void serializeTuneCache(std::ostream &out)
long mapped_allocated_peak()
void init()
Create the CUBLAS context.
static std::string resource_path
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
bool activeTuning()
query if tuning is in progress
TraceKey(const TraceKey &trace)
TraceKey & operator=(const TraceKey &trace)
static void serializeTrace(std::ostream &out)
void comm_broadcast(void *data, size_t nbytes)
cpuColorSpinorField * out
long pinned_allocated_peak()
Main header file for the QUDA library.
void checkLaunchParam(TuneParam ¶m)
virtual void initTuneParam(TuneParam ¶m) const
enum QudaVerbosity_s QudaVerbosity
static void serializeProfile(std::ostream &out, std::ostream &async_out)
static TimeProfile launchTimer("tuneLaunch")
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
void Stop(const char *func, const char *file, int line)
static const int volume_n
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
#define QUDA_VERSION_MAJOR
virtual std::string perfString(float time) const
virtual void apply(const cudaStream_t &stream)=0
virtual void defaultTuneParam(TuneParam ¶m) const
virtual bool advanceTuneParam(TuneParam ¶m) const