26 static TuneKey last_key;
34 typedef std::map<TuneKey, TuneParam>
map;
83 static std::list<TraceKey> trace_list;
84 static int enable_trace = 0;
88 static bool init =
false;
91 char *enable_trace_env = getenv(
"QUDA_ENABLE_TRACE");
92 if (enable_trace_env) {
93 if (strcmp(enable_trace_env,
"1") == 0) {
96 }
else if (strcmp(enable_trace_env,
"2") == 0) {
106 void postTrace_(
const char *func,
const char *file,
int line)
117 trace_list.push_back(trace_entry);
123 static map tunecache;
124 static map::iterator it;
125 static size_t initial_cache_size = 0;
128 #define STR(x) STR_(x)
135 static bool tuning =
false;
139 static bool profile_count =
true;
149 static void deserializeTuneCache(std::istream &in)
152 std::stringstream ls;
165 if (!line.length())
continue;
169 check = snprintf(key.volume, key.volume_n,
"%s", v.c_str());
170 if (check < 0 || check >= key.volume_n)
errorQuda(
"Error writing volume string (check = %d)", check);
171 check = snprintf(key.name, key.name_n,
"%s", n.c_str());
172 if (check < 0 || check >= key.name_n)
errorQuda(
"Error writing name string (check=%d)", check);
173 check = snprintf(key.aux, key.aux_n,
"%s", a.c_str());
174 if (check < 0 || check >= key.aux_n)
errorQuda(
"Error writing aux string (check=%d)", check);
178 getline(ls,
param.comment);
179 param.comment +=
"\n";
180 tunecache[key] =
param;
187 static void serializeTuneCache(std::ostream &out)
191 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
192 TuneKey key = entry->first;
193 TuneParam
param = entry->second;
195 out << std::setw(16) << key.volume <<
"\t" << key.name <<
"\t" << key.aux <<
"\t";
196 out <<
param.block.x <<
"\t" <<
param.block.y <<
"\t" <<
param.block.z <<
"\t";
197 out <<
param.grid.x <<
"\t" <<
param.grid.y <<
"\t" <<
param.grid.z <<
"\t";
198 out <<
param.shared_bytes <<
"\t" <<
param.aux.x <<
"\t" <<
param.aux.y <<
"\t" <<
param.aux.z <<
"\t"
199 <<
param.aux.w <<
"\t";
207 return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
214 static void serializeProfile(std::ostream &out, std::ostream &async_out)
217 double total_time = 0.0;
218 double async_total_time = 0.0;
221 typedef std::pair<TuneKey, TuneParam> profile_t;
222 typedef std::priority_queue<profile_t, std::deque<profile_t>, less_significant<profile_t>> queue_t;
223 queue_t q(tunecache.begin(), tunecache.end());
226 for (entry = tunecache.begin(); entry != tunecache.end(); entry++) {
227 TuneKey key = entry->first;
228 TuneParam
param = entry->second;
232 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
233 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
234 if (
param.n_calls > 0 && !is_policy) total_time +=
param.n_calls *
param.time;
235 if (
param.n_calls > 0 && is_policy) async_total_time +=
param.n_calls *
param.time;
239 TuneKey key = q.top().first;
240 TuneParam
param = q.top().second;
244 bool is_policy_kernel = strncmp(
tmp,
"policy_kernel", 13) == 0 ? true :
false;
245 bool is_policy = (strncmp(
tmp,
"policy", 6) == 0 && !is_policy_kernel) ?
true :
false;
246 bool is_nested_policy = (strncmp(
tmp,
"nested_policy", 6) == 0) ?
true :
false;
249 if (
param.n_calls > 0 && !is_policy && !is_nested_policy) {
252 out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12) << (time / total_time) * 100 <<
"\t";
253 out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
254 << key.volume <<
"\t";
255 out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
259 if (
param.n_calls > 0 && is_policy) {
262 async_out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12)
263 << (time / async_total_time) * 100 <<
"\t";
264 async_out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16)
265 << key.volume <<
"\t";
266 async_out << key.name <<
"\t" << key.aux <<
"\t" <<
param.comment;
272 out << std::endl <<
"# Total time spent in kernels = " << total_time <<
" seconds" << std::endl;
273 async_out << std::endl
274 <<
"# Total time spent in asynchronous execution = " << async_total_time <<
" seconds" << std::endl;
280 static void serializeTrace(std::ostream &out)
282 for (
auto it = trace_list.begin(); it != trace_list.end(); it++) {
284 TuneKey &key = it->key;
289 bool is_policy_kernel = strcmp(
tmp,
"policy_kernel") == 0 ? true :
false;
291 out << std::setw(12) << it->time <<
"\t";
292 out << std::setw(12) << it->device_bytes <<
"\t";
293 out << std::setw(12) << it->pinned_bytes <<
"\t";
294 out << std::setw(12) << it->mapped_bytes <<
"\t";
295 out << std::setw(12) << it->host_bytes <<
"\t";
296 out << std::setw(16) << key.volume <<
"\t";
297 if (is_policy_kernel) out <<
"\t";
298 out << key.name <<
"\t";
299 if (!is_policy_kernel) out <<
"\t";
300 out << key.aux << std::endl;
307 static void broadcastTuneCache()
310 std::stringstream serialized;
314 serializeTuneCache(serialized);
315 size = serialized.str().length();
323 char *serstr =
new char[size + 1];
326 serialized.str(serstr);
327 deserializeTuneCache(serialized);
347 std::ifstream cache_file;
348 std::stringstream ls;
350 path = getenv(
"QUDA_RESOURCE_PATH");
353 warningQuda(
"Environment variable QUDA_RESOURCE_PATH is not set.");
354 warningQuda(
"Caching of tuned parameters will be disabled.");
356 }
else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
357 warningQuda(
"The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
358 warningQuda(
"Caching of tuned parameters will be disabled.");
361 resource_path = path;
364 bool version_check =
true;
365 char *override_version_env = getenv(
"QUDA_TUNE_VERSION_CHECK");
366 if (override_version_env && strcmp(override_version_env,
"0") == 0) {
367 version_check =
false;
368 warningQuda(
"Disabling QUDA tunecache version check");
375 cache_path = resource_path;
376 cache_path +=
"/tunecache.tsv";
377 cache_file.open(cache_path.c_str());
381 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
382 getline(cache_file, line);
385 if (token.compare(
"tunecache"))
errorQuda(
"Bad format in %s", cache_path.c_str());
387 if (version_check && token.compare(quda_version))
388 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
389 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
393 if (version_check && token.compare(
gitversion))
394 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
395 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
398 if (version_check && token.compare(quda_version))
399 errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the "
400 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
404 if (version_check && token.compare(quda_hash))
405 errorQuda(
"Cache file %s does not match current QUDA build. \nPlease delete this file or set the "
406 "QUDA_RESOURCE_PATH environment variable to point to a new path.",
409 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
410 getline(cache_file, line);
412 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
413 getline(cache_file, line);
415 deserializeTuneCache(cache_file);
418 initial_cache_size = tunecache.size();
421 printfQuda(
"Loaded %d sets of cached parameters from %s\n",
static_cast<int>(initial_cache_size),
426 warningQuda(
"Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
433 broadcastTuneCache();
444 std::ofstream cache_file;
446 if (resource_path.empty())
return;
456 if (tunecache.size() == initial_cache_size && !error)
return;
460 lock_path = resource_path +
"/tunecache.lock";
461 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
462 if (lock_handle == -1) {
463 warningQuda(
"Unable to lock cache file. Tuned launch parameters will not be cached to disk. "
464 "If you are certain that no other instances of QUDA are accessing this filesystem, "
465 "please manually remove %s",
469 char msg[] =
"If no instances of applications using QUDA are running,\n"
470 "this lock file shouldn't be here and is safe to delete.";
471 int stat = write(lock_handle, msg,
sizeof(msg));
472 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
474 cache_path = resource_path + (error ?
"/tunecache_error.tsv" :
"/tunecache.tsv");
475 cache_file.open(cache_path.c_str());
478 printfQuda(
"Saving %d sets of cached parameters to %s\n",
static_cast<int>(tunecache.size()), cache_path.c_str());
482 cache_file <<
"tunecache\t" << quda_version;
486 cache_file <<
"\t" << quda_version;
488 cache_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
489 cache_file << std::setw(16) <<
"volume"
490 <<
"\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux."
491 "z\taux.w\ttime\tcomment"
493 serializeTuneCache(cache_file);
498 remove(lock_path.c_str());
500 initial_cache_size = tunecache.size();
506 if (error) sleep(10);
511 static bool policy_tuning =
false;
516 static bool uber_tuning =
false;
524 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
536 std::string lock_path, profile_path, async_profile_path, trace_path;
537 std::ofstream profile_file, async_profile_file, trace_file;
539 if (resource_path.empty())
return;
547 lock_path = resource_path +
"/profile.lock";
548 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
549 if (lock_handle == -1) {
550 warningQuda(
"Unable to lock profile file. Profile will not be saved to disk. "
551 "If you are certain that no other instances of QUDA are accessing this filesystem, "
552 "please manually remove %s",
556 char msg[] =
"If no instances of applications using QUDA are running,\n"
557 "this lock file shouldn't be here and is safe to delete.";
558 int stat = write(lock_handle, msg,
sizeof(msg));
559 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
562 static int count = 0;
564 char *profile_fname = getenv(
"QUDA_PROFILE_OUTPUT_BASE");
566 if (!profile_fname) {
568 "Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
569 profile_path = resource_path +
"/profile_" + std::to_string(count) +
".tsv";
570 async_profile_path = resource_path +
"/profile_async_" + std::to_string(count) +
".tsv";
571 if (
traceEnabled()) trace_path = resource_path +
"/trace_" + std::to_string(count) +
".tsv";
573 profile_path = resource_path +
"/" + profile_fname +
"_" + std::to_string(count) +
".tsv";
574 async_profile_path = resource_path +
"/" + profile_fname +
"_" + std::to_string(count) +
"_async.tsv";
576 trace_path = resource_path +
"/" + profile_fname +
"_trace_" + std::to_string(count) +
".tsv";
581 profile_file.open(profile_path.c_str());
582 async_profile_file.open(async_profile_path.c_str());
583 if (
traceEnabled()) trace_file.open(trace_path.c_str());
589 for (map::iterator entry = tunecache.begin(); entry != tunecache.end(); entry++) {
594 bool is_policy = strcmp(
tmp,
"policy") == 0 ? true :
false;
595 if (
param.n_calls > 0 && !is_policy) n_entry++;
596 if (
param.n_calls > 0 && is_policy) n_policy++;
599 printfQuda(
"Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
600 printfQuda(
"Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
602 printfQuda(
"Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
607 std::string Label = label.empty() ?
"profile" : label;
609 profile_file << Label <<
"\t" << quda_version;
613 profile_file <<
"\t" << quda_version;
615 profile_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
616 profile_file << std::setw(12) <<
"total time"
617 <<
"\t" << std::setw(12) <<
"percentage"
618 <<
"\t" << std::setw(12) <<
"calls"
619 <<
"\t" << std::setw(12) <<
"time / call"
620 <<
"\t" << std::setw(16) <<
"volume"
621 <<
"\tname\taux\tcomment" << std::endl;
623 async_profile_file << Label <<
"\t" << quda_version;
627 async_profile_file <<
"\t" << quda_version;
629 async_profile_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
630 async_profile_file << std::setw(12) <<
"total time"
631 <<
"\t" << std::setw(12) <<
"percentage"
632 <<
"\t" << std::setw(12) <<
"calls"
633 <<
"\t" << std::setw(12) <<
"time / call"
634 <<
"\t" << std::setw(16) <<
"volume"
635 <<
"\tname\taux\tcomment" << std::endl;
637 serializeProfile(profile_file, async_profile_file);
639 profile_file.close();
640 async_profile_file.close();
643 trace_file <<
"trace"
644 <<
"\t" << quda_version;
648 trace_file <<
"\t" << quda_version;
650 trace_file <<
"\t" << quda_hash <<
"\t# Last updated " << ctime(&now) << std::endl;
652 trace_file << std::setw(12) <<
"time\t" << std::setw(12) <<
"device-mem\t" << std::setw(12) <<
"pinned-mem\t";
653 trace_file << std::setw(12) <<
"mapped-mem\t" << std::setw(12) <<
"host-mem\t";
654 trace_file << std::setw(16) <<
"volume"
655 <<
"\tname\taux" << std::endl;
657 serializeTrace(trace_file);
664 remove(lock_path.c_str());
671 static TimeProfile launchTimer(
"tuneLaunch");
693 static const Tunable *active_tunable;
694 it = tunecache.find(key);
719 if (!tuning && profile_count) param_tuned.
n_calls++;
728 trace_list.push_back(trace_entry);
750 return param_default;
751 }
else if (!tuning) {
758 cudaError_t error = cudaSuccess;
760 float elapsed_time, best_time;
764 active_tunable = &tunable;
770 cudaEventCreate(&
start);
771 cudaEventCreate(&
end);
778 tune_timer.
Start(__func__, __FILE__, __LINE__);
782 cudaDeviceSynchronize();
786 printfQuda(
"About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
792 cudaEventRecord(
start, 0);
793 for (
int i = 0; i < tunable.
tuningIter(); i++) {
796 cudaEventRecord(
end, 0);
797 cudaEventSynchronize(
end);
798 cudaEventElapsedTime(&elapsed_time,
start,
end);
799 cudaDeviceSynchronize();
800 error = cudaGetLastError();
803 cudaDeviceSynchronize();
804 cudaError_t error = cudaGetLastError();
805 if (error != cudaSuccess)
errorQuda(
"Failed to clear error state %s\n", cudaGetErrorString(error));
809 if ((elapsed_time < best_time) && (error == cudaSuccess) && (tunable.
jitifyError() == CUDA_SUCCESS)) {
810 best_time = elapsed_time;
814 if (error == cudaSuccess && tunable.
jitifyError() == CUDA_SUCCESS) {
833 tune_timer.
Stop(__func__, __FILE__, __LINE__);
835 if (best_time == FLT_MAX) {
844 best_param.
comment +=
", tuning took " + std::to_string(tune_timer.
Last()) +
" seconds at ";
845 best_param.
comment += ctime(&now);
846 best_param.
time = best_time;
848 cudaEventDestroy(
start);
849 cudaEventDestroy(
end);
856 tunecache[key] = best_param;
861 if (tunecache.find(key) == tunecache.end()) {
864 param = tunecache[key];
868 trace_list.push_back(trace_entry);
871 }
else if (&tunable != active_tunable) {
872 errorQuda(
"Unexpected call to tuneLaunch() in %s::apply()",
typeid(tunable).name());
875 param.n_calls = profile_count ? 1 : 0;
virtual std::string perfString(float time) const
virtual std::string paramString(const TuneParam ¶m) const
CUresult jitifyError() const
virtual bool advanceTuneParam(TuneParam ¶m) const
virtual void initTuneParam(TuneParam ¶m) const
virtual TuneKey tuneKey() const =0
virtual void apply(const qudaStream_t &stream)=0
void checkLaunchParam(TuneParam ¶m)
virtual void defaultTuneParam(TuneParam ¶m) const
virtual int tuningIter() const
bool commGlobalReduction()
int comm_rank_global(void)
void comm_broadcast_global(void *data, size_t nbytes)
These routine broadcast the data according to the default communicator.
cudaColorSpinorField * tmp
enum QudaVerbosity_s QudaVerbosity
void init()
Create the BLAS context.
void start()
Start profiling.
void disableProfileCount()
Disable the profile kernel counting.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
void saveTuneCache(bool error=false)
bool policyTuning()
Query whether we are currently tuning a policy.
void setUberTuning(bool)
Enable / disable whether we are tuning an uber kernel.
std::map< TuneKey, TuneParam > map
size_t host_allocated_peak()
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
bool activeTuning()
query if tuning is in progress
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
void flushProfile()
Flush profile contents, setting all counts to zero.
size_t mapped_allocated_peak()
void i32toa(char *buffer, int32_t value)
bool use_managed_memory()
size_t pinned_allocated_peak()
void enableProfileCount()
Enable the profile kernel counting.
size_t device_allocated_peak()
void saveProfile(const std::string label="")
Save profile to disk.
bool uberTuning()
Query whether we are tuning an uber kernel.
Main header file for the QUDA library.
#define QUDA_VERSION_SUBMINOR
#define QUDA_VERSION_MAJOR
#define QUDA_VERSION_MINOR
void Stop(const char *func, const char *file, int line)
void Start(const char *func, const char *file, int line)
TraceKey(const TraceKey &trace)
TraceKey & operator=(const TraceKey &trace)
TraceKey(const TuneKey &key, float time)
bool operator()(const T &lhs, const T &rhs)
quda::TuneKey getLastTuneKey()
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaVerbosity getVerbosity()