30 typedef std::map<TuneKey, TuneParam>
map;
76 static bool init =
false;
79 char *enable_trace_env =
getenv(
"QUDA_ENABLE_TRACE");
80 if (enable_trace_env &&
strcmp(enable_trace_env,
"1") == 0) {
91 static map::iterator
it;
95 #define STR(x) STR_(x) 119 std::stringstream ls;
132 if (!line.length())
continue;
137 if (check < 0 || check >= key.
volume_n)
errorQuda(
"Error writing volume string (check = %d)", check);
139 if (check < 0 || check >= key.
name_n)
errorQuda(
"Error writing name string (check=%d)", check);
141 if (check < 0 || check >= key.
aux_n)
errorQuda(
"Error writing aux string (check=%d)", check);
145 param.comment +=
"\n";
162 out << std::setw(16) << key.
volume <<
"\t" << key.
name <<
"\t" << key.
aux <<
"\t";
174 return lhs.second.time * lhs.second.n_calls < rhs.second.time * rhs.second.n_calls;
184 double total_time = 0.0;
185 double async_total_time = 0.0;
188 typedef std::pair<TuneKey, TuneParam> profile_t;
199 bool is_policy =
strcmp(
tmp,
"policy") == 0 ? true :
false;
200 if (
param.n_calls > 0 && !is_policy) total_time +=
param.n_calls *
param.time;
201 if (
param.n_calls > 0 && is_policy) async_total_time +=
param.n_calls *
param.time;
205 while ( !q.empty() ) {
211 bool is_policy =
strcmp(
tmp,
"policy") == 0 ? true :
false;
214 if (
param.n_calls > 0 && !is_policy) {
217 out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12) << (
time / total_time) * 100 <<
"\t";
218 out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16) << key.
volume <<
"\t";
223 if (
param.n_calls > 0 && is_policy) {
226 async_out << std::setw(12) <<
param.n_calls *
param.time <<
"\t" << std::setw(12) << (
time / async_total_time) * 100 <<
"\t";
227 async_out << std::setw(12) <<
param.n_calls <<
"\t" << std::setw(12) <<
param.time <<
"\t" << std::setw(16) << key.
volume <<
"\t";
228 async_out << key.
name <<
"\t" << key.
aux <<
"\t" <<
param.comment;
234 out << std::endl <<
"# Total time spent in kernels = " << total_time <<
" seconds" << std::endl;
235 async_out << std::endl <<
"# Total time spent in asynchronous execution = " << async_total_time <<
" seconds" << std::endl;
250 bool is_policy_kernel =
strcmp(
tmp,
"policy_kernel") == 0 ? true :
false;
252 out << std::setw(12) <<
it->time <<
"\t";
253 out << std::setw(12) <<
it->device_bytes <<
"\t";
254 out << std::setw(12) <<
it->pinned_bytes <<
"\t";
255 out << std::setw(12) <<
it->mapped_bytes <<
"\t";
256 out << std::setw(12) <<
it->host_bytes <<
"\t";
257 out << std::setw(16) << key.
volume <<
"\t";
258 if (is_policy_kernel)
out <<
"\t";
260 if (!is_policy_kernel)
out <<
"\t";
261 out << key.
aux << std::endl;
274 std::stringstream serialized;
279 size = serialized.str().length();
287 char *serstr =
new char[
size+1];
290 serialized.str(serstr);
311 std::string cache_path, line, token;
312 std::ifstream cache_file;
313 std::stringstream ls;
315 path =
getenv(
"QUDA_RESOURCE_PATH");
318 warningQuda(
"Environment variable QUDA_RESOURCE_PATH is not set.");
319 warningQuda(
"Caching of tuned parameters will be disabled.");
321 }
else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
322 warningQuda(
"The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
323 warningQuda(
"Caching of tuned parameters will be disabled.");
334 cache_path +=
"/tunecache.tsv";
335 cache_file.open(cache_path.c_str());
339 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
343 if (token.compare(
"tunecache"))
errorQuda(
"Bad format in %s", cache_path.c_str());
345 if (token.compare(
quda_version))
errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
348 if (token.compare(
gitversion))
errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
350 if (token.compare(
quda_version))
errorQuda(
"Cache file %s does not match current QUDA version. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
353 if (token.compare(
quda_hash))
errorQuda(
"Cache file %s does not match current QUDA build. \nPlease delete this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.", cache_path.c_str());
356 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
359 if (!cache_file.good())
errorQuda(
"Bad format in %s", cache_path.c_str());
373 warningQuda(
"Cache file not found. All kernels will be re-tuned (if tuning is enabled).");
392 std::string lock_path, cache_path;
393 std::ofstream cache_file;
410 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
411 if (lock_handle == -1) {
412 warningQuda(
"Unable to lock cache file. Tuned launch parameters will not be cached to disk. " 413 "If you are certain that no other instances of QUDA are accessing this filesystem, " 414 "please manually remove %s", lock_path.c_str());
417 char msg[] =
"If no instances of applications using QUDA are running,\n" 418 "this lock file shouldn't be here and is safe to delete.";
419 int stat =
write(lock_handle, msg,
sizeof(msg));
420 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
423 cache_file.open(cache_path.c_str());
426 printfQuda(
"Saving %d sets of cached parameters to %s\n", static_cast<int>(
tunecache.size()), cache_path.c_str());
436 cache_file <<
"\t" <<
quda_hash <<
"\t# Last updated " <<
ctime(&now) << std::endl;
437 cache_file << std::setw(16) <<
"volume" <<
"\tname\taux\tblock.x\tblock.y\tblock.z\tgrid.x\tgrid.y\tgrid.z\tshared_bytes\taux.x\taux.y\taux.z\taux.w\ttime\tcomment" << std::endl;
443 remove(lock_path.c_str());
476 std::string lock_path, profile_path, async_profile_path, trace_path;
477 std::ofstream profile_file, async_profile_file, trace_file;
488 lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
489 if (lock_handle == -1) {
490 warningQuda(
"Unable to lock profile file. Profile will not be saved to disk. " 491 "If you are certain that no other instances of QUDA are accessing this filesystem, " 492 "please manually remove %s", lock_path.c_str());
495 char msg[] =
"If no instances of applications using QUDA are running,\n" 496 "this lock file shouldn't be here and is safe to delete.";
497 int stat =
write(lock_handle, msg,
sizeof(msg));
498 if (stat == -1)
warningQuda(
"Unable to write to lock file for some bizarre reason");
501 static int count = 0;
503 char *profile_fname =
getenv(
"QUDA_PROFILE_OUTPUT_BASE");
505 if (!profile_fname) {
506 warningQuda(
"Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
508 async_profile_path =
resource_path +
"/profile_async_" + std::to_string(
count) +
".tsv";
511 profile_path =
resource_path +
"/" + profile_fname +
"_" + std::to_string(
count) +
".tsv";
512 async_profile_path =
resource_path +
"/" + profile_fname +
"_" + std::to_string(
count) +
"_async.tsv";
518 profile_file.open(profile_path.c_str());
519 async_profile_file.open(async_profile_path.c_str());
520 if (
traceEnabled()) trace_file.open(trace_path.c_str());
531 bool is_policy =
strcmp(
tmp,
"policy") == 0 ? true :
false;
532 if (
param.n_calls > 0 && !is_policy) n_entry++;
533 if (
param.n_calls > 0 && is_policy) n_policy++;
536 printfQuda(
"Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
537 printfQuda(
"Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
543 std::string Label = label.empty() ?
"profile" : label;
551 profile_file <<
"\t" <<
quda_hash <<
"\t# Last updated " <<
ctime(&now) << std::endl;
552 profile_file << std::setw(12) <<
"total time" <<
"\t" << std::setw(12) <<
"percentage" <<
"\t" << std::setw(12) <<
"calls" <<
"\t" << std::setw(12) <<
"time / call" <<
"\t" << std::setw(16) <<
"volume" <<
"\tname\taux\tcomment" << std::endl;
560 async_profile_file <<
"\t" <<
quda_hash <<
"\t# Last updated " <<
ctime(&now) << std::endl;
561 async_profile_file << std::setw(12) <<
"total time" <<
"\t" << std::setw(12) <<
"percentage" <<
"\t" << std::setw(12) <<
"calls" <<
"\t" << std::setw(12) <<
"time / call" <<
"\t" << std::setw(16) <<
"volume" <<
"\tname\taux\tcomment" << std::endl;
565 profile_file.close();
566 async_profile_file.close();
575 trace_file <<
"\t" <<
quda_hash <<
"\t# Last updated " <<
ctime(&now) << std::endl;
577 trace_file << std::setw(12) <<
"time\t" << std::setw(12) <<
"device-mem\t" << std::setw(12) <<
"pinned-mem\t";
578 trace_file << std::setw(12) <<
"mapped-mem\t" << std::setw(12) <<
"host-mem\t";
579 trace_file << std::setw(16) <<
"volume" <<
"\tname\taux" << std::endl;
588 remove(lock_path.c_str());
605 #ifdef PTHREADS // tuning should be performed serially 624 static const Tunable *active_tunable;
683 cudaError_t error = cudaSuccess;
685 float elapsed_time, best_time;
689 active_tunable = &tunable;
695 cudaEventCreate(&
start);
696 cudaEventCreate(&
end);
704 cudaDeviceSynchronize();
710 printfQuda(
"About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d) shared_bytes=%d aux=(%d,%d,%d)\n",
717 cudaEventRecord(
start, 0);
721 cudaEventRecord(
end, 0);
722 cudaEventSynchronize(
end);
723 cudaEventElapsedTime(&elapsed_time,
start,
end);
724 cudaDeviceSynchronize();
725 error = cudaGetLastError();
728 cudaDeviceSynchronize();
729 cudaError_t error = cudaGetLastError();
730 if (error != cudaSuccess)
errorQuda(
"Failed to clear error state %s\n", cudaGetErrorString(error));
734 if ((elapsed_time < best_time) && (error == cudaSuccess)) {
735 best_time = elapsed_time;
739 if (error == cudaSuccess)
748 if (best_time == FLT_MAX) {
758 best_param.
time = best_time;
760 cudaEventDestroy(
start);
761 cudaEventDestroy(
end);
782 }
else if (&tunable != active_tunable) {
783 errorQuda(
"Unexpected call to tuneLaunch() in %s::apply()",
typeid(tunable).name());
long device_allocated_peak()
int snprintf(char *__str, size_t __size, const char *__format,...) __attribute__((__format__(__printf__
std::map< TuneKey, TuneParam > map
virtual int tuningIter() const
void disableProfileCount()
QudaVerbosity getVerbosity()
cudaColorSpinorField * tmp
#define QUDA_VERSION_MINOR
char * ctime(const time_t *)
static void broadcastTuneCache()
virtual std::string paramString(const TuneParam ¶m) const
quda::TuneKey getLastTuneKey()
static std::list< TraceKey > trace_list
virtual TuneKey tuneKey() const =0
static size_t initial_cache_size
void enableProfileCount()
static const std::string quda_hash
TraceKey(const TuneKey &key, float time)
static bool profile_count
int strcmp(const char *__s1, const char *__s2)
static __inline__ T * entry
static void deserializeTuneCache(std::istream &in)
void flushProfile()
Flush profile contents, setting all counts to zero.
static bool policy_tuning
static const std::string quda_version
bool operator()(const T &lhs, const T &rhs)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define QUDA_VERSION_SUBMINOR
bool commGlobalReduction()
void saveProfile(const std::string label="")
Save profile to disk.
long host_allocated_peak()
static void serializeTuneCache(std::ostream &out)
long mapped_allocated_peak()
static std::string resource_path
void setPolicyTuning(bool)
bool activeTuning()
query if tuning is in progress
TraceKey(const TraceKey &trace)
TraceKey & operator=(const TraceKey &trace)
std::map< TuneKey, TuneParam > map
char * strncpy(char *__dst, const char *__src, size_t __n)
static void serializeTrace(std::ostream &out)
void comm_broadcast(void *data, size_t nbytes)
cpuColorSpinorField * out
long pinned_allocated_peak()
Main header file for the QUDA library.
void checkLaunchParam(TuneParam ¶m)
virtual void initTuneParam(TuneParam ¶m) const
enum QudaVerbosity_s QudaVerbosity
static void serializeProfile(std::ostream &out, std::ostream &async_out)
static TimeProfile launchTimer("tuneLaunch")
const map & getTuneCache()
static const int volume_n
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
char * getenv(const char *)
#define QUDA_VERSION_MAJOR
virtual std::string perfString(float time) const
cudaEvent_t cudaEvent_t end
virtual void apply(const cudaStream_t &stream)=0
virtual void defaultTuneParam(TuneParam ¶m) const
virtual bool advanceTuneParam(TuneParam ¶m) const
ssize_t getline(char **__linep, size_t *__linecapp, FILE *__stream) __attribute__((availability(macosx