// NOTE(review): stub so comparisons against CUDA_SUCCESS (e.g. jitifyError())
// compile when the driver API / jitify path is absent. The real driver-API
// CUDA_SUCCESS is enum value 0, not `true` — confirm this macro is only used
// in boolean equality contexts, never stored into a CUresult.
20 #define CUDA_SUCCESS true
47 aux = make_int4(1,1,1,1);
76 #ifndef __CUDACC_RTC__
78 output <<
"block=(" <<
param.block.x <<
"," <<
param.block.y <<
"," <<
param.block.z <<
"), ";
79 output <<
"grid=(" <<
param.grid.x <<
"," <<
param.grid.y <<
"," <<
param.grid.z <<
"), ";
80 output <<
"shared_bytes=" <<
param.shared_bytes;
81 output <<
", aux=(" <<
param.aux.x <<
"," <<
param.aux.y <<
"," <<
param.aux.z <<
"," <<
param.aux.w <<
")";
87 #ifndef __CUDACC_RTC__
// Floating-point operation count of the kernel being tuned; feeds the
// Gflop/s figure in perfString (gflops = flops() / (1e9 * time)).
// Pure virtual — every Tunable subclass must supply an estimate.
98 virtual long long flops()
const = 0;
// Bytes moved by the kernel, used for the GB/s figure in perfString;
// defaults to 0 when a subclass provides no estimate.
99 virtual long long bytes()
const {
return 0; }
117 param.grid.x += step;
149 const auto max_blocks =
deviceProp.maxGridSize[0];
153 param.block.x = ((
param.block.x+step-1)/step)*step;
154 if (
param.block.x > max_threads &&
param.block.y == 1 &&
param.block.z == 1)
155 errorQuda(
"Local lattice volume is too large for device");
188 #if CUDA_VERSION >= 11000
189 static int max_blocks_per_sm = 0;
190 if (!max_blocks_per_sm)
191 cudaDeviceGetAttribute(&max_blocks_per_sm, cudaDevAttrMaxBlocksPerMultiprocessor,
comm_gpuid());
192 return max_blocks_per_sm;
209 warningQuda(
"Unknown SM architecture %d.%d - assuming limit of 32 blocks per SM\n",
247 int blocks_per_sm = max_shared / (
param.shared_bytes ?
param.shared_bytes : 1);
248 if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
249 param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);
251 if (
param.shared_bytes > max_shared) {
273 #ifndef __CUDACC_RTC__
275 va_start(arguments, format);
291 #ifndef __CUDACC_RTC__
313 #ifndef __CUDACC_RTC__
316 std::stringstream ps;
323 float gflops =
flops() / (1e9 * time);
324 float gbytes =
bytes() / (1e9 * time);
325 std::stringstream ss;
326 ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops <<
" Gflop/s, ";
327 ss << gbytes <<
" GB/s";
334 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
335 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
337 const int min_block_size =
blockMin();
340 param.block = dim3(min_block_size,1,1);
342 param.grid = dim3(min_grid_size,1,1);
346 param.block.x = ((
param.block.x+min_block_size-1) / min_block_size) * min_block_size;
347 if (
param.block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
375 errorQuda(
"Requested block size %dx%dx%d=%d greater than hardware limit %d",
379 errorQuda(
"Requested X-dimension block size %d greater than hardware limit %d",
param.block.x,
383 errorQuda(
"Requested Y-dimension block size %d greater than hardware limit %d",
param.block.y,
387 errorQuda(
"Requested Z-dimension block size %d greater than hardware limit %d",
param.block.z,
391 errorQuda(
"Requested X-dimension grid size %d greater than hardware limit %d",
param.grid.x,
395 errorQuda(
"Requested Y-dimension grid size %d greater than hardware limit %d",
param.grid.y,
399 errorQuda(
"Requested Z-dimension grid size %d greater than hardware limit %d",
param.grid.z,
478 dim3 block =
param.block;
479 dim3 grid =
param.grid;
481 param.block.y = block.y;
482 param.grid.y = grid.y;
535 dim3 block =
param.block;
536 dim3 grid =
param.grid;
538 param.block.z = block.z;
539 param.grid.z = grid.z;
// Post an event in the trace, recording where it was posted. Normally
// invoked via the postTrace() macro, which supplies the caller's
// __func__, __FILE__, and __LINE__.
603 void postTrace_(
const char *func,
const char *file,
int line);
638 #ifdef __CUDACC_RTC__
// Convenience wrapper capturing the current function, (basename of the) file,
// and line number for quda::postTrace_.
643 #define postTrace() quda::postTrace_(__func__, quda::file_name(__FILE__), __LINE__)
unsigned int maxDynamicSharedBytesPerBlock() const
Returns the maximum dynamic shared memory per block.
virtual long long flops() const =0
virtual bool tuneSharedBytes() const
virtual int gridStep() const
gridStep sets the step size when iterating the grid size in advanceGridDim.
virtual long long bytes() const
virtual std::string perfString(float time) const
virtual std::string paramString(const TuneParam &param) const
CUresult jitifyError() const
bool tuned()
Whether the present instance has already been tuned or not.
virtual bool tuneGridDim() const
virtual unsigned int sharedBytesPerThread() const =0
virtual bool advanceTuneParam(TuneParam &param) const
virtual bool advanceGridDim(TuneParam &param) const
virtual unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
virtual void initTuneParam(TuneParam &param) const
virtual bool tuneAuxDim() const
virtual int blockMin() const
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const =0
virtual unsigned int maxBlockSize(const TuneParam &param) const
virtual unsigned int minGridSize() const
virtual TuneKey tuneKey() const =0
virtual bool advanceBlockDim(TuneParam &param) const
virtual int blockStep() const
virtual void apply(const qudaStream_t &stream)=0
virtual bool advanceSharedBytes(TuneParam &param) const
void checkLaunchParam(TuneParam &param)
unsigned int maxBlocksPerSM() const
Returns the maximum number of simultaneously resident blocks per SM. We can directly query this of CU...
virtual void resetBlockDim(TuneParam &param) const
virtual unsigned int maxGridSize() const
virtual void defaultTuneParam(TuneParam &param) const
int writeAuxString(const char *format,...)
virtual int tuningIter() const
virtual unsigned int minThreads() const
virtual bool advanceAux(TuneParam &param) const
bool advanceBlockDim(TuneParam &param) const
unsigned int minGridSize() const
unsigned int maxBlockSize(const TuneParam &param) const
void defaultTuneParam(TuneParam &param) const
bool tuneGridDim() const final
unsigned int sharedBytesPerBlock(const TuneParam &param) const
int gridStep() const
gridStep sets the step size when iterating the grid size in advanceGridDim.
unsigned int sharedBytesPerThread() const
void initTuneParam(TuneParam &param) const
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const
virtual unsigned int sharedBytesPerThread() const
void defaultTuneParam(TuneParam &param) const
bool advanceBlockDim(TuneParam &param) const
void initTuneParam(TuneParam &param) const
void resizeStep(int y) const
unsigned int vector_length_y
TunableVectorY(unsigned int vector_length_y)
void resizeVector(int y) const
void initTuneParam(TuneParam &param) const
void resizeVector(int y, int z) const
void resizeStep(int y, int z) const
bool advanceBlockDim(TuneParam &param) const
void defaultTuneParam(TuneParam &param) const
TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
TuneParam(const TuneParam &param)
bool set_max_shared_bytes
TuneParam & operator=(const TuneParam &param)
friend std::ostream & operator<<(std::ostream &output, const TuneParam &param)
enum QudaVerbosity_s QudaVerbosity
size_t max_dynamic_shared_memory()
Returns the maximum dynamic shared memory per block.
void disableProfileCount()
Disable the profile kernel counting.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
void saveTuneCache(bool error=false)
bool policyTuning()
Query whether we are currently tuning a policy.
void setUberTuning(bool)
Enable / disable whether we are tuning an uber kernel.
void setPolicyTuning(bool)
Enable / disable whether we are tuning a policy.
bool activeTuning()
query if tuning is in progress
void postTrace_(const char *func, const char *file, int line)
Post an event in the trace, recording where it was posted.
void flushProfile()
Flush profile contents, setting all counts to zero.
bool use_managed_memory()
void enableProfileCount()
Enable the profile kernel counting.
void saveProfile(const std::string label="")
Save profile to disk.
bool uberTuning()
Query whether we are tuning an uber kernel.
cudaDeviceProp deviceProp
cudaStream_t qudaStream_t
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...