inline TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0), aux(), time(FLT_MAX), n_calls(0) {
  aux = make_int4(1, 1, 1, 1);
}

TuneParam(const TuneParam &param) :
  block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), aux(param.aux),
  comment(param.comment), time(param.time), n_calls(param.n_calls) { }
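// The constructors above give TuneParam its defaults: a 32x1x1 block, a 1x1x1
// grid, no dynamic shared memory, and aux = (1,1,1,1); the remaining members
// (comment, time, n_calls) are bookkeeping carried through the tuning cache.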
output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
virtual long long flops() const = 0;
virtual long long bytes() const { return 0; }
virtual unsigned int sharedBytesPerThread() const = 0;
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const = 0;
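// flops() and bytes() feed the performance report (see perfString below); the
// two sharedBytes*() hooks bound the dynamic shared-memory request while
// tuning: the per-thread figure scales with the block size, while the
// per-block figure is a fixed overhead for a given TuneParam.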
// from advanceGridDim(TuneParam &param): step the grid size
const unsigned int max_blocks = maxGridSize();
const int step = gridStep();
param.grid.x += step;
if (param.grid.x > max_blocks) {
  param.grid.x = minGridSize();
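// The grid advance wraps: once grid.x has stepped past maxGridSize() it is
// reset to minGridSize(), and the full routine reports the wrap so the next
// tuning parameter can advance instead.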
// from resetBlockDim(TuneParam &param): restart the block-size sweep
const unsigned int max_threads = maxBlockSize(param);
const unsigned int max_blocks = deviceProp.maxGridSize[0];
const int step = blockStep();

// when the grid dimension is tuned, restart from the smallest block
param.block.x = step;

// otherwise block.x alone must cover minThreads() within max_blocks blocks,
// rounded up to a multiple of the block step
param.block.x = (minThreads() + max_blocks - 1) / max_blocks;
param.block.x = ((param.block.x + step - 1) / step) * step;
if (param.block.x > max_threads && param.block.y == 1 && param.block.z == 1)
  errorQuda("Local lattice volume is too large for device");
// from advanceBlockDim(TuneParam &param)
const unsigned int max_threads = maxBlockSize(param);
const unsigned int max_shared = maxSharedBytesPerBlock();

param.block.x += blockStep();

if (param.block.x > max_threads || sharedBytesPerThread() * nthreads > max_shared
    || sharedBytesPerBlock(param) > max_shared) {
  resetBlockDim(param);
}

param.grid = dim3((minThreads() + param.block.x - 1) / param.block.x, 1, 1);
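// block.x grows by blockStep() until the thread limit or either shared-memory
// bound trips, at which point resetBlockDim() rewinds it; afterwards grid.x
// is recomputed so that grid.x * block.x still covers minThreads() threads.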
// from maxBlocksPerSM(): fallback for unrecognized architectures
warningQuda("Unknown SM architecture %d.%d - assuming limit of 32 blocks per SM\n",
            deviceProp.major, deviceProp.minor);
// from setMaxDynamicSharedBytesPerBlock(F *func)
#if CUDA_VERSION >= 9000
  qudaFuncSetAttribute((const void *)func, cudaFuncAttributePreferredSharedMemoryCarveout,
                       (int)cudaSharedmemCarveoutMaxShared);
  qudaFuncSetAttribute((const void *)func, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       maxDynamicSharedBytesPerBlock());
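// On CUDA 9.0+ these two attributes opt the kernel into the largest
// shared-memory carveout and raise its dynamic shared-memory cap to
// maxDynamicSharedBytesPerBlock(), letting tuned launches exceed the default
// 48 KiB per-block limit.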
case 6: return 48 * 1024;

// inner switch on deviceProp.minor (major == 7)
case 0: return 96 * 1024;
case 2: return 96 * 1024;
case 5: return 64 * 1024;

warningQuda("Unknown SM architecture %d.%d - assuming limit of 48 KiB per SM\n",
            deviceProp.major, deviceProp.minor);
// from advanceSharedBytes(TuneParam &param)
if (tuneSharedBytes()) {
  const int max_shared = maxSharedBytesPerBlock();
  const int max_blocks_per_sm = std::min(
      deviceProp.maxThreadsPerMultiProcessor / (param.block.x * param.block.y * param.block.z),
      maxBlocksPerSM());

  if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
  param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);

  advanceBlockDim(next);

  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(next) ?
      sharedBytesPerThread() * nthreads :
      sharedBytesPerBlock(next);
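// Note the over-allocation: requesting max_shared / blocks_per_sm + 1 bytes
// means blocks_per_sm blocks no longer fit on one SM, so each advance lowers
// occupancy by one resident block; the sweep is effectively over occupancy
// levels rather than raw byte counts.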
// from writeAuxString(const char *format, ...): begin varargs processing
va_start(arguments, format);
Tunable() : jitify_error(CUDA_SUCCESS) { aux[0] = '\0'; }
virtual TuneKey tuneKey() const = 0;
virtual void apply(const cudaStream_t &stream) = 0;
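// A minimal sketch of a concrete Tunable, assuming a hypothetical __global__
// kernel my_kernel(float *out, const float *in, int n) that writes
// out[i] = 2.0f * in[i], and the volume string "my_field" (illustrative
// names, not part of this header; tuneLaunch, getTuning and getVerbosity are
// taken from the wider QUDA API):

class MyScale : public Tunable {
  float *out;
  const float *in;
  const int size;

  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
  unsigned int minThreads() const { return size; } // one thread per element
  bool tuneGridDim() const { return false; }       // grid derived from block

public:
  MyScale(float *out, const float *in, int size) : out(out), in(in), size(size)
  {
    writeAuxString("size=%d", size); // aux string keys the tune-cache entry
  }

  long long flops() const { return size; }                       // one multiply per element
  long long bytes() const { return 2ll * size * sizeof(float); } // read in, write out

  TuneKey tuneKey() const { return TuneKey("my_field", typeid(*this).name(), aux); }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    my_kernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(out, in, size);
  }
};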
// from paramString(const TuneParam &param)
std::stringstream ps;

// from perfString(float time): format the achieved throughput
std::stringstream ss;
ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
ss << gbytes << " GB/s";
// from initTuneParam(TuneParam &param)
const unsigned int max_threads = deviceProp.maxThreadsDim[0];
const unsigned int max_blocks = deviceProp.maxGridSize[0];
const int min_grid_size = minGridSize();
const int min_block_size = blockMin();

param.block = dim3(min_block_size, 1, 1);

param.grid = dim3(min_grid_size, 1, 1);

param.block = dim3((minThreads() + max_blocks - 1) / max_blocks, 1, 1);
param.block.x = ((param.block.x + min_block_size - 1) / min_block_size) * min_block_size;
if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");

param.grid = dim3((minThreads() + param.block.x - 1) / param.block.x, 1, 1);

param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
    sharedBytesPerThread() * nthreads : sharedBytesPerBlock(param);
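// initTuneParam seeds the search: tuned-grid kernels start from the smallest
// legal block and grid, fixed-grid kernels size block.x so that minThreads()
// threads fit in at most max_blocks blocks, and the shared-memory request
// starts at the larger of the per-thread total and the per-block overhead.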
// defaultTuneParam falls back to a reasonable untuned configuration
initTuneParam(param);
if (tuneGridDim()) param.grid = dim3(128, 1, 1);

// advanceTuneParam: try each class of parameter in turn
return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param) || advanceAux(param);
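// The short-circuit || chain advances the parameters like an odometer: shared
// bytes move fastest, then block, then grid, then aux; tuning is exhausted
// once every parameter class has wrapped around and returned false.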
// from checkLaunchParam(TuneParam &param): validate the requested launch
// configuration against the device limits (trailing arguments elided in this
// excerpt)
errorQuda("Requested block size %dx%dx%d=%d greater than hardware limit %d",
errorQuda("Requested X-dimension block size %d greater than hardware limit %d",
errorQuda("Requested Y-dimension block size %d greater than hardware limit %d",
errorQuda("Requested Z-dimension block size %d greater than hardware limit %d",
errorQuda("Requested X-dimension grid size %d greater than hardware limit %d",
errorQuda("Requested Y-dimension grid size %d greater than hardware limit %d",
errorQuda("Requested Z-dimension grid size %d greater than hardware limit %d",
TunableVectorY(unsigned int vector_length_y) :
  vector_length_y(vector_length_y), step_y(1), tune_block_x(true) { }
// from TunableVectorY::advanceBlockDim: restore the y dimensions the base
// class may have reset, then try to grow block.y
param.block.y = block.y;
param.grid.y = grid.y;

if (param.block.y < vector_length_y && param.block.y < (unsigned int)deviceProp.maxThreadsDim[1] &&
    param.block.x * param.block.z * (param.block.y + step_y) <= (unsigned int)deviceProp.maxThreadsPerBlock) {
  param.block.y += step_y;
  param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
} else {
  param.block.y = step_y;
  param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
}
// initTuneParam / defaultTuneParam: start from the smallest y block
param.block.y = step_y;
param.grid.y = (vector_length_y + step_y - 1) / step_y;

param.block.y = step_y;
param.grid.y = (vector_length_y + step_y - 1) / step_y;
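// TunableVectorY maps a second problem dimension of extent vector_length_y
// onto block.y/grid.y: block.y grows by step_y while it fits both the vector
// length and the device limits, and grid.y is always recomputed as
// ceil(vector_length_y / block.y) so the full extent stays covered.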
TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z) :
  TunableVectorY(vector_length_y), vector_length_z(vector_length_z),
  step_z(1), tune_block_y(true) { }
// from TunableVectorYZ::advanceBlockDim: the same scheme, one dimension up
param.block.z = block.z;
param.grid.z = grid.z;

if (param.block.z < vector_length_z && param.block.z < (unsigned int)deviceProp.maxThreadsDim[2] &&
    param.block.x * param.block.y * (param.block.z + step_z) <= (unsigned int)deviceProp.maxThreadsPerBlock) {
  param.block.z += step_z;
  param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
} else {
  param.block.z = step_z;
  param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
}
// initTuneParam / defaultTuneParam: start from the smallest z block
param.block.z = step_z;
param.grid.z = (vector_length_z + step_z - 1) / step_z;

param.block.z = step_z;
param.grid.z = (vector_length_z + step_z - 1) / step_z;
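// TunableVectorYZ extends the same idea to a third dimension, so a kernel can
// fold two extra problem extents (vector_length_y, vector_length_z) into the
// y and z block/grid dimensions while x is tuned as usual.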
void postTrace_(const char *func, const char *file, int line);
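// postTrace_ posts an event in the trace, recording where it was posted.
// Typical use is via the postTrace() macro below, dropped in after an
// operation one wants visible in the trace, e.g. (sketch):
//
//   tunable.apply(stream);
//   postTrace();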
#define postTrace() quda::postTrace_(__func__, quda::file_name(__FILE__), __LINE__)

#endif // _TUNE_QUDA_H