30 aux = make_int4(1,1,1,1);
50 output <<
"block = (" <<
param.block.x <<
", " <<
param.block.y <<
", " <<
param.block.z <<
")" << std::endl;
51 output <<
"grid = (" <<
param.grid.x <<
", " <<
param.grid.y <<
", " <<
param.grid.z <<
")" << std::endl;
52 output <<
"shared_bytes = " <<
param.shared_bytes << std::endl;
53 output <<
"aux = (" <<
param.aux.x <<
", " <<
param.aux.y <<
", " <<
param.aux.z <<
", " <<
param.aux.w <<
")" << std::endl;
54 output <<
param.comment << std::endl;
63 virtual long long flops()
const = 0;
64 virtual long long bytes()
const {
return 0; }
84 if (
param.grid.x > max_blocks) {
105 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
106 const unsigned int max_shared =
deviceProp.sharedMemPerBlock;
110 param.block.x += step;
115 param.block.x = step;
119 param.block.x = ((
param.block.x+step-1)/step)*step;
120 if(
param.block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
166 const int max_shared =
deviceProp.sharedMemPerBlock;
168 int blocks_per_sm = max_shared / (
param.shared_bytes ?
param.shared_bytes : 1);
169 if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
170 param.shared_bytes = (blocks_per_sm > 0 ? max_shared / blocks_per_sm + 1 : max_shared + 1);
172 if (
param.shared_bytes > max_shared) {
193 va_start(arguments, format);
203 virtual void apply(
const cudaStream_t &
stream) = 0;
210 std::stringstream ps;
211 ps <<
"block=(" <<
param.block.x <<
"," <<
param.block.y <<
"," <<
param.block.z <<
"), ";
213 ps <<
"shared=" <<
param.shared_bytes <<
", ";
224 std::stringstream ss;
225 ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops <<
" Gflop/s, ";
226 ss << gbytes <<
" GB/s";
232 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
233 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
235 const int min_block_size =
blockMin();
238 param.block = dim3(min_block_size,1,1);
240 param.grid = dim3(min_grid_size,1,1);
244 param.block.x = ((
param.block.x+min_block_size-1) / min_block_size) * min_block_size;
245 if (
param.block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
272 errorQuda(
"Requested X-dimension block size %d greater than hardware limit %d",
276 errorQuda(
"Requested Y-dimension block size %d greater than hardware limit %d",
280 errorQuda(
"Requested Z-dimension block size %d greater than hardware limit %d",
284 errorQuda(
"Requested X-dimension grid size %d greater than hardware limit %d",
289 errorQuda(
"Requested Y-dimension grid size %d greater than hardware limit %d",
293 errorQuda(
"Requested Z-dimension grid size %d greater than hardware limit %d",
360 dim3 grid =
param.grid;
363 param.grid.y = grid.y;
411 dim3 grid =
param.grid;
414 param.grid.z = grid.z;
#endif // _TUNE_QUDA_H
virtual unsigned int maxBlockSize() const
void resizeVector(int y, int z)
cudaDeviceProp deviceProp
virtual int tuningIter() const
virtual bool advanceSharedBytes(TuneParam ¶m) const
friend std::ostream & operator<<(std::ostream &output, const TuneParam ¶m)
void initTuneParam(TuneParam ¶m) const
virtual std::string paramString(const TuneParam ¶m) const
TunableVectorY(unsigned int vector_length_y)
bool advanceBlockDim(TuneParam ¶m) const
virtual TuneKey tuneKey() const =0
virtual bool advanceGridDim(TuneParam ¶m) const
virtual bool advanceAux(TuneParam ¶m) const
virtual unsigned int sharedBytesPerThread() const
TuneParam(const TuneParam ¶m)
virtual unsigned int maxGridSize() const
virtual int blockMin() const
unsigned int vector_length_y
virtual unsigned int sharedBytesPerThread() const =0
virtual unsigned int sharedBytesPerBlock(const TuneParam ¶m) const =0
void flushProfile()
Flush profile contents, setting all counts to zero.
unsigned int sharedBytesPerThread() const
virtual long long bytes() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void defaultTuneParam(TuneParam ¶m) const
void saveProfile(const std::string label="")
Save profile to disk.
TuneParam & operator=(const TuneParam ¶m)
TunableVectorYZ(unsigned int vector_length_y, unsigned int vector_length_z)
int vsnprintf(char *__str, size_t __size, const char *__format, va_list) __attribute__((__format__(__printf__, 3, 0)))
void initTuneParam(TuneParam ¶m) const
bool activeTuning()
Query whether tuning is currently in progress.
virtual bool tuneAuxDim() const
unsigned int maxBlocksPerSM() const
For some reason this can't be queried from the device properties, so we set it here. Based on Table 14 of the CUDA Programming Guide 9.0 (Technical Specifications per Compute Capability).
virtual bool tuneSharedBytes() const
virtual unsigned int minGridSize() const
bool advanceBlockDim(TuneParam ¶m) const
void defaultTuneParam(TuneParam ¶m) const
int writeAuxString(const char *format,...)
void initTuneParam(TuneParam ¶m) const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
virtual int blockStep() const
virtual unsigned int minThreads() const
void checkLaunchParam(TuneParam ¶m)
virtual void initTuneParam(TuneParam ¶m) const
enum QudaVerbosity_s QudaVerbosity
unsigned int maxBlockSize() const
virtual bool advanceBlockDim(TuneParam ¶m) const
virtual unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
virtual bool tuneGridDim() const
virtual std::string perfString(float time) const
void defaultTuneParam(TuneParam ¶m) const
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
bool advanceBlockDim(TuneParam ¶m) const
virtual void defaultTuneParam(TuneParam ¶m) const
virtual bool advanceTuneParam(TuneParam ¶m) const