43 virtual long long flops()
const = 0;
44 virtual long long bytes()
const {
return 0; }
60 const unsigned int max_blocks = 256;
63 if (param.
grid.x > max_blocks) {
76 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
77 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
78 const unsigned int max_shared =
deviceProp.sharedMemPerBlock;
82 param.
block.x += step;
90 param.
block.x = ((param.
block.x+step-1)/step)*step;
91 if(param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
116 const int max_shared =
deviceProp.sharedMemPerBlock;
117 const int max_blocks_per_sm = 8;
119 if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
140 va_start(arguments, format);
144 if (n < 0 || n >= 512)
errorQuda(
"Error writing auxiliary string");
151 virtual void apply(
const cudaStream_t &
stream) = 0;
158 std::stringstream ps;
159 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
160 ps <<
"grid=(" << param.
grid.x <<
"," << param.
grid.y <<
"," << param.
grid.z <<
"), ";
167 float gflops =
flops() / (1e9 * time);
168 float gbytes =
bytes() / (1e9 * time);
169 std::stringstream ss;
170 ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops <<
" Gflop/s, ";
171 ss << gbytes <<
" GB/s";
177 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
178 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
179 const int min_block_size =
deviceProp.warpSize;
182 param.
block = dim3(min_block_size,1,1);
184 param.
grid = dim3(1,1,1);
189 param.
block.x = ((param.
block.x+warp-1) / warp) * warp;
190 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
217 errorQuda(
"Requested X-dimension block size %d greater than hardware limit %d",
221 errorQuda(
"Requested Y-dimension block size %d greater than hardware limit %d",
225 errorQuda(
"Requested Z-dimension block size %d greater than hardware limit %d",
229 errorQuda(
"Requested X-dimension grid size %d greater than hardware limit %d",
234 errorQuda(
"Requested Y-dimension grid size %d greater than hardware limit %d",
238 errorQuda(
"Requested Z-dimension grid size %d greater than hardware limit %d",
250 #endif // _TUNE_QUDA_H
cudaDeviceProp deviceProp
virtual void initTuneParam(TuneParam &param) const
virtual TuneKey tuneKey() const =0
virtual bool tuneGridDim() const
virtual unsigned int minThreads() const
TuneParam(const TuneParam &param)
virtual bool advanceSharedBytes(TuneParam &param) const
virtual std::string paramString(const TuneParam &param) const
void writeAuxString(const char *format,...)
virtual unsigned int sharedBytesPerThread() const =0
virtual unsigned int sharedBytesPerBlock(const TuneParam &param) const =0
virtual bool advanceBlockDim(TuneParam &param) const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
virtual std::string perfString(float time) const
TuneParam & operator=(const TuneParam &param)
virtual void defaultTuneParam(TuneParam &param) const
virtual int tuningIter() const
void loadTuneCache(QudaVerbosity verbosity)
virtual long long bytes() const
void checkLaunchParam(TuneParam &param)
virtual bool advanceTuneParam(TuneParam &param) const
enum QudaVerbosity_s QudaVerbosity
virtual bool tuneSharedBytes() const
void saveTuneCache(QudaVerbosity verbosity)
virtual long long flops() const =0
virtual void apply(const cudaStream_t &stream)=0
virtual bool advanceGridDim(TuneParam &param) const