21 TuneKey(std::string v, std::string n, std::string a=std::string(
"type=default"))
71 virtual long long flops()
const = 0;
72 virtual long long bytes()
const {
return 0; }
82 const unsigned int max_blocks = 256;
85 if (param.
grid.x > max_blocks) {
95 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
96 const unsigned int max_shared = 16384;
98 param.
block.x += step;
100 param.
block.x = step;
117 const int max_shared = 16384;
118 const int max_blocks_per_sm = 8;
120 if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
138 virtual void apply(
const cudaStream_t &
stream) = 0;
145 std::stringstream ps;
146 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
147 ps <<
"grid=(" << param.
grid.x <<
"," << param.
grid.y <<
"," << param.
grid.z <<
"), ";
154 float gflops =
flops() / (1e9 * time);
155 float gbytes =
bytes() / (1e9 * time);
156 std::stringstream ss;
157 ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops <<
" Gflop/s, ";
158 ss << gbytes <<
" GB/s";
164 const int min_block_size =
deviceProp.warpSize;
165 param.
block = dim3(min_block_size,1,1);
166 param.
grid = dim3(1,1,1);
175 param.
grid = dim3(128,1,1);
190 errorQuda(
"Requested X-dimension block size %d greater than hardware limit %d",
194 errorQuda(
"Requested Y-dimension block size %d greater than hardware limit %d",
198 errorQuda(
"Requested Z-dimension block size %d greater than hardware limit %d",
202 errorQuda(
"Requested X-dimension grid size %d greater than hardware limit %d",
207 errorQuda(
"Requested Y-dimension grid size %d greater than hardware limit %d",
211 errorQuda(
"Requested Z-dimension grid size %d greater than hardware limit %d",
223 #endif // _TUNE_QUDA_H