QUDA v0.4.0
A library for QCD on GPUs
tune_quda.h
#ifndef _TUNE_QUDA_H
#define _TUNE_QUDA_H

#include <quda_internal.h>
#include <dirac_quda.h>

#include <string>
#include <iostream>
#include <iomanip>

class TuneKey {

 public:
  std::string volume;
  std::string name;
  std::string aux;

  TuneKey() { }
  TuneKey(std::string v, std::string n, std::string a=std::string())
    : volume(v), name(n), aux(a) { }
  TuneKey(const TuneKey &key)
    : volume(key.volume), name(key.name), aux(key.aux) { }

  TuneKey& operator=(const TuneKey &key) {
    if (&key != this) {
      volume = key.volume;
      name = key.name;
      aux = key.aux;
    }
    return *this;
  }

  bool operator<(const TuneKey &other) const {
    return (volume < other.volume) ||
      ((volume == other.volume) && (name < other.name)) ||
      ((volume == other.volume) && (name == other.name) && (aux < other.aux));
  }

};


class TuneParam {

 public:
  dim3 block;
  dim3 grid;
  int shared_bytes;
  std::string comment;

  TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0) { }
  TuneParam(const TuneParam &param)
    : block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), comment(param.comment) { }
  TuneParam& operator=(const TuneParam &param) {
    if (&param != this) {
      block = param.block;
      grid = param.grid;
      shared_bytes = param.shared_bytes;
      comment = param.comment;
    }
    return *this;
  }

};


class Tunable {

 protected:
  virtual long long flops() const { return 0; } // FIXME: make pure virtual
  virtual long long bytes() const { return 0; } // FIXME

  // the minimum number of shared bytes per thread
  virtual int sharedBytesPerThread() const = 0;

  // the minimum number of shared bytes per thread block
  virtual int sharedBytesPerBlock() const = 0;

  virtual bool advanceGridDim(TuneParam &param) const
  {
    const unsigned int max_blocks = 256; // FIXME: set a reasonable value for blas currently
    const int step = 1;
    param.grid.x += step;
    if (param.grid.x > max_blocks) {
      param.grid.x = step;
      return false;
    } else {
      return true;
    }
  }

  virtual bool advanceBlockDim(TuneParam &param) const
  {
    const unsigned int max_threads = 512;  // FIXME: use deviceProp.maxThreadsDim[0];
    const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
    const int step = 32;                   // FIXME: use deviceProp.warpSize;
    param.block.x += step;
    if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
      param.block.x = step;
      return false;
    } else {
      return true;
    }
  }

  // Increase the dynamic shared memory request so that one fewer thread block is
  // resident per multiprocessor; once the request would exceed the per-block limit,
  // reset it to the minimum required by the next block dimension and return false.
  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    const int max_shared = 16384;     // FIXME: use deviceProp.sharedMemPerBlock;
    const int max_blocks_per_sm = 16; // FIXME: derive from deviceProp
    int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
    if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
    param.shared_bytes = max_shared / blocks_per_sm + 1;
    if (param.shared_bytes > max_shared) {
      TuneParam next(param);
      advanceBlockDim(next); // to get next blockDim
      int nthreads = next.block.x * next.block.y * next.block.z;
      param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
        sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
      return false;
    } else {
      return true;
    }
  }

 public:
  Tunable() { }
  virtual ~Tunable() { }
  virtual TuneKey tuneKey() const = 0;
  virtual void apply(const cudaStream_t &stream) = 0;
  virtual void preTune() { }
  virtual void postTune() { }
  virtual int tuningIter() const { return 1; }

  virtual std::string paramString(const TuneParam &param) const
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }

  virtual std::string perfString(float time) const
  {
    float gflops = flops() / (1e9 * time);
    float gbytes = bytes() / (1e9 * time);
    std::stringstream ss;
    ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
    ss << gbytes << " GB/s";
    return ss.str();
  }

  virtual void initTuneParam(TuneParam &param) const
  {
    const int min_block_size = 32;
    param.block = dim3(min_block_size,1,1);
    param.grid = dim3(1,1,1);
    param.shared_bytes = sharedBytesPerThread()*min_block_size > sharedBytesPerBlock() ?
      sharedBytesPerThread()*min_block_size : sharedBytesPerBlock();
  }

  // default launch configuration: the initial parameters but with a fixed grid of 128 blocks
  virtual void defaultTuneParam(TuneParam &param) const
  {
    initTuneParam(param);
    param.grid = dim3(128,1,1);
  }

  virtual bool advanceTuneParam(TuneParam &param) const
  {
    return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param);
  }

};

void loadTuneCache(QudaVerbosity verbosity);
void saveTuneCache(QudaVerbosity verbosity);
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);

#endif // _TUNE_QUDA_H
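
The header above only declares the autotuning interface. As an illustration, the sketch below shows one way a kernel wrapper might plug into it. This is not part of tune_quda.h: the kernel myKernel, its arguments, and the flop/byte counts are hypothetical placeholders, and the sketch assumes the usual QUDA idiom in which apply() fetches its launch configuration from tuneLaunch(), which during a tuning sweep returns the candidate parameters currently being trialled.

// Illustrative sketch only -- not part of tune_quda.h.
// myKernel and its arguments are hypothetical placeholders.
#include <tune_quda.h>
#include <sstream>

// trivial placeholder kernel so the wrapper has something to launch
__global__ void myKernel(float *arg, int length)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < length) arg[i] *= 2.0f;
}

class MyKernelCuda : public Tunable {
  float *arg;
  int length;

 protected:
  // this kernel uses no dynamic shared memory
  int sharedBytesPerThread() const { return 0; }
  int sharedBytesPerBlock() const { return 0; }

  // optional overrides, used by perfString() when reporting tuned performance
  long long flops() const { return length; }
  long long bytes() const { return 2ll * length * sizeof(float); }

 public:
  MyKernelCuda(float *arg, int length) : arg(arg), length(length) { }

  // (volume, name, aux) triple identifying this kernel in the tune cache
  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << length;
    return TuneKey(vol.str(), "myKernel");
  }

  // launch with whatever block/grid/shared-memory configuration the tuner supplies
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, QUDA_TUNE_YES, QUDA_VERBOSE);
    myKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg, length);
  }
};

In this pattern the wrapper is used simply by constructing it and calling apply(stream). Presumably the first call for a given TuneKey triggers the sweep defined by advanceTuneParam() (shared bytes, then block dimension, then grid dimension), while later calls, and later runs via loadTuneCache()/saveTuneCache(), reuse the cached optimum; with QUDA_TUNE_NO the parameters from defaultTuneParam() are used instead.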