QUDA v0.4.0
A library for QCD on GPUs
tune_quda.h
#ifndef _TUNE_QUDA_H
#define _TUNE_QUDA_H

#include <quda_internal.h>
#include <dirac_quda.h>

#include <string>
#include <iostream>
#include <iomanip>

class TuneKey {

 public:
  std::string volume;
  std::string name;
  std::string aux;

  TuneKey() { }
  TuneKey(std::string v, std::string n, std::string a=std::string())
    : volume(v), name(n), aux(a) { }
  TuneKey(const TuneKey &key)
    : volume(key.volume), name(key.name), aux(key.aux) { }

  TuneKey& operator=(const TuneKey &key) {
    if (&key != this) {
      volume = key.volume;
      name = key.name;
      aux = key.aux;
    }
    return *this;
  }

  bool operator<(const TuneKey &other) const {
    return (volume < other.volume) ||
      ((volume == other.volume) && (name < other.name)) ||
      ((volume == other.volume) && (name == other.name) && (aux < other.aux));
  }

};


class TuneParam {

 public:
  dim3 block;
  dim3 grid;
  int shared_bytes;
  std::string comment;

  TuneParam() : block(32, 1, 1), grid(1, 1, 1), shared_bytes(0) { }
  TuneParam(const TuneParam &param)
    : block(param.block), grid(param.grid), shared_bytes(param.shared_bytes), comment(param.comment) { }
  TuneParam& operator=(const TuneParam &param) {
    if (&param != this) {
      block = param.block;
      grid = param.grid;
      shared_bytes = param.shared_bytes;
      comment = param.comment;
    }
    return *this;
  }

};


class Tunable {

 protected:
  virtual long long flops() const { return 0; } // FIXME: make pure virtual
  virtual long long bytes() const { return 0; } // FIXME

  // the minimum number of shared bytes per thread
  virtual int sharedBytesPerThread() const = 0;

  // the minimum number of shared bytes per thread block
  virtual int sharedBytesPerBlock() const = 0;

  virtual bool advanceGridDim(TuneParam &param) const
  {
    const unsigned int max_blocks = 256; // FIXME: set a reasonable value for blas currently
    const int step = 1;
    param.grid.x += step;
    if (param.grid.x > max_blocks) {
      param.grid.x = step;
      return false;
    } else {
      return true;
    }
  }

  virtual bool advanceBlockDim(TuneParam &param) const
  {
    const unsigned int max_threads = 512;  // FIXME: use deviceProp.maxThreadsDim[0];
    const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
    const int step = 32;                   // FIXME: use deviceProp.warpSize;
    param.block.x += step;
    if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
      param.block.x = step;
      return false;
    } else {
      return true;
    }
  }

  // Increase the dynamic shared memory request so that one fewer thread block is
  // resident per multiprocessor; once the request would exceed the per-block limit,
  // reset it to the minimum required by the next block dimension and return false.
  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    const int max_shared = 16384;     // FIXME: use deviceProp.sharedMemPerBlock;
    const int max_blocks_per_sm = 16; // FIXME: derive from deviceProp
    int blocks_per_sm = max_shared / (param.shared_bytes ? param.shared_bytes : 1);
    if (blocks_per_sm > max_blocks_per_sm) blocks_per_sm = max_blocks_per_sm;
    param.shared_bytes = max_shared / blocks_per_sm + 1;
    if (param.shared_bytes > max_shared) {
      TuneParam next(param);
      advanceBlockDim(next); // to get next blockDim
      int nthreads = next.block.x * next.block.y * next.block.z;
      param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
        sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
      return false;
    } else {
      return true;
    }
  }

 public:
  Tunable() { }
  virtual ~Tunable() { }
  virtual TuneKey tuneKey() const = 0;
  virtual void apply(const cudaStream_t &stream) = 0;
  virtual void preTune() { }
  virtual void postTune() { }
  virtual int tuningIter() const { return 1; }

  virtual std::string paramString(const TuneParam &param) const
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }

  virtual std::string perfString(float time) const
  {
    float gflops = flops() / (1e9 * time);
    float gbytes = bytes() / (1e9 * time);
    std::stringstream ss;
    ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
    ss << gbytes << " GB/s";
    return ss.str();
  }

  virtual void initTuneParam(TuneParam &param) const
  {
    const int min_block_size = 32;
    param.block = dim3(min_block_size,1,1);
    param.grid = dim3(1,1,1);
    param.shared_bytes = sharedBytesPerThread()*min_block_size > sharedBytesPerBlock() ?
      sharedBytesPerThread()*min_block_size : sharedBytesPerBlock();
  }

  // default launch configuration: the initial parameters but with a fixed grid of 128 blocks
  virtual void defaultTuneParam(TuneParam &param) const
  {
    initTuneParam(param);
    param.grid = dim3(128,1,1);
  }

  virtual bool advanceTuneParam(TuneParam &param) const
  {
    return advanceSharedBytes(param) || advanceBlockDim(param) || advanceGridDim(param);
  }

};

void loadTuneCache(QudaVerbosity verbosity);
void saveTuneCache(QudaVerbosity verbosity);
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity);

#endif // _TUNE_QUDA_H
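
The header above only declares the autotuning interface. As an illustration, the sketch below shows one way a kernel wrapper might plug into it. This is not part of tune_quda.h: the kernel myKernel, its arguments, and the flop/byte counts are hypothetical placeholders, and the sketch assumes the usual QUDA idiom in which apply() fetches its launch configuration from tuneLaunch(), which during a tuning sweep returns the candidate parameters currently being trialled.

// Illustrative sketch only -- not part of tune_quda.h.
// myKernel and its arguments are hypothetical placeholders.
#include <tune_quda.h>
#include <sstream>

// trivial placeholder kernel so the wrapper has something to launch
__global__ void myKernel(float *arg, int length)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < length) arg[i] *= 2.0f;
}

class MyKernelCuda : public Tunable {
  float *arg;
  int length;

 protected:
  // this kernel uses no dynamic shared memory
  int sharedBytesPerThread() const { return 0; }
  int sharedBytesPerBlock() const { return 0; }

  // optional overrides, used by perfString() when reporting tuned performance
  long long flops() const { return length; }
  long long bytes() const { return 2ll * length * sizeof(float); }

 public:
  MyKernelCuda(float *arg, int length) : arg(arg), length(length) { }

  // (volume, name, aux) triple identifying this kernel in the tune cache
  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << length;
    return TuneKey(vol.str(), "myKernel");
  }

  // launch with whatever block/grid/shared-memory configuration the tuner supplies
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, QUDA_TUNE_YES, QUDA_VERBOSE);
    myKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg, length);
  }
};

In this pattern the wrapper is used simply by constructing it and calling apply(stream). Presumably the first call for a given TuneKey triggers the sweep defined by advanceTuneParam() (shared bytes, then block dimension, then grid dimension), while later calls, and later runs via loadTuneCache()/saveTuneCache(), reuse the cached optimum; with QUDA_TUNE_NO the parameters from defaultTuneParam() are used instead.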