QUDA v0.4.0
A library for QCD on GPUs
quda/lib/dslash_quda.cu
#include <cstdlib>
#include <cstdio>
#include <string>
#include <iostream>
#include <sstream> // std::stringstream is used by the tuning keys below

#include <color_spinor_field.h>
#include <clover_field.h>

#define BLOCK_DIM 64

// these control the Wilson-type actions
//#define DIRECT_ACCESS_LINK
//#define DIRECT_ACCESS_WILSON_SPINOR
//#define DIRECT_ACCESS_WILSON_ACCUM
//#define DIRECT_ACCESS_WILSON_INTER
//#define DIRECT_ACCESS_WILSON_PACK_SPINOR
//#define DIRECT_ACCESS_CLOVER

// these are access controls for the staggered action
#if (__COMPUTE_CAPABILITY__ >= 200)
//#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#else
#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
//#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#endif

#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <inline_ptx.h>

enum KernelType {
  INTERIOR_KERNEL = 5,
  EXTERIOR_KERNEL_X = 0,
  EXTERIOR_KERNEL_Y = 1,
  EXTERIOR_KERNEL_Z = 2,
  EXTERIOR_KERNEL_T = 3
};

struct DslashParam {
  int threads; // the desired number of active threads
  int parity;  // Even-Odd or Odd-Even
  int commDim[QUDA_MAX_DIM]; // Whether to do comms or not
  int ghostDim[QUDA_MAX_DIM]; // Whether a ghost zone has been allocated for a given dimension
  int ghostOffset[QUDA_MAX_DIM];
  int ghostNormOffset[QUDA_MAX_DIM];
  KernelType kernel_type; // INTERIOR_KERNEL or EXTERIOR_KERNEL_X/Y/Z/T
};
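
// Illustrative sketch (not part of the original file): how dslashCuda()
// below populates this struct for the interior pass; exact values are
// context-dependent and shown for illustration only.
//
//   dslashParam.threads = volume;              // one thread per site
//   dslashParam.parity = parity;
//   dslashParam.kernel_type = INTERIOR_KERNEL; // then EXTERIOR_KERNEL_X..T
//   // commDim[]/ghostDim[] are set by the wrappers from commDimPartitioned()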

// determines whether the temporal ghost zones are packed with a gather kernel,
// as opposed to multiple calls to cudaMemcpy()
bool kernelPackT = false;

DslashParam dslashParam;

// these are set in initDslashConstants
int Vspatial;

static cudaEvent_t packEnd[Nstream];
static cudaEvent_t gatherStart[Nstream];
static cudaEvent_t gatherEnd[Nstream];
static cudaEvent_t scatterStart[Nstream];
static cudaEvent_t scatterEnd[Nstream];

static struct timeval dslashStart_h;
#ifdef MULTI_GPU
static struct timeval commsStart[Nstream];
static struct timeval commsEnd[Nstream];
#endif

// these events are only used for profiling
#ifdef DSLASH_PROFILING
#define DSLASH_TIME_PROFILE() dslashTimeProfile()

static cudaEvent_t dslashStart;
static cudaEvent_t dslashEnd;
static cudaEvent_t packStart[Nstream];
static cudaEvent_t kernelStart[Nstream];
static cudaEvent_t kernelEnd[Nstream];

// dimension 2 because we record both a start time and an end time
float packTime[Nstream][2];
float gatherTime[Nstream][2];
float commsTime[Nstream][2];
float scatterTime[Nstream][2];
float kernelTime[Nstream][2];
float dslashTime;
#define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
#else
#define CUDA_EVENT_RECORD(a,b)
#define DSLASH_TIME_PROFILE()
#endif

FaceBuffer *face;
cudaColorSpinorField *inSpinor;

// For tuneLaunch() to uniquely identify a suitable set of launch parameters, we need copies of a few of
// the constants set by initDslashConstants().
static struct {
  int x[4];
  int Ls;
  // In the future, we may also want to add gauge_fixed, sp_stride, ga_stride, cl_stride, etc.
} dslashConstants;

// dslashTuning = QUDA_TUNE_YES enables autotuning when the dslash is
// first launched
static QudaTune dslashTuning = QUDA_TUNE_NO;
static QudaVerbosity verbosity = QUDA_SILENT;

void setDslashTuning(QudaTune tune, QudaVerbosity verbose)
{
  dslashTuning = tune;
  verbosity = verbose;
}

#include <dslash_textures.h>
#include <dslash_constants.h>

static inline __device__ float short2float(short a) {
  return (float)a/MAX_SHORT;
}

static inline __device__ short float2short(float c, float a) {
  return (short)(a*c*MAX_SHORT);
}

static inline __device__ short2 float22short2(float c, float2 a) {
  return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
}
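
// Worked example (illustrative, not in the original source): with
// MAX_SHORT = 32767 and a rescaling factor c = 1/s, where s is the
// per-site norm stored in the *Norm arrays, a spinor component a = 0.5f*s
// packs as
//   float2short(1/s, a) = (short)(0.5f * 32767) = 16383
// and unpacks as
//   short2float(16383) * s = (16383.0f/32767) * s ~= 0.49997f * s,
// i.e. the value is recovered to half-precision accuracy.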

#if defined(DIRECT_ACCESS_LINK) || defined(DIRECT_ACCESS_WILSON_SPINOR) || \
  defined(DIRECT_ACCESS_WILSON_ACCUM) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
  defined(DIRECT_ACCESS_WILSON_INTER)

static inline __device__ short4 float42short4(float c, float4 a) {
  return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
}

static inline __device__ float4 short42float4(short4 a) {
  return make_float4(short2float(a.x), short2float(a.y), short2float(a.z), short2float(a.w));
}

static inline __device__ float2 short22float2(short2 a) {
  return make_float2(short2float(a.x), short2float(a.y));
}
#endif // DIRECT_ACCESS inclusions

// Enable shared memory dslash for Fermi architecture
//#define SHARED_WILSON_DSLASH
//#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access

#include <pack_face_def.h>        // kernels for packing the ghost zones and general indexing
#include <staggered_dslash_def.h> // staggered Dslash kernels
#include <wilson_dslash_def.h>    // Wilson Dslash kernels (including clover)
#include <dw_dslash_def.h>        // Domain Wall kernels
#include <tm_dslash_def.h>        // Twisted Mass kernels
#include <tm_core.h>              // solo twisted mass kernel
#include <clover_def.h>           // kernels for applying the clover term alone

#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
#endif

#ifndef CLOVER_SHARED_FLOATS_PER_THREAD
#define CLOVER_SHARED_FLOATS_PER_THREAD 0
#endif

#include <blas_quda.h>
#include <face_quda.h>


__global__ void dummyKernel() {
  // do nothing
}

void initCache() {

#if (__COMPUTE_CAPABILITY__ >= 200)

  static int firsttime = 1;
  if (firsttime) {
    cudaFuncSetCacheConfig(dummyKernel, cudaFuncCachePreferL1);
    dummyKernel<<<1,1>>>();
    firsttime = 0;
  }

#endif

}

void setFace(const FaceBuffer &Face) {
  face = (FaceBuffer*)&Face; // nasty
}

#define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param,  ...)            \
  if (x==0) {                                                                                                     \
    if (reconstruct == QUDA_RECONSTRUCT_NO) {                                                                     \
      FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param);     \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) {                                                              \
      FUNC ## 12 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param);     \
    } else {                                                                                                      \
      FUNC ## 8 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param);       \
    }                                                                                                             \
  } else {                                                                                                        \
    if (reconstruct == QUDA_RECONSTRUCT_NO) {                                                                     \
      FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) {                                                              \
      FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) {                                                               \
      FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    }                                                                                                             \
  }
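
// Example expansion (illustrative, not in the original source): with
// FUNC=dslash, DAG=Dagger, X=Xpay, x != 0 and
// reconstruct == QUDA_RECONSTRUCT_12, the token pasting above emits
//
//   dslash12DaggerXpayKernel<kernel_type>
//     <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param );
//
// i.e. the macro selects among pre-instantiated kernels named by gauge
// reconstruction (18/12/8), daggering, and the optional axpy accumulation.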

#ifndef MULTI_GPU

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param,  ...)                          \
  switch(param.kernel_type) {                                                                                 \
  case INTERIOR_KERNEL:                                                                                       \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                    \
  default:                                                                                                    \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type);                                 \
  }

#else

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param,  ...)                            \
  switch(param.kernel_type) {                                                                                   \
  case INTERIOR_KERNEL:                                                                                         \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL,   gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                      \
  case EXTERIOR_KERNEL_X:                                                                                       \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                      \
  case EXTERIOR_KERNEL_Y:                                                                                       \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                      \
  case EXTERIOR_KERNEL_Z:                                                                                       \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                      \
  case EXTERIOR_KERNEL_T:                                                                                       \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                                                                      \
  }

#endif

// macro used for dslash types with dagger kernel defined (Wilson, domain wall, etc.)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...)     \
  if (!dagger) {                                                        \
    GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  } else {                                                              \
    GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  }

// macro used for staggered dslash
#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param, ...) \
    GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param, __VA_ARGS__)


// Use an abstract class interface to drive the different CUDA dslash
// kernels. All parameters are curried into the derived classes to
// allow a simple interface.
class DslashCuda : public Tunable {
 protected:
  int sharedBytesPerBlock() const { return 0; }
  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
  bool advanceBlockDim(TuneParam &param) const {
    bool advance = Tunable::advanceBlockDim(param);
    if (advance) param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
    return advance;
  }

 public:
  DslashCuda() { }
  virtual ~DslashCuda() { }
  virtual TuneKey tuneKey() const;
  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }
  virtual int Nface() { return 2; }

  virtual void initTuneParam(TuneParam &param) const
  {
    Tunable::initTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    Tunable::defaultTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
  }

};
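
// Editorial note (assumed behaviour, simplified): tuneLaunch() elsewhere in
// QUDA is expected to drive this interface roughly as follows; preTune()
// and postTune() in the derived classes below save/restore the output
// field so that trial launches do not corrupt the result:
//
//   TuneParam p;
//   dslash.initTuneParam(p);
//   do {
//     dslash.preTune();       // save output where the kernel accumulates
//     /* trial-launch the kernel with p and time it */
//     dslash.postTune();      // restore output
//   } while (dslash.advanceBlockDim(p)); // grid recomputed to cover threads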

TuneKey DslashCuda::tuneKey() const
{
  std::stringstream vol, aux;

  vol << dslashConstants.x[0] << "x";
  vol << dslashConstants.x[1] << "x";
  vol << dslashConstants.x[2] << "x";
  vol << dslashConstants.x[3];

  aux << "type=";
#ifdef MULTI_GPU
  char comm[5], ghost[5];
  switch (dslashParam.kernel_type) {
  case INTERIOR_KERNEL: aux << "interior"; break;
  case EXTERIOR_KERNEL_X: aux << "exterior_x"; break;
  case EXTERIOR_KERNEL_Y: aux << "exterior_y"; break;
  case EXTERIOR_KERNEL_Z: aux << "exterior_z"; break;
  case EXTERIOR_KERNEL_T: aux << "exterior_t"; break;
  }
  for (int i=0; i<4; i++) {
    comm[i] = (dslashParam.commDim[i] ? '1' : '0');
    ghost[i] = (dslashParam.ghostDim[i] ? '1' : '0');
  }
  comm[4] = '\0'; ghost[4] = '\0';
  aux << ",comm=" << comm;
  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
    aux << ",ghost=" << ghost;
  }
#else
  aux << "single-GPU";
#endif // MULTI_GPU
  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
}
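
// Example key (illustrative, assumed values): for a 24x24x24x64 local volume
// with only the T dimension partitioned, the interior Wilson dslash with
// 12-parameter reconstruction and Xpay would be keyed roughly as
//   volume = "24x24x24x64"
//   aux    = "type=interior,comm=0001,ghost=0001,reconstruct=12,Xpay"
// (reconstruct and Xpay are appended by the derived tuneKey() overrides).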

#if (__COMPUTE_CAPABILITY__ >= 200 && defined(SHARED_WILSON_DSLASH))
class SharedDslashCuda : public DslashCuda {
 protected:
  int sharedBytesPerBlock() const { return 0; } // FIXME: this isn't quite true, but works
  bool advanceSharedBytes(TuneParam &param) const {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceSharedBytes(param);
    else return false;
  } // FIXME - shared memory tuning only supported on exterior kernels

  int sharedBytes(const dim3 &block) const {
    int warpSize = 32; // FIXME - query from device properties
    int block_xy = block.x*block.y;
    if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
    return block_xy*block.z*sharedBytesPerThread();
  }

  dim3 createGrid(const dim3 &block) const {
    unsigned int gx = ((dslashConstants.x[0]/2)*dslashConstants.x[3] + block.x - 1) / block.x;
    unsigned int gy = (dslashConstants.x[1] + block.y - 1) / block.y;
    unsigned int gz = (dslashConstants.x[2] + block.z - 1) / block.z;
    return dim3(gx, gy, gz);
  }

  bool advanceBlockDim(TuneParam &param) const {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceBlockDim(param);
    const unsigned int min_threads = 2;
    const unsigned int max_threads = 512; // FIXME: use deviceProp.maxThreadsDim[0];
    const unsigned int max_shared = 16384*3; // FIXME: use deviceProp.sharedMemPerBlock;

    // set the x-block dimension equal to the entire x dimension
    bool set = false;
    dim3 blockInit = param.block;
    blockInit.z++;
    for (unsigned bx=blockInit.x; bx<=dslashConstants.x[0]/2; bx++) {
      //unsigned int gx = (dslashConstants.x[0]*dslashConstants.x[3] + bx - 1) / bx;
      for (unsigned by=blockInit.y; by<=dslashConstants.x[1]; by++) {
        unsigned int gy = (dslashConstants.x[1] + by - 1) / by;

        if (by > 1 && (by%2) != 0) continue; // can't handle odd blocks yet except by=1

        for (unsigned bz=blockInit.z; bz<=dslashConstants.x[2]; bz++) {
          unsigned int gz = (dslashConstants.x[2] + bz - 1) / bz;

          if (bz > 1 && (bz%2) != 0) continue; // can't handle odd blocks yet except bz=1
          if (bx*by*bz > max_threads) continue;
          if (bx*by*bz < min_threads) continue;
          // can't yet handle the last block properly in shared memory addressing
          if (by*gy != dslashConstants.x[1]) continue;
          if (bz*gz != dslashConstants.x[2]) continue;
          if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;

          param.block = dim3(bx, by, bz);
          set = true; break;
        }
        if (set) break;
        blockInit.z = 1;
      }
      if (set) break;
      blockInit.y = 1;
    }

    if ((param.block.x > dslashConstants.x[0]/2 && param.block.y > dslashConstants.x[1] &&
         param.block.z > dslashConstants.x[2]) || !set) {
      //||sharedBytesPerThread()*param.block.x > max_shared) {
      param.block = dim3(dslashConstants.x[0]/2, 1, 1);
      return false;
    } else {
      param.grid = createGrid(param.block);
      param.shared_bytes = sharedBytes(param.block);
      return true;
    }

  }

 public:
  SharedDslashCuda() : DslashCuda() { ; }
  virtual ~SharedDslashCuda() { ; }
  std::string paramString(const TuneParam &param) const // override and print out grid as well
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }

  virtual void initTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::initTuneParam(param);

    param.block = dim3(dslashConstants.x[0]/2, 1, 1);
    param.grid = createGrid(param.block);
    param.shared_bytes = sharedBytes(param.block);
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) DslashCuda::defaultTuneParam(param);
    else initTuneParam(param);
  }
};
#else
class SharedDslashCuda : public DslashCuda {
 public:
  SharedDslashCuda() : DslashCuda() { }
  virtual ~SharedDslashCuda() { }
};
#endif


template <typename sFloat, typename gFloat>
class WilsonDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
    if (dslashParam.kernel_type == INTERIOR_KERNEL) { // Interior kernels use shared memory for common input
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else { // Exterior kernels use no shared memory
      return 0;
    }
#else // Pre-Fermi uses shared memory only for pseudo-registers
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  WilsonDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
                   const QudaReconstructType reconstruct, const sFloat *in, const float *inNorm,
                   const sFloat *x, const float *xNorm, const double a,
                   const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), in(in),
    inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }

  virtual ~WilsonDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
           out, outNorm, gauge0, gauge1, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

};

template <typename sFloat, typename gFloat, typename cFloat>
class CloverDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const cFloat *clover;
  const float *cloverNorm;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200)
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else {
      return 0;
    }
#else
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  CloverDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
                   const QudaReconstructType reconstruct, const cFloat *clover,
                   const float *cloverNorm, const sFloat *in, const float *inNorm,
                   const sFloat *x, const float *xNorm, const double a,
                   const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), clover(clover),
    cloverNorm(cloverNorm), in(in), inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }
  virtual ~CloverDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
           out, outNorm, gauge0, gauge1, clover, cloverNorm, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

};

void setTwistParam(double &a, double &b, const double &kappa, const double &mu,
                   const int dagger, const QudaTwistGamma5Type twist) {
  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
    a = 2.0 * kappa * mu;
    b = 1.0;
  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
    a = -2.0 * kappa * mu;
    b = 1.0 / (1.0 + a*a);
  } else {
    errorQuda("Twist type %d not defined\n", twist);
  }
  if (dagger) a *= -1.0;

}
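
// Worked example (illustrative, not in the original source): for
// kappa = 0.12, mu = 0.05 and QUDA_TWIST_GAMMA5_INVERSE,
//   a = -2 * 0.12 * 0.05 = -0.012
//   b = 1 / (1 + a*a)    = 1 / 1.000144 ~= 0.99986
// and the sign of a flips when dagger is set; for QUDA_TWIST_GAMMA5_DIRECT
// the same inputs give a = +0.012 with b = 1.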

template <typename sFloat, typename gFloat>
class TwistedDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  double a;
  double b;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200)
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else {
      return 0;
    }
#else
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  TwistedDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
                    const QudaReconstructType reconstruct, const sFloat *in, const float *inNorm,
                    const sFloat *x, const float *xNorm, const double kappa, const double mu,
                    const double k, const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), in(in),
    inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
    setTwistParam(a, b, kappa, mu, dagger, QUDA_TWIST_GAMMA5_INVERSE);
    if (x) b *= k;
  }
  virtual ~TwistedDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(twistedMassDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
           out, outNorm, gauge0, gauge1, in, inNorm, a, b, x, xNorm);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }
};

template <typename sFloat, typename gFloat>
class DomainWallDslashCuda : public DslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double mferm;
  const double a;

 protected:
  int sharedBytesPerThread() const { return 0; }

 public:
  DomainWallDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
                       const QudaReconstructType reconstruct, const sFloat *in,
                       const float *inNorm, const sFloat *x, const float *xNorm, const double mferm,
                       const double a, const int dagger, const size_t bytes, const size_t norm_bytes)
    : DslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1),
    in(in), inNorm(inNorm), mferm(mferm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }
  virtual ~DomainWallDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream ls, recon;
    ls << dslashConstants.Ls;
    recon << reconstruct;
    key.volume += "x" + ls.str();
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
    DSLASH(domainWallDslash, gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
           out, outNorm, gauge0, gauge1, in, inNorm, mferm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }
};

template <typename sFloat, typename fatGFloat, typename longGFloat>
class StaggeredDslashCuda : public DslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const fatGFloat *fat0, *fat1;
  const longGFloat *long0, *long1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return 6 * reg_size;
  }

 public:
  StaggeredDslashCuda(sFloat *out, float *outNorm, const fatGFloat *fat0, const fatGFloat *fat1,
                      const longGFloat *long0, const longGFloat *long1,
                      const QudaReconstructType reconstruct, const sFloat *in,
                      const float *inNorm, const sFloat *x, const float *xNorm, const double a,
                      const int dagger, const size_t bytes, const size_t norm_bytes)
    : DslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), fat0(fat0), fat1(fat1), long0(long0), long1(long1),
    in(in), inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }

  virtual ~StaggeredDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Axpy";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
    STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
                     out, outNorm, fat0, fat1, long0, long1, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short2)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short2)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

  int Nface() { return 6; } // long links couple sites three hops apart, so the staggered ghost zone is three sites deep per direction
};

#ifdef DSLASH_PROFILING

#define TDIFF(a,b) (1e3*((b).tv_sec - (a).tv_sec + 1e-6*((b).tv_usec - (a).tv_usec)))

void dslashTimeProfile() {

  cudaEventSynchronize(dslashEnd);
  float runTime;
  cudaEventElapsedTime(&runTime, dslashStart, dslashEnd);
  dslashTime += runTime;

  for (int i=4; i>=0; i--) {
    if (!dslashParam.commDim[i] && i<4) continue;

    // kernel timing
    cudaEventElapsedTime(&runTime, dslashStart, kernelStart[2*i]);
    kernelTime[2*i][0] += runTime; // start time
    cudaEventElapsedTime(&runTime, dslashStart, kernelEnd[2*i]);
    kernelTime[2*i][1] += runTime; // end time
  }

#ifdef MULTI_GPU
  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir = 0; dir < 2; dir++) {
      // pack timing
      cudaEventElapsedTime(&runTime, dslashStart, packStart[2*i+dir]);
      packTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, packEnd[2*i+dir]);
      packTime[2*i+dir][1] += runTime; // end time

      // gather timing
      cudaEventElapsedTime(&runTime, dslashStart, gatherStart[2*i+dir]);
      gatherTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, gatherEnd[2*i+dir]);
      gatherTime[2*i+dir][1] += runTime; // end time

      // comms timing
      runTime = TDIFF(dslashStart_h, commsStart[2*i+dir]);
      commsTime[2*i+dir][0] += runTime; // start time
      runTime = TDIFF(dslashStart_h, commsEnd[2*i+dir]);
      commsTime[2*i+dir][1] += runTime; // end time

      // scatter timing
      cudaEventElapsedTime(&runTime, dslashStart, scatterStart[2*i+dir]);
      scatterTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, scatterEnd[2*i+dir]);
      scatterTime[2*i+dir][1] += runTime; // end time
    }
  }
#endif

}

void printDslashProfile() {

  printfQuda("Total Dslash time = %6.2f\n", dslashTime);

  char dimstr[8][8] = {"X-", "X+", "Y-", "Y+", "Z-", "Z+", "T-", "T+"};

  printfQuda("     %13s %13s %13s %13s %13s\n", "Pack", "Gather", "Comms", "Scatter", "Kernel");
  printfQuda("         %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
             "Start", "End", "Start", "End", "Start", "End", "Start", "End", "Start", "End");

  printfQuda("%8s %55s %6.2f %6.2f\n", "Interior", "", kernelTime[8][0], kernelTime[8][1]);

  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir = 0; dir < 2; dir++) {
      printfQuda("%8s ", dimstr[2*i+dir]);
#ifdef MULTI_GPU
      printfQuda("%6.2f %6.2f ", packTime[2*i+dir][0], packTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", gatherTime[2*i+dir][0], gatherTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", commsTime[2*i+dir][0], commsTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", scatterTime[2*i+dir][0], scatterTime[2*i+dir][1]);
#endif

      if (dir==0) printfQuda("%6.2f %6.2f\n", kernelTime[2*i][0], kernelTime[2*i][1]);
      else printfQuda("\n");
    }
  }

}
#endif

int gatherCompleted[Nstream];
int previousDir[Nstream];
int commsCompleted[Nstream];
int commDimTotal;

void initDslashCommsPattern() {
  for (int i=0; i<Nstream-1; i++) {
    gatherCompleted[i] = 0;
    commsCompleted[i] = 0;
  }
  gatherCompleted[Nstream-1] = 1;
  commsCompleted[Nstream-1] = 1;

  // We need to know which was the previous direction in which
  // communication was issued, since we only query a given event /
  // comms call after the previous one has successfully completed.
  for (int i=3; i>=0; i--) {
    if (dslashParam.commDim[i]) {
      int prev = Nstream-1;
      for (int j=3; j>i; j--) if (dslashParam.commDim[j]) prev = 2*j;
      previousDir[2*i + 1] = prev;
      previousDir[2*i + 0] = 2*i + 1; // always valid
    }
  }

  // this tells us how many event / comms occurrences there are in
  // total; used for exiting the while loop
  commDimTotal = 0;
  for (int i=3; i>=0; i--) commDimTotal += dslashParam.commDim[i];
  commDimTotal *= 4; // 2 from pipe length, 2 from direction
}

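// Illustrative walk-through (assumed partitioning, not in the original
// source): with only the Z and T dimensions partitioned
// (commDim = {0,0,1,1}) and Nstream = 9, the loop above yields
//   previousDir[7] = 8  (T-dim, dir 1: gated on the always-complete interior slot)
//   previousDir[6] = 7  (T-dim, dir 0: follows T dir 1)
//   previousDir[5] = 6  (Z-dim, dir 1: follows the T-dimension pair)
//   previousDir[4] = 5  (Z-dim, dir 0: follows Z dir 1)
// and commDimTotal = 2 * 4 = 8 gather/comms completions to wait for.
//
// Overview (editorial summary of the function below): dslashCuda() first
// packs all partitioned faces, launches the interior kernel, then polls
// gather events and comms in the order fixed by previousDir[], issuing
// commsStart() as gathers complete and scatter + exterior kernels as
// comms complete.
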
void dslashCuda(DslashCuda &dslash, const size_t regSize, const int parity, const int dagger,
                const int volume, const int *faceVolumeCB) {

  dslashParam.parity = parity;
  dslashParam.kernel_type = INTERIOR_KERNEL;
  dslashParam.threads = volume;

  CUDA_EVENT_RECORD(dslashStart, 0);
  gettimeofday(&dslashStart_h, NULL);

#ifdef MULTI_GPU
  for (int i = 3; i >= 0; i--) {
    if (!dslashParam.commDim[i]) continue;

    // Record the start of the packing
    CUDA_EVENT_RECORD(packStart[2*i+0], streams[Nstream-1]);
    CUDA_EVENT_RECORD(packStart[2*i+1], streams[Nstream-1]);

    // Initialize pack from source spinor
    face->pack(*inSpinor, 1-parity, dagger, i, streams);

    // Record the end of the packing
    cudaEventRecord(packEnd[2*i+0], streams[Nstream-1]);
    cudaEventRecord(packEnd[2*i+1], streams[Nstream-1]);
  }

  for (int i = 3; i >= 0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir=1; dir>=0; dir--) {
      cudaStreamWaitEvent(streams[2*i+dir], packEnd[2*i+dir], 0);

      // Record the start of the gathering
      CUDA_EVENT_RECORD(gatherStart[2*i+dir], streams[2*i+dir]);

      // Initialize host transfer from source spinor
      face->gather(*inSpinor, dagger, 2*i+dir);

      // Record the end of the gathering
      cudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]);
    }
  }
#endif

  CUDA_EVENT_RECORD(kernelStart[Nstream-1], streams[Nstream-1]);
  dslash.apply(streams[Nstream-1]);
  CUDA_EVENT_RECORD(kernelEnd[Nstream-1], streams[Nstream-1]);

#ifdef MULTI_GPU
  initDslashCommsPattern();

  int completeSum = 0;
  while (completeSum < commDimTotal) {
    for (int i=3; i>=0; i--) {
      if (!dslashParam.commDim[i]) continue;

      for (int dir=1; dir>=0; dir--) {

        // Query if gather has completed
        if (!gatherCompleted[2*i+dir] && gatherCompleted[previousDir[2*i+dir]]) {
          if (cudaSuccess == cudaEventQuery(gatherEnd[2*i+dir])) {
            gatherCompleted[2*i+dir] = 1;
            completeSum++;
            gettimeofday(&commsStart[2*i+dir], NULL);
            face->commsStart(2*i+dir);
          }
        }

        // Query if comms has finished
        if (!commsCompleted[2*i+dir] && commsCompleted[previousDir[2*i+dir]] &&
            gatherCompleted[2*i+dir]) {
          if (face->commsQuery(2*i+dir)) {
            commsCompleted[2*i+dir] = 1;
            completeSum++;
            gettimeofday(&commsEnd[2*i+dir], NULL);

            // Record the start of the scattering
            CUDA_EVENT_RECORD(scatterStart[2*i+dir], streams[2*i+dir]);

            // Scatter into the end zone
            face->scatter(*inSpinor, dagger, 2*i+dir);

            // Record the end of the scattering
            cudaEventRecord(scatterEnd[2*i+dir], streams[2*i+dir]);
          }
        }

      }
    }

  }

  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    dslashParam.kernel_type = static_cast<KernelType>(i);
    dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces

    // wait for scattering to finish and then launch dslash
    cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[2*i], 0);
    cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[2*i+1], 0);

    CUDA_EVENT_RECORD(kernelStart[2*i], streams[Nstream-1]);
    dslash.apply(streams[Nstream-1]); // all faces use this stream
    CUDA_EVENT_RECORD(kernelEnd[2*i], streams[Nstream-1]);
  }

  CUDA_EVENT_RECORD(dslashEnd, 0);
  DSLASH_TIME_PROFILE();

#endif // MULTI_GPU
}

// Wilson wrappers
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity,
                      const int dagger, const cudaColorSpinorField *x, const double &k, const int *commOverride)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_WILSON_DIRAC
  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
  for (int i=0; i<4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
    dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
    dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
  }

  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge %d and spinor %d precision not supported",
              gauge.Precision(), in->Precision());

  const void *xv = (x ? x->V() : 0);
  const void *xn = (x ? x->Norm() : 0);

  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);
  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new WilsonDslashCuda<double2, double2>((double2*)out->V(), (float*)out->Norm(),
                                                    (double2*)gauge0, (double2*)gauge1,
                                                    gauge.Reconstruct(), (double2*)in->V(),
                                                    (float*)in->Norm(), (double2*)xv, (float*)xn,
                                                    k, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new WilsonDslashCuda<float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1,
                                                  gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(),
                                                  (float4*)xv, (float*)xn, k, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new WilsonDslashCuda<short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1,
                                                  gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(),
                                                  (short4*)xv, (float*)xn, k, dagger, in->Bytes(), in->NormBytes());
  }
  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(gauge);

  checkCudaError();
#else
  errorQuda("Wilson dslash has not been built");
#endif // GPU_WILSON_DIRAC

}
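
// Illustrative call (assumed host-side setup, not in the original source):
// apply the Wilson hopping term without the Xpay accumulation. The fields
// `out`, `gauge` and `in` are assumed to be already-constructed QUDA
// objects of matching precision.
//
//   int commOverride[4] = {1, 1, 1, 1}; // leave any partitioned comms on
//   wilsonDslashCuda(&out, gauge, &in, /*parity=*/0, /*dagger=*/0,
//                    /*x=*/0, /*k=*/0.0, commOverride);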
01206 
01207 void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover cloverInv,
01208                       const cudaColorSpinorField *in, const int parity, const int dagger, 
01209                       const cudaColorSpinorField *x, const double &a, const int *commOverride)
01210 {
01211   inSpinor = (cudaColorSpinorField*)in; // EVIL
01212 
01213 #ifdef GPU_CLOVER_DIRAC
01214   int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
01215   for(int i=0;i<4;i++){
01216     dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
01217     dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
01218     dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
01219     dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
01220   }
01221 
01222   void *cloverP, *cloverNormP;
01223   QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, &cloverP, &cloverNormP);
01224 
01225   void *gauge0, *gauge1;
01226   bindGaugeTex(gauge, parity, &gauge0, &gauge1);
01227 
01228   if (in->Precision() != gauge.Precision())
01229     errorQuda("Mixing gauge and spinor precision not supported");
01230 
01231   if (in->Precision() != clover_prec)
01232     errorQuda("Mixing clover and spinor precision not supported");
01233 
01234   const void *xv = x ? x->V() : 0;
01235   const void *xn = x ? x->Norm() : 0;
01236 
01237   DslashCuda *dslash = 0;
01238   size_t regSize = sizeof(float);
01239 
01240   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01241 #if (__COMPUTE_CAPABILITY__ >= 130)
01242     dslash = new CloverDslashCuda<double2, double2, double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0, 
01243                                                              (double2*)gauge1, gauge.Reconstruct(), (double2*)cloverP, 
01244                                                              (float*)cloverNormP, (double2*)in->V(), (float*)in->Norm(),
01245                                                              (double2*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
01246     regSize = sizeof(double);
01247 #else
01248     errorQuda("Double precision not supported on this GPU");
01249 #endif
01250   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01251     dslash = new CloverDslashCuda<float4, float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, 
01252                                                           (float4*)gauge1, gauge.Reconstruct(), (float4*)cloverP, 
01253                                                           (float*)cloverNormP, (float4*)in->V(), (float*)in->Norm(), 
01254                                                           (float4*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
01255   } else if (in->Precision() == QUDA_HALF_PRECISION) {
01256     dslash = new CloverDslashCuda<short4, short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, 
01257                                                           (short4*)gauge1, gauge.Reconstruct(), (short4*)cloverP, 
01258                                                           (float*)cloverNormP, (short4*)in->V(), (float*)in->Norm(), 
01259                                                           (short4*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
01260   }
01261 
01262   dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
01263 
01264   delete dslash;
01265   unbindGaugeTex(gauge);
01266   unbindCloverTex(cloverInv);
01267 
01268   checkCudaError();
01269 #else
01270   errorQuda("Clover dslash has not been built");
01271 #endif
01272 
01273 }
01274 
01275 void twistedMassDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, 
01276                            const cudaColorSpinorField *in, const int parity, const int dagger, 
01277                            const cudaColorSpinorField *x, const double &kappa, const double &mu, 
01278                            const double &a, const int *commOverride)
01279 {
01280   inSpinor = (cudaColorSpinorField*)in; // EVIL
01281 
01282 #ifdef GPU_TWISTED_MASS_DIRAC
01283   int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
01284   for(int i=0;i<4;i++){
01285     dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
01286     dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
01287     dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
01288     dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
01289   }
01290 
01291   void *gauge0, *gauge1;
01292   bindGaugeTex(gauge, parity, &gauge0, &gauge1);
01293 
01294   if (in->Precision() != gauge.Precision())
01295     errorQuda("Mixing gauge and spinor precision not supported");
01296 
01297   const void *xv = x ? x->V() : 0;
01298   const void *xn = x ? x->Norm() : 0;
01299 
01300   DslashCuda *dslash = 0;
01301   size_t regSize = sizeof(float);
01302 
01303   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01304 #if (__COMPUTE_CAPABILITY__ >= 130)
01305     dslash = new TwistedDslashCuda<double2,double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0, 
01306                                                     (double2*)gauge1, gauge.Reconstruct(), (double2*)in->V(), 
01307                                                     (float*)in->Norm(), (double2*)xv, (float*)xn, 
01308                                                     kappa, mu, a, dagger, in->Bytes(), in->NormBytes());
01309     regSize = sizeof(double);
01310 #else
01311     errorQuda("Double precision not supported on this GPU");
01312 #endif
01313   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01314     dslash = new TwistedDslashCuda<float4,float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1, 
01315                                                   gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(), 
01316                                                   (float4*)xv, (float*)xn, kappa, mu, a, dagger, in->Bytes(), in->NormBytes());
01317   } else if (in->Precision() == QUDA_HALF_PRECISION) {
01318     dslash = new TwistedDslashCuda<short4,short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1, 
01319                                                   gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(), 
01320                                                   (short4*)xv, (float*)xn, kappa, mu, a, dagger, in->Bytes(), in->NormBytes());
01321     
01322   }
01323 
01324   dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
01325 
01326   delete dslash;
01327   unbindGaugeTex(gauge);
01328 
01329   checkCudaError();
01330 #else
01331   errorQuda("Twisted mass dslash has not been built");
01332 #endif
01333 
01334 }
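
// A minimal usage sketch for the wrapper above (the field objects and
// parameter values are hypothetical, not from the original source):
//
//   int commOverride[QUDA_MAX_DIM] = {1, 1, 1, 1}; // leave comms enabled
//   // apply the twisted-mass hopping term to the odd sites, no dagger,
//   // no accumulation field (x = 0, so the xpay coefficient a is unused):
//   twistedMassDslashCuda(&out, gauge, &in, 1, 0, 0, kappa, mu, 0.0,
//                         commOverride);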
01335 
01336 void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, 
01337                           const cudaColorSpinorField *in, const int parity, const int dagger, 
01338                           const cudaColorSpinorField *x, const double &m_f, const double &k2)
01339 {
01340   inSpinor = (cudaColorSpinorField*)in; // EVIL
01341 
01342 #ifdef MULTI_GPU
01343   errorQuda("Multi-GPU domain wall not implemented");
01344 #endif
01345 
01346   dslashParam.parity = parity;
01347   dslashParam.threads = in->Volume();
01348 
01349 #ifdef GPU_DOMAIN_WALL_DIRAC
01350   void *gauge0, *gauge1;
01351   bindGaugeTex(gauge, parity, &gauge0, &gauge1);
01352 
01353   if (in->Precision() != gauge.Precision())
01354     errorQuda("Mixing gauge and spinor precision not supported");
01355 
01356   const void *xv = x ? x->V() : 0;
01357   const void *xn = x ? x->Norm() : 0;
01358 
01359   DslashCuda *dslash = 0;
01360   size_t regSize = sizeof(float);
01361 
01362   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01363 #if (__COMPUTE_CAPABILITY__ >= 130)
01364     dslash = new DomainWallDslashCuda<double2,double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0, (double2*)gauge1, 
01365                                                        gauge.Reconstruct(), (double2*)in->V(), (float*)in->Norm(), (double2*)xv, 
01366                                                        (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
01367     regSize = sizeof(double);
01368 #else
01369     errorQuda("Double precision not supported on this GPU");
01370 #endif
01371   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01372     dslash = new DomainWallDslashCuda<float4,float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1, 
01373                                                      gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(), (float4*)xv, 
01374                                                      (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
01375   } else if (in->Precision() == QUDA_HALF_PRECISION) {
01376     dslash = new DomainWallDslashCuda<short4,short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1, 
01377                                                      gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(), (short4*)xv, 
01378                                                      (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
01379   }
01380 
01381   dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
01382 
01383   delete dslash;
01384   unbindGaugeTex(gauge);
01385 
01386   checkCudaError();
01387 #else
01388   errorQuda("Domain wall dslash has not been built");
01389 #endif
01390 
01391 }
01392 
01393 void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge, 
01394                          const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
01395                          const int parity, const int dagger, const cudaColorSpinorField *x,
01396                          const double &k, const int *commOverride)
01397 {
01398   inSpinor = (cudaColorSpinorField*)in; // EVIL
01399 
01400 #ifdef GPU_STAGGERED_DIRAC
01401   int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
01402 
01403   dslashParam.parity = parity;
01404   dslashParam.threads = in->Volume();
01405   for(int i=0;i<4;i++){
01406     dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
01407     dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
01408     dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
01409     dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
01410   }
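
  // For the staggered case the same pad formula gives a smaller value
  // (illustrative arithmetic, assuming the standard QUDA field orders):
  // the spinor carries Ncolor()*Nspin()*2 = 3*1*2 = 6 reals per site, so a
  // float2 order (FieldOrder() == 2) gives Npad = 3.
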
01411   void *fatGauge0, *fatGauge1;
01412   void *longGauge0, *longGauge1;
01413   bindFatGaugeTex(fatGauge, parity, &fatGauge0, &fatGauge1);
01414   bindLongGaugeTex(longGauge, parity, &longGauge0, &longGauge1);
01415     
01416   if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()){
01417     errorQuda("Mixing gauge and spinor precision not supported "
01418               "(spinor=%d, fatGauge=%d, longGauge=%d)",
01419               in->Precision(), fatGauge.Precision(), longGauge.Precision());
01420   }
01421     
01422   const void *xv = x ? x->V() : 0;
01423   const void *xn = x ? x->Norm() : 0;
01424 
01425   DslashCuda *dslash = 0;
01426   size_t regSize = sizeof(float);
01427 
01428   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01429 #if (__COMPUTE_CAPABILITY__ >= 130)
01430     dslash = new StaggeredDslashCuda<double2, double2, double2>((double2*)out->V(), (float*)out->Norm(), 
01431                                                                 (double2*)fatGauge0, (double2*)fatGauge1,
01432                                                                 (double2*)longGauge0, (double2*)longGauge1, 
01433                                                                 longGauge.Reconstruct(), (double2*)in->V(), 
01434                                                                 (float*)in->Norm(), (double2*)xv, (float*)xn, 
01435                                                                 k, dagger, in->Bytes(), in->NormBytes());
01436     regSize = sizeof(double);
01437 #else
01438     errorQuda("Double precision not supported on this GPU");
01439 #endif
01440   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01441     dslash = new StaggeredDslashCuda<float2, float2, float4>((float2*)out->V(), (float*)out->Norm(), 
01442                                                              (float2*)fatGauge0, (float2*)fatGauge1,
01443                                                              (float4*)longGauge0, (float4*)longGauge1, 
01444                                                              longGauge.Reconstruct(), (float2*)in->V(),
01445                                                              (float*)in->Norm(), (float2*)xv, (float*)xn, 
01446                                                              k, dagger, in->Bytes(), in->NormBytes());
01447   } else if (in->Precision() == QUDA_HALF_PRECISION) {  
01448     dslash = new StaggeredDslashCuda<short2, short2, short4>((short2*)out->V(), (float*)out->Norm(), 
01449                                                              (short2*)fatGauge0, (short2*)fatGauge1,
01450                                                              (short4*)longGauge0, (short4*)longGauge1, 
01451                                                              longGauge.Reconstruct(), (short2*)in->V(), 
01452                                                              (float*)in->Norm(), (short2*)xv, (float*)xn, 
01453                                                              k, dagger, in->Bytes(), in->NormBytes());
01454   }
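
  // The mixed vector types above mirror how the fields pack (a sketch of
  // the usual QUDA layout, not a normative statement): a staggered spinor
  // is 3 complex numbers = 6 reals, tiling naturally as 3 float2/short2
  // elements; the unreconstructed fat links (18 reals) likewise use
  // 2-component vectors; and a 12-word reconstructed long link packs as
  // 3 float4/short4 elements -- hence the <float2, float2, float4> and
  // <short2, short2, short4> specializations.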
01455 
01456   dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
01457 
01458   delete dslash;
01459   unbindGaugeTex(fatGauge);
01460   unbindGaugeTex(longGauge);
01461 
01462   checkCudaError();
01463   
01464 #else
01465   errorQuda("Staggered dslash has not been built");
01466 #endif  // GPU_STAGGERED_DIRAC
01467 }
01468 
01469 
01470 template <typename sFloat, typename cFloat>
01471 class CloverCuda : public Tunable {
01472  private:
01473   const size_t bytes, norm_bytes;
01474   sFloat *out;
01475   float *outNorm;
01476   char *saveOut, *saveOutNorm;
01477   const cFloat *clover;
01478   const float *cloverNorm;
01479   const sFloat *in;
01480   const float *inNorm;
01481 
01482  protected:
01483   int sharedBytesPerThread() const
01484   {
01485     int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
01486     return CLOVER_SHARED_FLOATS_PER_THREAD * reg_size;
01487   }
01488   int sharedBytesPerBlock() const { return 0; }
01489   bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
01490 
01491  public:
01492   CloverCuda(sFloat *out, float *outNorm, const cFloat *clover, const float *cloverNorm, const sFloat *in,
01493              const float *inNorm, const size_t bytes, const size_t norm_bytes)
01494     : bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm),
01495       clover(clover), cloverNorm(cloverNorm), in(in), inNorm(inNorm)
01496   {
01497     bindSpinorTex(bytes, norm_bytes, in, inNorm);
01498   }
01499   virtual ~CloverCuda() { unbindSpinorTex(in, inNorm); }
01500   void apply(const cudaStream_t &stream)
01501   {
01502     TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
01503     dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1); // grid size = ceil(threads / block.x)
01504     cloverKernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>(out, outNorm, clover, cloverNorm, in, inNorm, dslashParam);
01505   }
01506   virtual TuneKey tuneKey() const
01507   {
01508     std::stringstream vol;
01509     vol << dslashConstants.x[0] << "x";
01510     vol << dslashConstants.x[1] << "x";
01511     vol << dslashConstants.x[2] << "x";
01512     vol << dslashConstants.x[3];
01513     return TuneKey(vol.str(), typeid(*this).name());
01514   }
01515 
01516   // Need to save the out field if it aliases the in field
01517   void preTune() {
01518     if (in == out) {
01519       saveOut = new char[bytes];
01520       cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
01521       if (typeid(sFloat) == typeid(short4)) {
01522         saveOutNorm = new char[norm_bytes];
01523         cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
01524       }
01525     }
01526   }
01527 
01528   // Restore if the in and out fields alias
01529   void postTune() {
01530     if (in == out) {
01531       cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
01532       delete[] saveOut;
01533       if (typeid(sFloat) == typeid(short4)) {
01534         cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
01535         delete[] saveOutNorm;
01536       }
01537     }
01538   }
01539 
01540   std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
01541   {
01542     std::stringstream ps;
01543     ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
01544     ps << "shared=" << param.shared_bytes;
01545     return ps.str();
01546   }
01547 
01548 };
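
// Why CloverCuda snapshots its output: autotuning launches the kernel
// repeatedly with trial block sizes, so when out aliases in, each trial
// would clobber the very field being read. The tuner is expected to bracket
// its trial launches as follows (a sketch of the calling sequence, with the
// object name hypothetical):
//
//   clov->preTune();              // snapshot out (and outNorm for half)
//   // tuneLaunch(): several trial clov->apply(stream) calls ...
//   clov->postTune();             // restore the snapshot
//
// For half precision the separate norm field must be saved too, which is
// why preTune()/postTune() special-case typeid(sFloat) == typeid(short4).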
01549 
01550 
01551 void cloverCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover clover, 
01552                 const cudaColorSpinorField *in, const int parity) {
01553 
01554   dslashParam.parity = parity;
01555   dslashParam.threads = in->Volume();
01556 
01557 #ifdef GPU_CLOVER_DIRAC
01558   Tunable *clov = 0;
01559   void *cloverP, *cloverNormP;
01560   QudaPrecision clover_prec = bindCloverTex(clover, parity, &cloverP, &cloverNormP);
01561 
01562   if (in->Precision() != clover_prec)
01563     errorQuda("Mixing clover and spinor precision not supported");
01564 
01565   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01566 #if (__COMPUTE_CAPABILITY__ >= 130)
01567     clov = new CloverCuda<double2, double2>((double2*)out->V(), (float*)out->Norm(), (double2*)cloverP, 
01568                                             (float*)cloverNormP, (double2*)in->V(), (float*)in->Norm(), 
01569                                             in->Bytes(), in->NormBytes());
01570 #else
01571     errorQuda("Double precision not supported on this GPU");
01572 #endif
01573   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01574     clov = new CloverCuda<float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)cloverP, 
01575                           (float*)cloverNormP, (float4*)in->V(), (float*)in->Norm(),
01576                           in->Bytes(), in->NormBytes());
01577   } else if (in->Precision() == QUDA_HALF_PRECISION) {
01578     clov = new CloverCuda<short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)cloverP, 
01579                           (float*)cloverNormP, (short4*)in->V(), (float*)in->Norm(), 
01580                           in->Bytes(), in->NormBytes());
01581   }
01582   clov->apply(0); // the clover term is site-local, so the default stream suffices
01583 
01584   unbindCloverTex(clover);
01585   checkCudaError();
01586 
01587   delete clov;
01588 #else
01589   errorQuda("Clover dslash has not been built");
01590 #endif
01591 }
01592 
01593 template <typename sFloat>
01594 class TwistGamma5Cuda : public Tunable {
01595 
01596 private:
01597   sFloat *out;
01598   float *outNorm;
01599   sFloat *in;
01600   float *inNorm;
01601   double a;
01602   double b;
01603   size_t bytes;
01604   size_t norm_bytes;
01605 
01606   int sharedBytesPerThread() const { return 0; }
01607   int sharedBytesPerBlock() const { return 0; }
01608   bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
01609 
01610   char *saveOut, *saveOutNorm;
01611 
01612 public:
01613   TwistGamma5Cuda(sFloat *out, float *outNorm, sFloat *in, float *inNorm,
01614                   double kappa, double mu, const int dagger, 
01615                   QudaTwistGamma5Type twist, size_t bytes, size_t norm_bytes) :
01616     out(out), outNorm(outNorm), in(in), inNorm(inNorm), 
01617     bytes(bytes), norm_bytes(norm_bytes){
01618     bindSpinorTex(bytes, norm_bytes, in, inNorm);
01619     setTwistParam(a, b, kappa, mu, dagger, twist);
01620   }
01621   virtual ~TwistGamma5Cuda() {
01622     unbindSpinorTex(in, inNorm);    
01623   }
01624 
01625   TuneKey tuneKey() const {
01626     std::stringstream vol, aux;
01627     vol << dslashConstants.x[0] << "x";
01628     vol << dslashConstants.x[1] << "x";
01629     vol << dslashConstants.x[2] << "x";
01630     vol << dslashConstants.x[3];    
01631     return TuneKey(vol.str(), typeid(*this).name(), aux.str());
01632   }  
01633 
01634   void apply(const cudaStream_t &stream) {
01635     TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
01636     dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1); // one thread per site, rounded up to whole blocks
01637     twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>> 
01638       (out, outNorm, a, b, in, inNorm, dslashParam);
01639   }
01640 
01641   void preTune() {
01642     saveOut = new char[bytes];
01643     cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
01644     if (typeid(sFloat) == typeid(short4)) {
01645       saveOutNorm = new char[norm_bytes];
01646       cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
01647     }
01648   }
01649 
01650   void postTune() {
01651     cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
01652     delete[] saveOut;
01653     if (typeid(sFloat) == typeid(short4)) {
01654       cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
01655       delete[] saveOutNorm;
01656     }
01657   }
01658 
01659   std::string paramString(const TuneParam &param) const {
01660     std::stringstream ps;
01661     ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
01662     ps << "shared=" << param.shared_bytes;
01663     return ps.str();
01664   }
01665 };
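
// What TwistGamma5Cuda computes, stated as a sketch (setTwistParam() in the
// QUDA source is authoritative for the conventions): the kernel applies the
// twist rotation
//
//   out = a * in + i * b * gamma_5 * in
//
// where for the direct twist a = 1 and b = 2*kappa*mu (sign flipped under
// dagger), while for the inverse twist the sign of b flips and the result
// carries an overall factor 1/(1 + (2*kappa*mu)^2), making the two twists
// mutual inverses.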
01666 
01667 void twistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
01668                      const int dagger, const double &kappa, const double &mu,
01669                      const QudaTwistGamma5Type twist)
01670 {
01671   dslashParam.threads = in->Volume();
01672 
01673 #ifdef GPU_TWISTED_MASS_DIRAC
01674   Tunable *twistGamma5 = 0;
01675 
01676   if (in->Precision() == QUDA_DOUBLE_PRECISION) {
01677 #if (__COMPUTE_CAPABILITY__ >= 130)
01678     twistGamma5 = new TwistGamma5Cuda<double2>
01679       ((double2*)out->V(), (float*)out->Norm(), (double2*)in->V(), 
01680        (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
01681 #else
01682     errorQuda("Double precision not supported on this GPU");
01683 #endif
01684   } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
01685     twistGamma5 = new TwistGamma5Cuda<float4>
01686       ((float4*)out->V(), (float*)out->Norm(), (float4*)in->V(), 
01687        (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
01688   } else if (in->Precision() == QUDA_HALF_PRECISION) {
01689     twistGamma5 = new TwistGamma5Cuda<short4>
01690       ((short4*)out->V(), (float*)out->Norm(), (short4*)in->V(), 
01691        (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
01692   }
01693 
01694   twistGamma5->apply(streams[Nstream-1]); // run in the last (compute) stream
01695   checkCudaError();
01696 
01697   delete twistGamma5;
01698 #else
01699   errorQuda("Twisted mass dslash has not been built");
01700 #endif // GPU_TWISTED_MASS_DIRAC
01701 }
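
// A minimal usage sketch (hypothetical field objects, not from the original
// source), e.g. to apply the inverse twist in place:
//
//   twistGamma5Cuda(&spinor, &spinor, 0 /* no dagger */, kappa, mu,
//                   QUDA_TWIST_GAMMA5_INVERSE);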
01702 
01703 
01704 #include "misc_helpers.cu"
01705 
01706 
01707 #if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_UNITARIZE)
01708 #include <force_common.h>
01709 #include "force_kernel_common.cu"
01710 #endif
01711 
01712 #ifdef GPU_FATLINK
01713 #include "llfat_quda.cu"
01714 #endif
01715 
01716 #ifdef GPU_GAUGE_FORCE
01717 #include "gauge_force_quda.cu"
01718 #endif
01719 
01720 #ifdef GPU_FERMION_FORCE
01721 #include "fermion_force_quda.cu"
01722 #endif
01723 
01724 #ifdef GPU_UNITARIZE
01725 #include "unitarize_links_quda.cu"
01726 #endif
01727 
01728 #ifdef GPU_HISQ_FORCE
01729 #include "hisq_paths_force_quda.cu"
01730 #include "unitarize_force_quda.cu"
01731 #endif