QUDA v0.4.0
A library for QCD on GPUs
#include <cstdlib>
#include <cstdio>
#include <string>
#include <iostream>

#include <color_spinor_field.h>
#include <clover_field.h>

#define BLOCK_DIM 64

// these control the Wilson-type actions
//#define DIRECT_ACCESS_LINK
//#define DIRECT_ACCESS_WILSON_SPINOR
//#define DIRECT_ACCESS_WILSON_ACCUM
//#define DIRECT_ACCESS_WILSON_INTER
//#define DIRECT_ACCESS_WILSON_PACK_SPINOR
//#define DIRECT_ACCESS_CLOVER

// these are access controls for the staggered action
#if (__COMPUTE_CAPABILITY__ >= 200)
//#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#else
#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
//#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#endif

#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <inline_ptx.h>

enum KernelType {
  INTERIOR_KERNEL = 5,
  EXTERIOR_KERNEL_X = 0,
  EXTERIOR_KERNEL_Y = 1,
  EXTERIOR_KERNEL_Z = 2,
  EXTERIOR_KERNEL_T = 3
};

struct DslashParam {
  int threads;                      // the desired number of active threads
  int parity;                       // Even-Odd or Odd-Even
  int commDim[QUDA_MAX_DIM];        // Whether to do comms or not
  int ghostDim[QUDA_MAX_DIM];       // Whether a ghost zone has been allocated for a given dimension
  int ghostOffset[QUDA_MAX_DIM];
  int ghostNormOffset[QUDA_MAX_DIM];
  KernelType kernel_type;           // is it INTERIOR_KERNEL or EXTERIOR_KERNEL_X/Y/Z/T?
};

// determines whether the temporal ghost zones are packed with a gather kernel,
// as opposed to multiple calls to cudaMemcpy()
bool kernelPackT = false;

DslashParam dslashParam;

// these are set in initDslashConstants()
int Vspatial;

static cudaEvent_t packEnd[Nstream];
static cudaEvent_t gatherStart[Nstream];
static cudaEvent_t gatherEnd[Nstream];
static cudaEvent_t scatterStart[Nstream];
static cudaEvent_t scatterEnd[Nstream];

static struct timeval dslashStart_h;
#ifdef MULTI_GPU
static struct timeval commsStart[Nstream];
static struct timeval commsEnd[Nstream];
#endif

// these events are only used for profiling
#ifdef DSLASH_PROFILING
#define DSLASH_TIME_PROFILE() dslashTimeProfile()

static cudaEvent_t dslashStart;
static cudaEvent_t dslashEnd;
static cudaEvent_t packStart[Nstream];
static cudaEvent_t kernelStart[Nstream];
static cudaEvent_t kernelEnd[Nstream];

// dimension 2 because we want absolute and relative
float packTime[Nstream][2];
float gatherTime[Nstream][2];
float commsTime[Nstream][2];
float scatterTime[Nstream][2];
float kernelTime[Nstream][2];
float dslashTime;
#define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
#else
#define CUDA_EVENT_RECORD(a,b)
#define DSLASH_TIME_PROFILE()
#endif

FaceBuffer *face;
cudaColorSpinorField *inSpinor;

// For tuneLaunch() to uniquely identify a suitable set of launch parameters, we need copies of a few of
// the constants set by initDslashConstants().
static struct {
  int x[4];
  int Ls;
  // In the future, we may also want to add gauge_fixed, sp_stride, ga_stride, cl_stride, etc.
} dslashConstants;

// dslashTuning = QUDA_TUNE_YES enables autotuning when the dslash is
// first launched
static QudaTune dslashTuning = QUDA_TUNE_NO;
static QudaVerbosity verbosity = QUDA_SILENT;

void setDslashTuning(QudaTune tune, QudaVerbosity verbose)
{
  dslashTuning = tune;
  verbosity = verbose;
}

#include <dslash_textures.h>
#include <dslash_constants.h>

static inline __device__ float short2float(short a) {
  return (float)a/MAX_SHORT;
}

static inline __device__ short float2short(float c, float a) {
  return (short)(a*c*MAX_SHORT);
}

static inline __device__ short2 float22short2(float c, float2 a) {
  return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
}

#if defined(DIRECT_ACCESS_LINK) || defined(DIRECT_ACCESS_WILSON_SPINOR) || \
  defined(DIRECT_ACCESS_WILSON_ACCUM) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
  defined(DIRECT_ACCESS_WILSON_INTER) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR)

static inline __device__ short4 float42short4(float c, float4 a) {
  return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
}

static inline __device__ float4 short42float4(short4 a) {
  return make_float4(short2float(a.x), short2float(a.y), short2float(a.z), short2float(a.w));
}

static inline __device__ float2 short22float2(short2 a) {
  return make_float2(short2float(a.x), short2float(a.y));
}
#endif // DIRECT_ACCESS inclusions

// Enable shared memory dslash for Fermi architecture
//#define SHARED_WILSON_DSLASH
//#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access

#include <pack_face_def.h>        // kernels for packing the ghost zones and general indexing
#include <staggered_dslash_def.h> // staggered Dslash kernels
#include <wilson_dslash_def.h>    // Wilson Dslash kernels (including clover)
#include <dw_dslash_def.h>        // Domain Wall kernels
#include <tm_dslash_def.h>        // Twisted Mass kernels
#include <tm_core.h>              // solo twisted mass kernel
#include <clover_def.h>           // kernels for applying the clover term alone

#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
#endif

#ifndef CLOVER_SHARED_FLOATS_PER_THREAD
#define CLOVER_SHARED_FLOATS_PER_THREAD 0
#endif

#include <blas_quda.h>
#include <face_quda.h>


__global__ void dummyKernel() {
  // do nothing
}

void initCache() {

#if (__COMPUTE_CAPABILITY__ >= 200)

  static int firsttime = 1;
  if (firsttime) {
    cudaFuncSetCacheConfig(dummyKernel, cudaFuncCachePreferL1);
    dummyKernel<<<1,1>>>();
    firsttime = 0;
  }

#endif

}

void setFace(const FaceBuffer &Face) {
  face = (FaceBuffer*)&Face; // nasty
}
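// Illustrative usage sketch (not part of the original source): a caller would
// typically enable autotuning and register the face-exchange buffers before
// invoking one of the dslash wrappers defined later in this file, e.g.
//
//   setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
//   setFace(faceBuffer);   // faceBuffer is a previously constructed FaceBuffer
//   wilsonDslashCuda(out, gauge, in, parity, dagger, x, kappa, commOverride);
//
// How the fields, gauge links and FaceBuffer are created is defined elsewhere
// in QUDA and is assumed here.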
#define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
  if (x==0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      FUNC ## 12 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
    } else { \
      FUNC ## 8 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
    } \
  }

#ifndef MULTI_GPU

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break; \
  }

#endif

// macro used for dslash types with a dagger kernel defined (Wilson, domain wall, etc.)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...) \
  if (!dagger) { \
    GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  } else { \
    GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  }

// macro used for the staggered dslash
#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param, ...) \
  GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param, __VA_ARGS__)
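// Illustrative sketch (not part of the original source): for a non-dagger call
// with QUDA_RECONSTRUCT_12, x == 0 and kernel_type == INTERIOR_KERNEL, a call
// such as
//
//   DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam, args...);
//
// expands roughly to
//
//   dslash12Kernel<INTERIOR_KERNEL><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(args..., dslashParam);
//
// i.e. the FUNC, reconstruct and DAG/X tokens are pasted together to select one
// of the concrete kernels generated in the *_dslash_def.h headers included above.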
// Use an abstract class interface to drive the different CUDA dslash
// kernels. All parameters are curried into the derived classes to
// allow a simple interface.
class DslashCuda : public Tunable {
 protected:
  int sharedBytesPerBlock() const { return 0; }
  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
  bool advanceBlockDim(TuneParam &param) const {
    bool advance = Tunable::advanceBlockDim(param);
    if (advance) param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
    return advance;
  }

 public:
  DslashCuda() { }
  virtual ~DslashCuda() { }
  virtual TuneKey tuneKey() const;
  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }
  virtual int Nface() { return 2; }

  virtual void initTuneParam(TuneParam &param) const
  {
    Tunable::initTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    Tunable::defaultTuneParam(param);
    param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
  }

};

TuneKey DslashCuda::tuneKey() const
{
  std::stringstream vol, aux;

  vol << dslashConstants.x[0] << "x";
  vol << dslashConstants.x[1] << "x";
  vol << dslashConstants.x[2] << "x";
  vol << dslashConstants.x[3];

  aux << "type=";
#ifdef MULTI_GPU
  char comm[5], ghost[5];
  switch (dslashParam.kernel_type) {
  case INTERIOR_KERNEL: aux << "interior"; break;
  case EXTERIOR_KERNEL_X: aux << "exterior_x"; break;
  case EXTERIOR_KERNEL_Y: aux << "exterior_y"; break;
  case EXTERIOR_KERNEL_Z: aux << "exterior_z"; break;
  case EXTERIOR_KERNEL_T: aux << "exterior_t"; break;
  }
  for (int i=0; i<4; i++) {
    comm[i] = (dslashParam.commDim[i] ? '1' : '0');
    ghost[i] = (dslashParam.ghostDim[i] ? '1' : '0');
  }
  comm[4] = '\0'; ghost[4] = '\0';
  aux << ",comm=" << comm;
  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
    aux << ",ghost=" << ghost;
  }
#else
  aux << "single-GPU";
#endif // MULTI_GPU
  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
}

#if (__COMPUTE_CAPABILITY__ >= 200 && defined(SHARED_WILSON_DSLASH))
class SharedDslashCuda : public DslashCuda {
 protected:
  int sharedBytesPerBlock() const { return 0; } // FIXME: this isn't quite true, but works
  bool advanceSharedBytes(TuneParam &param) const {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceSharedBytes(param);
    else return false;
  } // FIXME - shared memory tuning only supported on exterior kernels

  int sharedBytes(const dim3 &block) const {
    int warpSize = 32; // FIXME - query from device properties
    int block_xy = block.x*block.y;
    if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
    return block_xy*block.z*sharedBytesPerThread();
  }

  dim3 createGrid(const dim3 &block) const {
    unsigned int gx = ((dslashConstants.x[0]/2)*dslashConstants.x[3] + block.x - 1) / block.x;
    unsigned int gy = (dslashConstants.x[1] + block.y - 1 ) / block.y;
    unsigned int gz = (dslashConstants.x[2] + block.z - 1) / block.z;
    return dim3(gx, gy, gz);
  }

  bool advanceBlockDim(TuneParam &param) const {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceBlockDim(param);
    const unsigned int min_threads = 2;
    const unsigned int max_threads = 512;     // FIXME: use deviceProp.maxThreadsDim[0];
    const unsigned int max_shared = 16384*3;  // FIXME: use deviceProp.sharedMemPerBlock;

    // set the x-block dimension equal to the entire x dimension
    bool set = false;
    dim3 blockInit = param.block;
    blockInit.z++;
    for (unsigned bx=blockInit.x; bx<=dslashConstants.x[0]/2; bx++) {
      //unsigned int gx = (dslashConstants.x[0]*dslashConstants.x[3] + bx - 1) / bx;
      for (unsigned by=blockInit.y; by<=dslashConstants.x[1]; by++) {
        unsigned int gy = (dslashConstants.x[1] + by - 1 ) / by;

        if (by > 1 && (by%2) != 0) continue; // can't handle odd blocks yet except by=1

        for (unsigned bz=blockInit.z; bz<=dslashConstants.x[2]; bz++) {
          unsigned int gz = (dslashConstants.x[2] + bz - 1) / bz;

          if (bz > 1 && (bz%2) != 0) continue; // can't handle odd blocks yet except bz=1
          if (bx*by*bz > max_threads) continue;
          if (bx*by*bz < min_threads) continue;
          // can't yet handle the last block properly in shared memory addressing
          if (by*gy != dslashConstants.x[1]) continue;
          if (bz*gz != dslashConstants.x[2]) continue;
          if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;

          param.block = dim3(bx, by, bz);
          set = true; break;
        }
        if (set) break;
        blockInit.z = 1;
      }
      if (set) break;
      blockInit.y = 1;
    }

    if (param.block.x > dslashConstants.x[0]/2 && param.block.y > dslashConstants.x[1] &&
        param.block.z > dslashConstants.x[2] || !set) {
      //|| sharedBytesPerThread()*param.block.x > max_shared) {
      param.block = dim3(dslashConstants.x[0]/2, 1, 1);
      return false;
    } else {
      param.grid = createGrid(param.block);
      param.shared_bytes = sharedBytes(param.block);
      return true;
    }
  }

 public:
  SharedDslashCuda() : DslashCuda() { ; }
  virtual ~SharedDslashCuda() { ; }
  std::string paramString(const TuneParam &param) const // override and print out grid as well
  {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }

  virtual void initTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::initTuneParam(param);

    param.block = dim3(dslashConstants.x[0]/2, 1, 1);
    param.grid = createGrid(param.block);
    param.shared_bytes = sharedBytes(param.block);
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) DslashCuda::defaultTuneParam(param);
    else initTuneParam(param);
  }
};
#else
class SharedDslashCuda : public DslashCuda {
 public:
  SharedDslashCuda() : DslashCuda() { }
  virtual ~SharedDslashCuda() { }
};
#endif


template <typename sFloat, typename gFloat>
class WilsonDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
    if (dslashParam.kernel_type == INTERIOR_KERNEL) { // Interior kernels use shared memory for common input
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else { // Exterior kernels use no shared memory
      return 0;
    }
#else // Pre-Fermi uses shared memory only for pseudo-registers
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  WilsonDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
      const QudaReconstructType reconstruct, const sFloat *in, const float *inNorm,
      const sFloat *x, const float *xNorm, const double a,
      const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), in(in),
      inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }

  virtual ~WilsonDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
        out, outNorm, gauge0, gauge1, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

};

template <typename sFloat, typename gFloat, typename cFloat>
class CloverDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const cFloat *clover;
  const float *cloverNorm;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200)
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else {
      return 0;
    }
#else
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  CloverDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
      const QudaReconstructType reconstruct, const cFloat *clover,
      const float *cloverNorm, const sFloat *in, const float *inNorm,
      const sFloat *x, const float *xNorm, const double a,
      const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), clover(clover),
      cloverNorm(cloverNorm), in(in), inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }
  virtual ~CloverDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
        out, outNorm, gauge0, gauge1, clover, cloverNorm, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

};

void setTwistParam(double &a, double &b, const double &kappa, const double &mu,
    const int dagger, const QudaTwistGamma5Type twist) {
  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
    a = 2.0 * kappa * mu;
    b = 1.0;
  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
    a = -2.0 * kappa * mu;
    b = 1.0 / (1.0 + a*a);
  } else {
    errorQuda("Twist type %d not defined\n", twist);
  }
  if (dagger) a *= -1.0;

}

template <typename sFloat, typename gFloat>
class TwistedDslashCuda : public SharedDslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  double a;
  double b;

 protected:
  int sharedBytesPerThread() const
  {
#if (__COMPUTE_CAPABILITY__ >= 200)
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
      return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
    } else {
      return 0;
    }
#else
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
#endif
  }

 public:
  TwistedDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
      const QudaReconstructType reconstruct, const sFloat *in, const float *inNorm,
      const sFloat *x, const float *xNorm, const double kappa, const double mu,
      const double k, const int dagger, const size_t bytes, const size_t norm_bytes)
    : SharedDslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1), in(in),
      inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
    setTwistParam(a, b, kappa, mu, dagger, QUDA_TWIST_GAMMA5_INVERSE);
    if (x) b *= k;
  }
  virtual ~TwistedDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
#endif
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    DSLASH(twistedMassDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
        out, outNorm, gauge0, gauge1, in, inNorm, a, b, x, xNorm);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }
};

template <typename sFloat, typename gFloat>
class DomainWallDslashCuda : public DslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const gFloat *gauge0, *gauge1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double mferm;
  const double a;

 protected:
  int sharedBytesPerThread() const { return 0; }

 public:
  DomainWallDslashCuda(sFloat *out, float *outNorm, const gFloat *gauge0, const gFloat *gauge1,
      const QudaReconstructType reconstruct, const sFloat *in,
      const float *inNorm, const sFloat *x, const float *xNorm, const double mferm,
      const double a, const int dagger, const size_t bytes, const size_t norm_bytes)
    : DslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), gauge0(gauge0), gauge1(gauge1),
      in(in), inNorm(inNorm), mferm(mferm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }
  virtual ~DomainWallDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream ls, recon;
    ls << dslashConstants.Ls;
    recon << reconstruct;
    key.volume += "x" + ls.str();
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Xpay";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
    DSLASH(domainWallDslash, gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
        out, outNorm, gauge0, gauge1, in, inNorm, mferm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }
};

template <typename sFloat, typename fatGFloat, typename longGFloat>
class StaggeredDslashCuda : public DslashCuda {

 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const sFloat *in, *x;
  const float *inNorm, *xNorm;
  const fatGFloat *fat0, *fat1;
  const longGFloat *long0, *long1;
  const QudaReconstructType reconstruct;
  const int dagger;
  const double a;

 protected:
  int sharedBytesPerThread() const
  {
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return 6 * reg_size;
  }

 public:
  StaggeredDslashCuda(sFloat *out, float *outNorm, const fatGFloat *fat0, const fatGFloat *fat1,
      const longGFloat *long0, const longGFloat *long1,
      const QudaReconstructType reconstruct, const sFloat *in,
      const float *inNorm, const sFloat *x, const float *xNorm, const double a,
      const int dagger, const size_t bytes, const size_t norm_bytes)
    : DslashCuda(), bytes(bytes), norm_bytes(norm_bytes), out(out), outNorm(outNorm), fat0(fat0), fat1(fat1), long0(long0), long1(long1),
      in(in), inNorm(inNorm), reconstruct(reconstruct), dagger(dagger), x(x), xNorm(xNorm), a(a)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm, out, outNorm, x, xNorm);
  }

  virtual ~StaggeredDslashCuda() { unbindSpinorTex(in, inNorm, out, outNorm, x, xNorm); }

  TuneKey tuneKey() const
  {
    TuneKey key = DslashCuda::tuneKey();
    std::stringstream recon;
    recon << reconstruct;
    key.aux += ",reconstruct=" + recon.str();
    if (x) key.aux += ",Axpy";
    return key;
  }

  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
    STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
        out, outNorm, fat0, fat1, long0, long1, in, inNorm, x, xNorm, a);
  }

  void preTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short2)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  void postTune()
  {
    if (dslashParam.kernel_type < 5) { // exterior kernel
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short2)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

  int Nface() { return 6; }
};

#ifdef DSLASH_PROFILING

#define TDIFF(a,b) 1e3*(b.tv_sec - a.tv_sec + 1e-6*(b.tv_usec - a.tv_usec))

void dslashTimeProfile() {

  cudaEventSynchronize(dslashEnd);
  float runTime;
  cudaEventElapsedTime(&runTime, dslashStart, dslashEnd);
  dslashTime += runTime;

  for (int i=4; i>=0; i--) {
    if (!dslashParam.commDim[i] && i<4) continue;

    // kernel timing
    cudaEventElapsedTime(&runTime, dslashStart, kernelStart[2*i]);
    kernelTime[2*i][0] += runTime; // start time
    cudaEventElapsedTime(&runTime, dslashStart, kernelEnd[2*i]);
    kernelTime[2*i][1] += runTime; // end time
  }

#ifdef MULTI_GPU
  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir = 0; dir < 2; dir ++) {
      // pack timing
      cudaEventElapsedTime(&runTime, dslashStart, packStart[2*i+dir]);
      packTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, packEnd[2*i+dir]);
      packTime[2*i+dir][1] += runTime; // end time

      // gather timing
      cudaEventElapsedTime(&runTime, dslashStart, gatherStart[2*i+dir]);
      gatherTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, gatherEnd[2*i+dir]);
      gatherTime[2*i+dir][1] += runTime; // end time

      // comms timing
      runTime = TDIFF(dslashStart_h, commsStart[2*i+dir]);
      commsTime[2*i+dir][0] += runTime; // start time
      runTime = TDIFF(dslashStart_h, commsEnd[2*i+dir]);
      commsTime[2*i+dir][1] += runTime; // end time

      // scatter timing
      cudaEventElapsedTime(&runTime, dslashStart, scatterStart[2*i+dir]);
      scatterTime[2*i+dir][0] += runTime; // start time
      cudaEventElapsedTime(&runTime, dslashStart, scatterEnd[2*i+dir]);
      scatterTime[2*i+dir][1] += runTime; // end time
    }
  }
#endif

}

void printDslashProfile() {

  printfQuda("Total Dslash time = %6.2f\n", dslashTime);

  char dimstr[8][8] = {"X-", "X+", "Y-", "Y+", "Z-", "Z+", "T-", "T+"};

  printfQuda(" %13s %13s %13s %13s %13s\n", "Pack", "Gather", "Comms", "Scatter", "Kernel");
  printfQuda(" %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
      "Start", "End", "Start", "End", "Start", "End", "Start", "End", "Start", "End");

  printfQuda("%8s %55s %6.2f %6.2f\n", "Interior", "", kernelTime[8][0], kernelTime[8][1]);

  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir = 0; dir < 2; dir ++) {
      printfQuda("%8s ", dimstr[2*i+dir]);
#ifdef MULTI_GPU
      printfQuda("%6.2f %6.2f ", packTime[2*i+dir][0], packTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", gatherTime[2*i+dir][0], gatherTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", commsTime[2*i+dir][0], commsTime[2*i+dir][1]);
      printfQuda("%6.2f %6.2f ", scatterTime[2*i+dir][0], scatterTime[2*i+dir][1]);
#endif

      if (dir==0) printfQuda("%6.2f %6.2f\n", kernelTime[2*i][0], kernelTime[2*i][1]);
      else printfQuda("\n");
    }
  }

}
#endif

int gatherCompleted[Nstream];
int previousDir[Nstream];
int commsCompleted[Nstream];
int commDimTotal;
void initDslashCommsPattern() {
  for (int i=0; i<Nstream-1; i++) {
    gatherCompleted[i] = 0;
    commsCompleted[i] = 0;
  }
  gatherCompleted[Nstream-1] = 1;
  commsCompleted[Nstream-1] = 1;

  // We need to know which was the previous direction in which
  // communication was issued, since we only query a given event /
  // comms call after the previous one has successfully completed.
  for (int i=3; i>=0; i--) {
    if (dslashParam.commDim[i]) {
      int prev = Nstream-1;
      for (int j=3; j>i; j--) if (dslashParam.commDim[j]) prev = 2*j;
      previousDir[2*i + 1] = prev;
      previousDir[2*i + 0] = 2*i + 1; // always valid
    }
  }

  // this tells us how many events / comms occurrences there are in
  // total. Used for exiting the while loop.
  commDimTotal = 0;
  for (int i=3; i>=0; i--) commDimTotal += dslashParam.commDim[i];
  commDimTotal *= 4; // 2 from pipe length, 2 from direction
}
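// Overview of the multi-GPU pipeline driven by dslashCuda() below (summary comment
// added for clarity, based on the code that follows): for every partitioned
// dimension the ghost faces are first packed on the device and gathered to the
// host, the interior kernel is launched on the default stream, communications
// are started as each gather completes, completed messages are scattered back
// into the ghost zones, and finally one exterior kernel per dimension is run
// once the corresponding scatters have finished.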
void dslashCuda(DslashCuda &dslash, const size_t regSize, const int parity, const int dagger,
    const int volume, const int *faceVolumeCB) {

  dslashParam.parity = parity;
  dslashParam.kernel_type = INTERIOR_KERNEL;
  dslashParam.threads = volume;

  CUDA_EVENT_RECORD(dslashStart, 0);
  gettimeofday(&dslashStart_h, NULL);

#ifdef MULTI_GPU
  for (int i = 3; i >= 0; i--) {
    if (!dslashParam.commDim[i]) continue;

    // Record the start of the packing
    CUDA_EVENT_RECORD(packStart[2*i+0], streams[Nstream-1]);
    CUDA_EVENT_RECORD(packStart[2*i+1], streams[Nstream-1]);

    // Initialize pack from source spinor
    face->pack(*inSpinor, 1-parity, dagger, i, streams);

    // Record the end of the packing
    cudaEventRecord(packEnd[2*i+0], streams[Nstream-1]);
    cudaEventRecord(packEnd[2*i+1], streams[Nstream-1]);
  }

  for (int i = 3; i >= 0; i--) {
    if (!dslashParam.commDim[i]) continue;

    for (int dir=1; dir>=0; dir--) {
      cudaStreamWaitEvent(streams[2*i+dir], packEnd[2*i+dir], 0);

      // Record the start of the gathering
      CUDA_EVENT_RECORD(gatherStart[2*i+dir], streams[2*i+dir]);

      // Initialize host transfer from source spinor
      face->gather(*inSpinor, dagger, 2*i+dir);

      // Record the end of the gathering
      cudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]);
    }
  }
#endif

  CUDA_EVENT_RECORD(kernelStart[Nstream-1], streams[Nstream-1]);
  dslash.apply(streams[Nstream-1]);
  CUDA_EVENT_RECORD(kernelEnd[Nstream-1], streams[Nstream-1]);

#ifdef MULTI_GPU
  initDslashCommsPattern();

  int completeSum = 0;
  while (completeSum < commDimTotal) {
    for (int i=3; i>=0; i--) {
      if (!dslashParam.commDim[i]) continue;

      for (int dir=1; dir>=0; dir--) {

        // Query if the gather has completed
        if (!gatherCompleted[2*i+dir] && gatherCompleted[previousDir[2*i+dir]]) {
          if (cudaSuccess == cudaEventQuery(gatherEnd[2*i+dir])) {
            gatherCompleted[2*i+dir] = 1;
            completeSum++;
            gettimeofday(&commsStart[2*i+dir], NULL);
            face->commsStart(2*i+dir);
          }
        }

        // Query if comms has finished
        if (!commsCompleted[2*i+dir] && commsCompleted[previousDir[2*i+dir]] &&
            gatherCompleted[2*i+dir]) {
          if (face->commsQuery(2*i+dir)) {
            commsCompleted[2*i+dir] = 1;
            completeSum++;
            gettimeofday(&commsEnd[2*i+dir], NULL);

            // Record the start of the scattering
            CUDA_EVENT_RECORD(scatterStart[2*i+dir], streams[2*i+dir]);

            // Scatter into the end zone
            face->scatter(*inSpinor, dagger, 2*i+dir);

            // Record the end of the scattering
            cudaEventRecord(scatterEnd[2*i+dir], streams[2*i+dir]);
          }
        }

      }
    }

  }

  for (int i=3; i>=0; i--) {
    if (!dslashParam.commDim[i]) continue;

    dslashParam.kernel_type = static_cast<KernelType>(i);
    dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces

    // wait for scattering to finish and then launch dslash
    cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[2*i], 0);
    cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[2*i+1], 0);

    CUDA_EVENT_RECORD(kernelStart[2*i], streams[Nstream-1]);
    dslash.apply(streams[Nstream-1]); // all faces use this stream
    CUDA_EVENT_RECORD(kernelEnd[2*i], streams[Nstream-1]);
  }

  CUDA_EVENT_RECORD(dslashEnd, 0);
  DSLASH_TIME_PROFILE();

#endif // MULTI_GPU
}

// Wilson wrappers
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity,
    const int dagger, const cudaColorSpinorField *x, const double &k, const int *commOverride)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_WILSON_DIRAC
  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
  for (int i=0; i<4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
    dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
    dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
  }

  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge %d and spinor %d precision not supported",
        gauge.Precision(), in->Precision());

  const void *xv = (x ? x->V() : 0);
  const void *xn = (x ? x->Norm() : 0);

  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);
  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new WilsonDslashCuda<double2, double2>((double2*)out->V(), (float*)out->Norm(),
        (double2*)gauge0, (double2*)gauge1,
        gauge.Reconstruct(), (double2*)in->V(),
        (float*)in->Norm(), (double2*)xv, (float*)xn,
        k, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new WilsonDslashCuda<float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1,
        gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(),
        (float4*)xv, (float*)xn, k, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new WilsonDslashCuda<short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1,
        gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(),
        (short4*)xv, (float*)xn, k, dagger, in->Bytes(), in->NormBytes());
  }
  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(gauge);

  checkCudaError();
#else
  errorQuda("Wilson dslash has not been built");
#endif // GPU_WILSON_DIRAC

}

void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover cloverInv,
    const cudaColorSpinorField *in, const int parity, const int dagger,
    const cudaColorSpinorField *x, const double &a, const int *commOverride)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_CLOVER_DIRAC
  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
  for (int i=0; i<4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
    dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
    dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
  }

  void *cloverP, *cloverNormP;
  QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, &cloverP, &cloverNormP);

  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge and spinor precision not supported");

  if (in->Precision() != clover_prec)
    errorQuda("Mixing clover and spinor precision not supported");

  const void *xv = x ? x->V() : 0;
  const void *xn = x ? x->Norm() : 0;

  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new CloverDslashCuda<double2, double2, double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0,
        (double2*)gauge1, gauge.Reconstruct(), (double2*)cloverP,
        (float*)cloverNormP, (double2*)in->V(), (float*)in->Norm(),
        (double2*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new CloverDslashCuda<float4, float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0,
        (float4*)gauge1, gauge.Reconstruct(), (float4*)cloverP,
        (float*)cloverNormP, (float4*)in->V(), (float*)in->Norm(),
        (float4*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new CloverDslashCuda<short4, short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0,
        (short4*)gauge1, gauge.Reconstruct(), (short4*)cloverP,
        (float*)cloverNormP, (short4*)in->V(), (float*)in->Norm(),
        (short4*)xv, (float*)xn, a, dagger, in->Bytes(), in->NormBytes());
  }

  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(gauge);
  unbindCloverTex(cloverInv);

  checkCudaError();
#else
  errorQuda("Clover dslash has not been built");
#endif

}

void twistedMassDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge,
    const cudaColorSpinorField *in, const int parity, const int dagger,
    const cudaColorSpinorField *x, const double &kappa, const double &mu,
    const double &a, const int *commOverride)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_TWISTED_MASS_DIRAC
  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
  for (int i=0; i<4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
    dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
    dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
  }

  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge and spinor precision not supported");

  const void *xv = x ? x->V() : 0;
  const void *xn = x ? x->Norm() : 0;

  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new TwistedDslashCuda<double2,double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0,
        (double2*)gauge1, gauge.Reconstruct(), (double2*)in->V(),
        (float*)in->Norm(), (double2*)xv, (float*)xn,
        kappa, mu, a, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new TwistedDslashCuda<float4,float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1,
        gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(),
        (float4*)xv, (float*)xn, kappa, mu, a, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new TwistedDslashCuda<short4,short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1,
        gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(),
        (short4*)xv, (float*)xn, kappa, mu, a, dagger, in->Bytes(), in->NormBytes());

  }

  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(gauge);

  checkCudaError();
#else
  errorQuda("Twisted mass dslash has not been built");
#endif

}

void domainWallDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge,
    const cudaColorSpinorField *in, const int parity, const int dagger,
    const cudaColorSpinorField *x, const double &m_f, const double &k2)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef MULTI_GPU
  errorQuda("Multi-GPU domain wall not implemented\n");
#endif

  dslashParam.parity = parity;
  dslashParam.threads = in->Volume();

#ifdef GPU_DOMAIN_WALL_DIRAC
  void *gauge0, *gauge1;
  bindGaugeTex(gauge, parity, &gauge0, &gauge1);

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge and spinor precision not supported");

  const void *xv = x ? x->V() : 0;
  const void *xn = x ? x->Norm() : 0;
  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new DomainWallDslashCuda<double2,double2>((double2*)out->V(), (float*)out->Norm(), (double2*)gauge0, (double2*)gauge1,
        gauge.Reconstruct(), (double2*)in->V(), (float*)in->Norm(), (double2*)xv,
        (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new DomainWallDslashCuda<float4,float4>((float4*)out->V(), (float*)out->Norm(), (float4*)gauge0, (float4*)gauge1,
        gauge.Reconstruct(), (float4*)in->V(), (float*)in->Norm(), (float4*)xv,
        (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new DomainWallDslashCuda<short4,short4>((short4*)out->V(), (float*)out->Norm(), (short4*)gauge0, (short4*)gauge1,
        gauge.Reconstruct(), (short4*)in->V(), (float*)in->Norm(), (short4*)xv,
        (float*)xn, m_f, k2, dagger, in->Bytes(), in->NormBytes());
  }

  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(gauge);

  checkCudaError();
#else
  errorQuda("Domain wall dslash has not been built");
#endif

}

void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge,
    const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
    const int parity, const int dagger, const cudaColorSpinorField *x,
    const double &k, const int *commOverride)
{
  inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_STAGGERED_DIRAC
  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code

  dslashParam.parity = parity;
  dslashParam.threads = in->Volume();
  for (int i=0; i<4; i++) {
    dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
    dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
    dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
    dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
  }
  void *fatGauge0, *fatGauge1;
  void *longGauge0, *longGauge1;
  bindFatGaugeTex(fatGauge, parity, &fatGauge0, &fatGauge1);
  bindLongGaugeTex(longGauge, parity, &longGauge0, &longGauge1);

  if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()) {
    errorQuda("Mixing gauge and spinor precision not supported "
        "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d)",
        in->Precision(), fatGauge.Precision(), longGauge.Precision());
  }

  const void *xv = x ? x->V() : 0;
  const void *xn = x ? x->Norm() : 0;
  DslashCuda *dslash = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    dslash = new StaggeredDslashCuda<double2, double2, double2>((double2*)out->V(), (float*)out->Norm(),
        (double2*)fatGauge0, (double2*)fatGauge1,
        (double2*)longGauge0, (double2*)longGauge1,
        longGauge.Reconstruct(), (double2*)in->V(),
        (float*)in->Norm(), (double2*)xv, (float*)xn,
        k, dagger, in->Bytes(), in->NormBytes());
    regSize = sizeof(double);
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new StaggeredDslashCuda<float2, float2, float4>((float2*)out->V(), (float*)out->Norm(),
        (float2*)fatGauge0, (float2*)fatGauge1,
        (float4*)longGauge0, (float4*)longGauge1,
        longGauge.Reconstruct(), (float2*)in->V(),
        (float*)in->Norm(), (float2*)xv, (float*)xn,
        k, dagger, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new StaggeredDslashCuda<short2, short2, short4>((short2*)out->V(), (float*)out->Norm(),
        (short2*)fatGauge0, (short2*)fatGauge1,
        (short4*)longGauge0, (short4*)longGauge1,
        longGauge.Reconstruct(), (short2*)in->V(),
        (float*)in->Norm(), (short2*)xv, (float*)xn,
        k, dagger, in->Bytes(), in->NormBytes());
  }

  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());

  delete dslash;
  unbindGaugeTex(fatGauge);
  unbindGaugeTex(longGauge);

  checkCudaError();

#else
  errorQuda("Staggered dslash has not been built");
#endif // GPU_STAGGERED_DIRAC
}


template <typename sFloat, typename cFloat>
class CloverCuda : public Tunable {
 private:
  const size_t bytes, norm_bytes;
  sFloat *out;
  float *outNorm;
  char *saveOut, *saveOutNorm;
  const cFloat *clover;
  const float *cloverNorm;
  const sFloat *in;
  const float *inNorm;

 protected:
  int sharedBytesPerThread() const
  {
    int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
    return CLOVER_SHARED_FLOATS_PER_THREAD * reg_size;
  }
  int sharedBytesPerBlock() const { return 0; }
  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
 public:
  CloverCuda(sFloat *out, float *outNorm, const cFloat *clover, const float *cloverNorm, const sFloat *in,
      const float *inNorm, const size_t bytes, const size_t norm_bytes)
    : out(out), outNorm(outNorm), clover(clover), cloverNorm(cloverNorm), in(in), inNorm(inNorm),
      bytes(bytes), norm_bytes(norm_bytes)
  {
    bindSpinorTex(bytes, norm_bytes, in, inNorm);
  }
  virtual ~CloverCuda() { unbindSpinorTex(in, inNorm); }
  void apply(const cudaStream_t &stream)
  {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
    cloverKernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>(out, outNorm, clover, cloverNorm, in, inNorm, dslashParam);
  }
  virtual TuneKey tuneKey() const
  {
    std::stringstream vol, aux;
    vol << dslashConstants.x[0] << "x";
    vol << dslashConstants.x[1] << "x";
    vol << dslashConstants.x[2] << "x";
    vol << dslashConstants.x[3];
    return TuneKey(vol.str(), typeid(*this).name());
  }

  // Need to save the out field if it aliases the in field
  void preTune() {
    if (in == out) {
      saveOut = new char[bytes];
      cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
      if (typeid(sFloat) == typeid(short4)) {
        saveOutNorm = new char[norm_bytes];
        cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }
  }

  // Restore if the in and out fields alias
  void postTune() {
    if (in == out) {
      cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
      delete[] saveOut;
      if (typeid(sFloat) == typeid(short4)) {
        cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
        delete[] saveOutNorm;
      }
    }
  }

  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
void cloverCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover clover,
                const cudaColorSpinorField *in, const int parity) {

  dslashParam.parity = parity;
  dslashParam.threads = in->Volume();

#ifdef GPU_CLOVER_DIRAC
  Tunable *clov = 0;
  void *cloverP, *cloverNormP;
  QudaPrecision clover_prec = bindCloverTex(clover, parity, &cloverP, &cloverNormP);

  if (in->Precision() != clover_prec)
    errorQuda("Mixing clover and spinor precision not supported");

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    clov = new CloverCuda<double2, double2>((double2*)out->V(), (float*)out->Norm(), (double2*)cloverP,
                                            (float*)cloverNormP, (double2*)in->V(), (float*)in->Norm(),
                                            in->Bytes(), in->NormBytes());
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    clov = new CloverCuda<float4, float4>((float4*)out->V(), (float*)out->Norm(), (float4*)cloverP,
                                          (float*)cloverNormP, (float4*)in->V(), (float*)in->Norm(),
                                          in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    clov = new CloverCuda<short4, short4>((short4*)out->V(), (float*)out->Norm(), (short4*)cloverP,
                                          (float*)cloverNormP, (short4*)in->V(), (float*)in->Norm(),
                                          in->Bytes(), in->NormBytes());
  }
  clov->apply(0);

  unbindCloverTex(clover);
  checkCudaError();

  delete clov;
#else
  errorQuda("Clover dslash has not been built");
#endif
}

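// TwistGamma5Cuda applies the twisted-mass gamma_5 twist to a spinor field.
// The twist coefficients a and b are derived from kappa, mu, dagger and the
// requested QudaTwistGamma5Type by setTwistParam(), and the class follows the
// same Tunable pattern as CloverCuda above: the input spinor is bound to a
// texture in the constructor, and preTune()/postTune() save and restore the
// output field (plus its norm array at half precision) across trial launches.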
template <typename sFloat>
class TwistGamma5Cuda : public Tunable {

 private:
  sFloat *out;
  float *outNorm;
  sFloat *in;
  float *inNorm;
  double a;
  double b;
  size_t bytes;
  size_t norm_bytes;

  int sharedBytesPerThread() const { return 0; }
  int sharedBytesPerBlock() const { return 0; }
  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.

  char *saveOut, *saveOutNorm;

 public:
  TwistGamma5Cuda(sFloat *out, float *outNorm, sFloat *in, float *inNorm,
                  double kappa, double mu, const int dagger,
                  QudaTwistGamma5Type twist, size_t bytes, size_t norm_bytes) :
    out(out), outNorm(outNorm), in(in), inNorm(inNorm),
    bytes(bytes), norm_bytes(norm_bytes) {
    bindSpinorTex(bytes, norm_bytes, in, inNorm);
    setTwistParam(a, b, kappa, mu, dagger, twist);
  }
  virtual ~TwistGamma5Cuda() {
    unbindSpinorTex(in, inNorm);
  }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << dslashConstants.x[0] << "x";
    vol << dslashConstants.x[1] << "x";
    vol << dslashConstants.x[2] << "x";
    vol << dslashConstants.x[3];
    return TuneKey(vol.str(), typeid(*this).name(), aux.str());
  }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
    dim3 gridDim((dslashParam.threads + tp.block.x - 1) / tp.block.x, 1, 1);
    twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
      (out, outNorm, a, b, in, inNorm, dslashParam);
  }

  void preTune() {
    saveOut = new char[bytes];
    cudaMemcpy(saveOut, out, bytes, cudaMemcpyDeviceToHost);
    if (typeid(sFloat) == typeid(short4)) {
      saveOutNorm = new char[norm_bytes];
      cudaMemcpy(saveOutNorm, outNorm, norm_bytes, cudaMemcpyDeviceToHost);
    }
  }

  void postTune() {
    cudaMemcpy(out, saveOut, bytes, cudaMemcpyHostToDevice);
    delete[] saveOut;
    if (typeid(sFloat) == typeid(short4)) {
      cudaMemcpy(outNorm, saveOutNorm, norm_bytes, cudaMemcpyHostToDevice);
      delete[] saveOutNorm;
    }
  }

  std::string paramString(const TuneParam &param) const {
    std::stringstream ps;
    ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
    ps << "shared=" << param.shared_bytes;
    return ps.str();
  }
};

void twistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
                     const int dagger, const double &kappa, const double &mu,
                     const QudaTwistGamma5Type twist)
{
  dslashParam.threads = in->Volume();

#ifdef GPU_TWISTED_MASS_DIRAC
  Tunable *twistGamma5 = 0;

  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    twistGamma5 = new TwistGamma5Cuda<double2>
      ((double2*)out->V(), (float*)out->Norm(), (double2*)in->V(),
       (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
#else
    errorQuda("Double precision not supported on this GPU");
#endif
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    twistGamma5 = new TwistGamma5Cuda<float4>
      ((float4*)out->V(), (float*)out->Norm(), (float4*)in->V(),
       (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    twistGamma5 = new TwistGamma5Cuda<short4>
      ((short4*)out->V(), (float*)out->Norm(), (short4*)in->V(),
       (float*)in->Norm(), kappa, mu, dagger, twist, in->Bytes(), in->NormBytes());
  }

  twistGamma5->apply(streams[Nstream-1]);
  checkCudaError();

  delete twistGamma5;
#else
  errorQuda("Twisted mass dslash has not been built");
#endif // GPU_TWISTED_MASS_DIRAC
}


#include "misc_helpers.cu"

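// The remaining kernels (fat-link construction, gauge/fermion/HISQ forces and
// link unitarization) are compiled in only when the corresponding GPU_* build
// flags are defined, so a build configured without them pays no compile-time
// or binary-size cost for these modules.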
#if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_UNITARIZE)
#include <force_common.h>
#include "force_kernel_common.cu"
#endif

#ifdef GPU_FATLINK
#include "llfat_quda.cu"
#endif

#ifdef GPU_GAUGE_FORCE
#include "gauge_force_quda.cu"
#endif

#ifdef GPU_FERMION_FORCE
#include "fermion_force_quda.cu"
#endif

#ifdef GPU_UNITARIZE
#include "unitarize_links_quda.cu"
#endif

#ifdef GPU_HISQ_FORCE
#include "hisq_paths_force_quda.cu"
#include "unitarize_force_quda.cu"
#endif