10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
20 #ifdef GPU_STAGGERED_DIRAC
21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
28 #elif (__COMPUTE_CAPABILITY__ >= 200)
31 #define DIRECT_ACCESS_SPINOR
36 #define DIRECT_ACCESS_FAT_LINK
43 #endif // GPU_STAGGERED_DIRAC
55 namespace dslash_aux {
58 #include <dslash_index.cuh>
65 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
66 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
69 #ifndef CLOVER_SHARED_FLOATS_PER_THREAD
70 #define CLOVER_SHARED_FLOATS_PER_THREAD 0
73 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD
74 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0
80 static bool kernelPackT =
false;
88 static bool twistPack =
false;
97 cudaEvent_t interiorDslashEnd;
110 using namespace dslash;
112 for (
int i=0; i<
Nstream; i++) {
113 cudaEventCreate(&
packEnd[i], cudaEventDisableTiming);
114 cudaEventCreate(&
gatherStart[i], cudaEventDisableTiming);
115 cudaEventCreate(&
gatherEnd[i], cudaEventDisableTiming);
116 cudaEventCreateWithFlags(&
scatterStart[i], cudaEventDisableTiming);
117 cudaEventCreateWithFlags(&
scatterEnd[i], cudaEventDisableTiming);
119 cudaEventCreateWithFlags(&
dslashStart, cudaEventDisableTiming);
120 cudaEventCreateWithFlags(&
dslashEnd, cudaEventDisableTiming);
122 cudaEventCreateWithFlags(&interiorDslashEnd, cudaEventDisableTiming);
131 using namespace dslash;
132 for (
int i=0; i<
Nstream; i++) {
143 cudaEventDestroy(interiorDslashEnd);
149 using namespace dslash_aux;
151 template <
typename sFloat,
typename cFloat>
156 char *saveOut, *saveOutNorm;
157 const cFloat *clover;
158 const float *cloverNorm;
164 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
174 : out(out), clover(clover), cloverNorm(cloverNorm), in(in)
176 bindSpinorTex<sFloat>(
in);
177 dslashParam.sp_stride = in->
Stride();
178 #ifdef GPU_CLOVER_DIRAC
179 dslashParam.cl_stride = cl_stride;
186 dim3 gridDim( (dslashParam.threads+tp.
block.x-1) / tp.
block.x, 1, 1);
188 ((sFloat*)
out->V(), (
float*)
out->Norm(), clover, cloverNorm,
189 (sFloat*)
in->V(), (
float*)
in->Norm(), dslashParam);
196 saveOut =
new char[
out->Bytes()];
197 cudaMemcpy(saveOut,
out->V(),
out->Bytes(), cudaMemcpyDeviceToHost);
198 if (
typeid(sFloat) ==
typeid(short4)) {
199 saveOutNorm =
new char[
out->NormBytes()];
200 cudaMemcpy(saveOutNorm,
out->Norm(),
out->NormBytes(), cudaMemcpyDeviceToHost);
208 cudaMemcpy(
out->V(), saveOut,
out->Bytes(), cudaMemcpyHostToDevice);
210 if (
typeid(sFloat) ==
typeid(short4)) {
211 cudaMemcpy(
out->Norm(), saveOutNorm,
out->NormBytes(), cudaMemcpyHostToDevice);
212 delete[] saveOutNorm;
219 std::stringstream ps;
220 ps <<
"block=(" <<
param.block.x <<
"," <<
param.block.y <<
"," <<
param.block.z <<
"), ";
221 ps <<
"shared=" <<
param.shared_bytes;
225 long long flops()
const {
return 504ll *
in->VolumeCB(); }
232 dslashParam.parity =
parity;
233 dslashParam.threads = in->
Volume();
235 #ifdef GPU_CLOVER_DIRAC
237 void *cloverP, *cloverNormP;
241 errorQuda(
"Mixing clover and spinor precision not supported");
244 #if (__COMPUTE_CAPABILITY__ >= 130)
247 errorQuda(
"Double precision not supported on this GPU");
261 errorQuda(
"Clover dslash has not been built");
266 template <
typename sFloat>
276 unsigned int sharedBytesPerThread()
const {
return 0; }
277 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
278 bool tuneGridDim()
const {
return false; }
279 unsigned int minThreads()
const {
return in->X(0) *
in->X(1) *
in->X(2) *
in->X(3); }
281 char *saveOut, *saveOutNorm;
288 bindSpinorTex<sFloat>(
in);
289 dslashParam.sp_stride = in->
Stride();
292 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
293 dslashParam.fl_stride = in->
VolumeCB();
296 a =
kappa, b =
mu, c = epsilon;
297 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
298 dslashParam.fl_stride = in->
VolumeCB()/2;
304 unbindSpinorTex<sFloat>(
in);
311 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
313 dim3 gridDim( (dslashParam.threads+tp.
block.x-1) / tp.
block.x, 1, 1);
316 ((sFloat*)
out->V(), (
float*)
out->Norm(), a, b,
317 (sFloat*)
in->V(), (
float*)
in->Norm(), dslashParam);
320 ((sFloat*)
out->V(), (
float*)
out->Norm(), a, b, c,
321 (sFloat*)
in->V(), (
float*)
in->Norm(), dslashParam);
327 saveOut =
new char[
out->Bytes()];
328 cudaMemcpy(saveOut,
out->V(),
out->Bytes(), cudaMemcpyDeviceToHost);
329 if (
typeid(sFloat) ==
typeid(short4)) {
330 saveOutNorm =
new char[
out->NormBytes()];
331 cudaMemcpy(saveOutNorm,
out->Norm(),
out->NormBytes(), cudaMemcpyDeviceToHost);
336 cudaMemcpy(
out->V(), saveOut,
out->Bytes(), cudaMemcpyHostToDevice);
338 if (
typeid(sFloat) ==
typeid(short4)) {
339 cudaMemcpy(
out->Norm(), saveOutNorm,
out->NormBytes(), cudaMemcpyHostToDevice);
340 delete[] saveOutNorm;
345 std::stringstream ps;
346 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
351 long long flops()
const {
return 24ll *
in->VolumeCB(); }
352 long long bytes()
const {
return in->Bytes() +
in->NormBytes() +
out->Bytes() +
out->NormBytes(); }
360 dslashParam.threads = in->
Volume();
362 dslashParam.threads = in->
Volume() / 2;
364 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
368 #if (__COMPUTE_CAPABILITY__ >= 130)
371 errorQuda(
"Double precision not supported on this GPU");
384 errorQuda(
"Twisted mass dslash has not been built");
385 #endif // GPU_TWISTED_MASS_DIRAC
388 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
392 template <
typename cFloat,
typename sFloat>
395 const cFloat *clover;
397 const cFloat *cloverInv;
406 unsigned int sharedBytesPerThread()
const {
return 0; }
407 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
408 bool tuneGridDim()
const {
return false; }
409 unsigned int minThreads()
const {
return in->X(0) *
in->X(1) *
in->X(2) *
in->X(3); }
410 char *saveOut, *saveOutNorm;
415 cFloat *clov,
const float *cN, cFloat *clovInv,
const float *cN2,
int cl_stride) :
418 bindSpinorTex<sFloat>(
in);
419 dslashParam.sp_stride = in->
Stride();
420 #ifdef GPU_TWISTED_CLOVER_DIRAC
421 dslashParam.cl_stride = cl_stride;
422 dslashParam.fl_stride = in->
VolumeCB();
433 errorQuda(
"ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
437 unbindSpinorTex<sFloat>(
in);
441 return TuneKey(
in->VolString(),
typeid(*this).name(),
in->AuxString());
446 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
448 dim3 gridDim( (dslashParam.threads+tp.
block.x-1) / tp.
block.x, 1, 1);
452 ((sFloat*)
out->V(), (
float*)
out->Norm(), a,
453 (sFloat*)
in->V(), (
float*)
in->Norm(), dslashParam,
454 clover, cNorm, cloverInv, cNrm2);
457 ((sFloat*)
out->V(), (
float*)
out->Norm(), a,
458 (sFloat*)
in->V(), (
float*)
in->Norm(), dslashParam,
459 clover, cNorm, cloverInv, cNrm2);
461 errorQuda(
"ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
467 saveOut =
new char[
out->Bytes()];
468 cudaMemcpy(saveOut,
out->V(),
out->Bytes(), cudaMemcpyDeviceToHost);
469 if (
typeid(sFloat) ==
typeid(short4)) {
470 saveOutNorm =
new char[
out->NormBytes()];
471 cudaMemcpy(saveOutNorm,
out->Norm(),
out->NormBytes(), cudaMemcpyDeviceToHost);
476 cudaMemcpy(
out->V(), saveOut,
out->Bytes(), cudaMemcpyHostToDevice);
478 if (
typeid(sFloat) ==
typeid(short4)) {
479 cudaMemcpy(
out->Norm(), saveOutNorm,
out->NormBytes(), cudaMemcpyHostToDevice);
480 delete[] saveOutNorm;
485 std::stringstream ps;
486 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
491 long long flops()
const {
return 24ll *
in->VolumeCB(); }
492 long long bytes()
const {
return in->Bytes() +
in->NormBytes() +
out->Bytes() +
out->NormBytes(); }
499 dslashParam.threads = in->
Volume();
501 errorQuda(
"Twisted doublet not supported in twisted clover dslash");
503 #ifdef GPU_TWISTED_CLOVER_DIRAC
506 void *clover, *cNorm, *cloverInv, *cNorm2;
510 errorQuda(
"ERROR: Clover precision and spinor precision do not match\n");
513 errorQuda(
"clover and cloverInv must have matching strides (%d != %d)", clov->
stride, clovInv->
stride);
517 #if (__COMPUTE_CAPABILITY__ >= 130)
519 (
out,
in,
kappa,
mu, epsilon,
dagger, twist, (double2 *) clover, (
float *) cNorm, (double2 *) cloverInv, (
float *) cNorm2, clov->
stride);
521 errorQuda(
"Double precision not supported on this GPU");
525 (
out,
in,
kappa,
mu, epsilon,
dagger, twist, (float4 *) clover, (
float *) cNorm, (float4 *) cloverInv, (
float *) cNorm2, clov->
stride);
528 (
out,
in,
kappa,
mu, epsilon,
dagger, twist, (short4 *) clover, (
float *) cNorm, (short4 *) cloverInv, (
float *) cNorm2, clov->
stride);
537 errorQuda(
"Twisted clover dslash has not been built");
538 #endif // GPU_TWISTED_MASS_DIRAC
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, void **cloverP, void **cloverNormP, void **cloverInvP, void **cloverInvNormP)
enum QudaPrecision_s QudaPrecision
__global__ void twistCloverGamma5InvKernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
virtual TuneKey tuneKey() const
std::string paramString(const TuneParam ¶m) const
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
TwistCloverGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type tw, cFloat *clov, const float *cN, cFloat *clovInv, const float *cN2, int cl_stride)
virtual ~TwistCloverGamma5Cuda()
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b, const float4 *in, const float *null2, DslashParam param)
std::string paramString(const TuneParam ¶m) const
cudaEvent_t scatterStart[Nstream]
__global__ void twistCloverGamma5Kernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
unsigned int minThreads() const
VOLATILE spinorFloat kappa
void apply(const cudaStream_t &stream)
cudaEvent_t packEnd[Nstream]
void cloverCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover clover, const cudaColorSpinorField *in, const int oddBit)
void createDslashEvents()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void setTwistParam(double &a, double &b, const double &kappa, const double &mu, const int dagger, const QudaTwistGamma5Type twist)
std::string paramString(const TuneParam ¶m) const
void apply(const cudaStream_t &stream)
void unbindCloverTex(const FullClover clover)
TwistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type twist)
CloverCuda(cudaColorSpinorField *out, const cFloat *clover, const float *cloverNorm, int cl_stride, const cudaColorSpinorField *in)
cudaEvent_t gatherEnd[Nstream]
void twistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type)
ndeg tm:
void setTwistPack(bool pack)
#define CLOVER_SHARED_FLOATS_PER_THREAD
void twistCloverGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type twist, const FullClover *clov, const FullClover *clovInv, const int parity)
cpuColorSpinorField * out
QudaPrecision Precision() const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
QudaTwistFlavorType TwistFlavor() const
unsigned int sharedBytesPerThread() const
void setKernelPackT(bool pack)
QudaPrecision bindCloverTex(const FullClover clover, const int oddBit, void **cloverP, void **cloverNormP)
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
void unbindTwistedCloverTex(const FullClover clover)
cudaEvent_t scatterEnd[Nstream]
void destroyDslashEvents()
void apply(const cudaStream_t &stream)
virtual ~TwistGamma5Cuda()
void twistGamma5(sFloat *out, sFloat *in, const int dagger, const sFloat kappa, const sFloat mu, const QudaTwistFlavorType flavor, const int V, QudaTwistGamma5Type twist)
cudaEvent_t gatherStart[Nstream]
virtual void apply(const cudaStream_t &stream)=0