35 if (kptstack.size() > 10)
37 warningQuda(
"KernelPackT stack contains %u elements. Is there a missing popKernelPackT() somewhere?",
38 static_cast<unsigned int>(kptstack.size()));
46 errorQuda(
"popKernelPackT() called with empty stack");
89 #if CUDA_VERSION >= 8000 90 cuuint32_t *commsEnd_h;
91 CUdeviceptr commsEnd_d[
Nstream];
97 using namespace dslash;
100 cudaEventCreateWithFlags(&
gatherStart[i], cudaEventDisableTiming);
101 cudaEventCreateWithFlags(&
gatherEnd[i], cudaEventDisableTiming);
102 cudaEventCreateWithFlags(&
scatterStart[i], cudaEventDisableTiming);
103 cudaEventCreateWithFlags(&
scatterEnd[i], cudaEventDisableTiming);
105 for (
int i=0; i<2; i++) {
106 cudaEventCreateWithFlags(&
packEnd[i], cudaEventDisableTiming);
107 cudaEventCreateWithFlags(&
dslashStart[i], cudaEventDisableTiming);
112 #if CUDA_VERSION >= 8000 113 commsEnd_h =
static_cast<cuuint32_t*
>(
mapped_malloc(Nstream*
sizeof(
int)));
114 for (
int i=0; i<
Nstream; i++) {
115 cudaHostGetDevicePointer((
void**)&commsEnd_d[i], commsEnd_h+i, 0);
133 policies = std::vector<QudaDslashPolicy>(
146 using namespace dslash;
148 #if CUDA_VERSION >= 8000 153 for (
int i=0; i<
Nstream; i++) {
160 for (
int i=0; i<2; i++) {
171 template <
typename Float,
int nColor>
189 : out(out), in(in), d(d), nParity(in.SiteSubset()),
191 volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0)
193 if (d < 0 || d > 4)
errorQuda(
"Undefined gamma matrix %d", d);
203 a = 1.0 / (1.0 + b * b);
207 }
else if (doublet) {
215 a = 1.0 / (1.0 + b * b - c * c);
216 if (a <= 0)
errorQuda(
"Invalid twisted mass parameters (kappa=%e, mu=%e, epsilon=%e)\n",
kappa, mu, epsilon);
224 template <
typename Float,
int nColor,
typename Arg>
230 for (
int x_cb = 0; x_cb < arg.
volumeCB; x_cb++) {
232 arg.out(x_cb,
parity) = in.gamma(arg.d);
239 template <
typename Float,
int nColor,
int d,
typename Arg>
243 int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
244 int parity = blockDim.y*blockIdx.y + threadIdx.y;
247 if (parity >= arg.
nParity)
return;
250 arg.out(x_cb, parity) = in.gamma(d);
253 template <
typename Float,
int nColor,
typename Arg>
260 long long flops()
const {
return 0; }
261 long long bytes()
const {
return arg.out.Bytes() + arg.in.Bytes(); }
274 gammaCPU<Float,nColor>(
arg);
279 default:
errorQuda(
"%d not instantiated", arg.d);
291 template <
typename Float,
int nColor>
300 template <
typename Float>
304 ApplyGamma<Float,3>(
out,
in, d);
318 ApplyGamma<double>(
out,
in, d);
320 ApplyGamma<float>(
out,
in, d);
322 ApplyGamma<short>(
out,
in, d);
324 ApplyGamma<char>(
out,
in, d);
331 template <
bool doublet,
typename Float,
int nColor,
typename Arg>
336 for (
int x_cb = 0; x_cb < arg.
volumeCB; x_cb++) {
339 arg.out(x_cb,
parity) = arg.a * (in + arg.b * in.igamma(arg.d));
343 arg.out(x_cb + 0 * arg.
volumeCB,
parity) = arg.a * (in_1 + arg.b * in_1.igamma(arg.d) + arg.c * in_2);
344 arg.out(x_cb + 1 * arg.
volumeCB,
parity) = arg.a * (in_2 - arg.b * in_2.igamma(arg.d) + arg.c * in_1);
352 template <
bool doublet,
typename Float,
int nColor,
int d,
typename Arg>
356 int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
357 int parity = blockDim.y*blockIdx.y + threadIdx.y;
362 arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(d));
366 arg.out(x_cb + 0 * arg.
volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(d) + arg.c * in_2);
367 arg.out(x_cb + 1 * arg.
volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(d) + arg.c * in_1);
371 template <
typename Float,
int nColor,
typename Arg>
378 long long flops()
const {
return 0; }
379 long long bytes()
const {
return arg.out.Bytes() + arg.in.Bytes(); }
392 if (arg.doublet) twistGammaCPU<true,Float,nColor>(
arg);
393 twistGammaCPU<false,Float,nColor>(
arg);
399 default:
errorQuda(
"%d not instantiated", arg.d);
404 default:
errorQuda(
"%d not instantiated", arg.d);
410 void preTune() {
if (arg.out.field == arg.in.field) arg.out.save(); }
411 void postTune() {
if (arg.out.field == arg.in.field) arg.out.load(); }
415 template <
typename Float,
int nColor>
426 template <
typename Float>
443 #ifdef GPU_TWISTED_MASS_DIRAC 456 errorQuda(
"Twisted mass dslash has not been built");
457 #endif // GPU_TWISTED_MASS_DIRAC 470 template <
typename Float,
int nSpin,
int nColor,
bool dynamic_clover_=false>
473 static constexpr
bool dynamic_clover = dynamic_clover_;
494 bool inverse,
int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
498 in(in), nParity(in.SiteSubset()), parity(parity), inverse(inverse),
500 volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0), twist(twist)
504 a = 2.0 * kappa *
mu;
507 a = -2.0 * kappa *
mu;
508 b = 1.0 / (1.0 + a*a);
511 if (dagger) a *= -1.0;
512 }
else if (doublet) {
513 errorQuda(
"ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
518 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
520 using namespace linalg;
525 Spinor in = arg.in(x_cb, spinor_parity);
531 for (
int chirality=0; chirality<2; chirality++) {
533 HMatrix<RegType,nColor*nSpin/2> A = arg.clover(x_cb, parity, chirality);
534 HalfSpinor chi = in.chiral_project(chirality);
536 if (arg.dynamic_clover) {
537 Cholesky<
HMatrix, RegType, nColor * nSpin / 2> cholesky(A);
538 chi =
static_cast<RegType
>(0.25) * cholesky.backward(cholesky.forward(chi));
543 out += chi.chiral_reconstruct(chirality);
548 arg.out(x_cb, spinor_parity) =
out;
551 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
555 for (
int x_cb=0; x_cb<arg.
volumeCB; x_cb++) cloverApply<Float,nSpin,nColor>(arg, x_cb,
parity);
559 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
561 int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
562 int parity = (arg.
nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
564 cloverApply<Float,nSpin,nColor>(
arg, x_cb,
parity);
567 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
576 long long bytes()
const {
return arg.out.Bytes() + arg.in.Bytes() + arg.
nParity*arg.
volumeCB*arg.clover.Bytes(); }
591 cloverCPU<Float,nSpin,nColor>(
arg);
598 void preTune() {
if (arg.out.field == arg.in.field) arg.out.save(); }
599 void postTune() {
if (arg.out.field == arg.in.field) arg.out.load(); }
603 template <
typename Float,
int nColor>
607 constexpr
int nSpin = 4;
610 #ifdef DYNAMIC_CLOVER 611 constexpr
bool dynamic_clover =
true;
613 constexpr
bool dynamic_clover =
false;
628 template <
typename Float>
645 #ifdef GPU_CLOVER_DIRAC 658 errorQuda(
"Clover dslash has not been built");
659 #endif // GPU_TWISTED_MASS_DIRAC 664 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
666 using namespace linalg;
667 constexpr
int N =
nColor*nSpin/2;
673 Spinor in = arg.in(x_cb, spinor_parity);
679 for (
int chirality=0; chirality<2; chirality++) {
681 const complex<RegType> j(0.0, chirality == 0 ? static_cast<RegType>(0.5) : -static_cast<RegType>(0.5));
683 Mat A = arg.clover(x_cb, parity, chirality);
685 HalfSpinor in_chi = in.chiral_project(chirality);
686 HalfSpinor out_chi = A*in_chi + j*arg.a*in_chi;
689 if (arg.dynamic_clover) {
691 A2 += arg.a*arg.a*
static_cast<RegType
>(0.25);
692 Cholesky<HMatrix,RegType,N> cholesky(A2);
693 out_chi =
static_cast<RegType
>(0.25)*cholesky.backward(cholesky.forward(out_chi));
695 Mat Ainv = arg.cloverInv(x_cb, parity, chirality);
696 out_chi =
static_cast<RegType
>(2.0)*(Ainv*out_chi);
700 out += (out_chi).chiral_reconstruct(chirality);
705 arg.out(x_cb, spinor_parity) =
out;
708 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
710 for (
int parity=0; parity<arg.
nParity; parity++) {
711 parity = (arg.
nParity == 2) ? parity : arg.parity;
712 for (
int x_cb=0; x_cb<arg.
volumeCB; x_cb++) twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
716 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
718 int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
719 int parity = (arg.
nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
721 twistCloverApply<inverse,Float,nSpin,nColor>(
arg, x_cb,
parity);
724 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
734 long long rtn = arg.out.Bytes() + arg.in.Bytes() + arg.
nParity*arg.
volumeCB*arg.clover.Bytes();
746 strcat(aux, arg.inverse ?
",inverse" :
",direct");
754 if (arg.inverse) twistCloverCPU<true,Float,nSpin,nColor>(
arg);
755 else twistCloverCPU<false,Float,nSpin,nColor>(
arg);
763 void preTune() {
if (arg.out.field == arg.in.field) arg.out.save(); }
764 void postTune() {
if (arg.out.field == arg.in.field) arg.out.load(); }
768 template <
typename Float,
int nColor>
770 double kappa,
double mu,
double epsilon,
int parity,
int dagger,
QudaTwistGamma5Type twist)
773 constexpr
int nSpin = 4;
776 #ifdef DYNAMIC_CLOVER 777 constexpr
bool dynamic_clover =
true;
779 constexpr
bool dynamic_clover =
false;
782 CloverArg<Float,nSpin,nColor,dynamic_clover> arg(out, in, clover, inverse, parity, kappa, mu, epsilon, dagger, twist);
790 template <
typename Float>
792 double kappa,
double mu,
double epsilon,
int parity,
int dagger,
QudaTwistGamma5Type twist)
803 double kappa,
double mu,
double epsilon,
int parity,
int dagger,
QudaTwistGamma5Type twist)
808 #ifdef GPU_CLOVER_DIRAC 821 errorQuda(
"Clover dslash has not been built");
822 #endif // GPU_TWISTED_MASS_DIRAC bool dslash_exterior_compute
__global__ void gammaGPU(Arg arg)
colorspinor_mapper< Float, 4, nColor >::type F
bool dslash_interior_compute
cudaEvent_t scatterStart[Nstream]
cudaEvent_t gatherStart[Nstream]
const ColorSpinorField & meta
const char * AuxString() const
QudaVerbosity getVerbosity()
const ColorSpinorField & meta
mapper< Float >::type RegType
#define checkPrecision(...)
void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
__device__ __host__ void twistCloverApply(Arg &arg, int x_cb, int parity)
unsigned int minThreads() const
__device__ __host__ complex< ValueType > apply(int row, const complex< ValueType > &a) const
const char * VolString() const
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
Main header file for host and device accessors to CloverFields.
unsigned int minThreads() const
cudaEvent_t dslashStart[2]
std::vector< QudaP2PPolicy > p2p_policies
char policy_string[TuneKey::aux_n]
std::vector< QudaDslashPolicy > policies
GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Clover(Arg &arg, const ColorSpinorField &meta)
mapper< Float >::type RegType
void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
Apply the twisted-mass gamma operator to a color-spinor field.
Gamma(Arg &arg, const ColorSpinorField &meta)
void apply(const cudaStream_t &stream)
void createDslashEvents()
void twistCloverCPU(Arg &arg)
static std::stack< bool > kptstack
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
clover_mapper< Float, length >::type C
#define checkLocation(...)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
void apply(const cudaStream_t &stream)
TwistClover(Arg &arg, const ColorSpinorField &meta)
const ColorSpinorField & meta
unsigned int minThreads() const
int first_active_p2p_policy
const ColorSpinorField & meta
QudaFieldLocation Location() const
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
TwistGamma(Arg &arg, const ColorSpinorField &meta)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
cpuColorSpinorField * out
__device__ __host__ void cloverApply(Arg &arg, int x_cb, int parity)
void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
Apply clover-matrix field to a color-spinor field.
Parameteter structure for driving the clover and twist-clover application kernels.
__global__ void twistCloverGPU(Arg arg)
colorspinor_mapper< Float, nSpin, nColor >::type F
unsigned int minThreads() const
void apply(const cudaStream_t &stream)
cudaEvent_t scatterEnd[Nstream]
QudaTwistFlavorType TwistFlavor() const
QudaTwistGamma5Type twist
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void setKernelPackT(bool pack)
void twistGammaCPU(Arg arg)
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
void pushKernelPackT(bool pack)
#define mapped_malloc(size)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__global__ void twistGammaGPU(Arg arg)
void apply(const cudaStream_t &stream)
__global__ void cloverGPU(Arg arg)
void destroyDslashEvents()
QudaFieldOrder FieldOrder() const
cudaEvent_t gatherEnd[Nstream]
void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
Apply twisted clover-matrix field to a color-spinor field.
Parameter structure for driving the Gamma operator.