10 #ifdef GPU_WILSON_DIRAC 17 #endif // GPU_WILSON_DIRAC 20 #ifdef GPU_STAGGERED_DIRAC 21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only 31 #define DIRECT_ACCESS_SPINOR 36 #endif // GPU_STAGGERED_DIRAC 67 cudaEvent_t interiorDslashEnd;
80 #if CUDA_VERSION >= 8000 91 cudaEventCreateWithFlags(&
gatherStart[
i], cudaEventDisableTiming);
92 cudaEventCreateWithFlags(&
gatherEnd[
i], cudaEventDisableTiming);
93 cudaEventCreateWithFlags(&
scatterStart[
i], cudaEventDisableTiming);
94 cudaEventCreateWithFlags(&
scatterEnd[
i], cudaEventDisableTiming);
96 for (
int i=0;
i<2;
i++) {
97 cudaEventCreateWithFlags(&
packEnd[
i], cudaEventDisableTiming);
98 cudaEventCreateWithFlags(&
dslashStart[
i], cudaEventDisableTiming);
101 cudaEventCreateWithFlags(&interiorDslashEnd, cudaEventDisableTiming);
106 #if CUDA_VERSION >= 8000 122 #if CUDA_VERSION >= 8000 134 for (
int i=0;
i<2;
i++) {
139 cudaEventDestroy(interiorDslashEnd);
148 template <
typename Float,
int nColor>
170 if (d < 0 || d > 4)
errorQuda(
"Undefined gamma matrix %d",
d);
171 if (
in.Nspin() != 4)
errorQuda(
"Cannot apply gamma5 to nSpin=%d field",
in.Nspin());
172 if (!
in.isNative() || !
out.isNative())
errorQuda(
"Unsupported field order out=%d in=%d\n",
out.FieldOrder(),
in.FieldOrder());
180 b = 1.0 / (1.0 +
a*
a);
186 b = 2.0 *
kappa * epsilon;
187 c = 1.0 / (1.0 +
a*
a -
b*
b);
188 if (
c<=0)
errorQuda(
"Invalid twisted mass parameters (kappa=%e, mu=%e, epsilon=%e)\n",
kappa,
mu, epsilon);
194 template <
typename Float,
int nColor,
typename Arg>
200 for (
int x_cb = 0; x_cb <
arg.volumeCB; x_cb++) {
209 template <
typename Float,
int nColor,
int d,
typename Arg>
213 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
216 if (x_cb >=
arg.volumeCB)
return;
223 template <
typename Float,
int nColor,
typename Arg>
224 class Gamma :
public TunableVectorY {
230 long long flops()
const {
return 0; }
231 long long bytes()
const {
return arg.out.Bytes() +
arg.in.Bytes(); }
244 gammaCPU<Float,nColor>(
arg);
261 template <
typename Float,
int nColor>
270 template <
typename Float>
274 ApplyGamma<Float,3>(
out,
in,
d);
288 ApplyGamma<double>(
out,
in,
d);
290 ApplyGamma<float>(
out,
in,
d);
292 ApplyGamma<short>(
out,
in,
d);
299 template <
bool doublet,
typename Float,
int nColor,
typename Arg>
304 for (
int x_cb = 0; x_cb <
arg.volumeCB; x_cb++) {
320 template <
bool doublet,
typename Float,
int nColor,
int d,
typename Arg>
324 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
326 if (x_cb >=
arg.volumeCB)
return;
339 template <
typename Float,
int nColor,
typename Arg>
346 long long flops()
const {
return 0; }
347 long long bytes()
const {
return arg.out.Bytes() +
arg.in.Bytes(); }
360 if (
arg.doublet) twistGammaCPU<true,Float,nColor>(
arg);
361 twistGammaCPU<false,Float,nColor>(
arg);
383 template <
typename Float,
int nColor>
394 template <
typename Float>
411 #ifdef GPU_TWISTED_MASS_DIRAC 422 errorQuda(
"Twisted mass dslash has not been built");
423 #endif // GPU_TWISTED_MASS_DIRAC 436 template <
typename Float,
int nSpin,
int nColor,
bool dynamic_clover_=false>
474 b = 1.0 / (1.0 +
a*
a);
479 errorQuda(
"ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
484 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
487 int spinor_parity =
arg.nParity == 2 ?
parity : 0;
494 for (
int chirality=0; chirality<2; chirality++) {
496 out += (A *
in.chiral_project(chirality)).chiral_reconstruct(chirality);
501 arg.out(x_cb, spinor_parity) =
out;
504 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
508 for (
int x_cb=0; x_cb<
arg.volumeCB; x_cb++) cloverApply<Float,nSpin,nColor>(
arg, x_cb,
parity);
512 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
514 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
516 if (x_cb >=
arg.volumeCB)
return;
517 cloverApply<Float,nSpin,nColor>(
arg, x_cb,
parity);
520 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
528 long long flops()
const {
return arg.nParity*
arg.volumeCB*504ll; }
529 long long bytes()
const {
return arg.out.Bytes() +
arg.in.Bytes() +
arg.nParity*
arg.volumeCB*
arg.clover.Bytes(); }
544 cloverCPU<Float,nSpin,nColor>(
arg);
556 template <
typename Float,
int nColor>
560 constexpr
int nSpin = 4;
569 template <
typename Float>
586 #ifdef GPU_CLOVER_DIRAC 597 errorQuda(
"Clover dslash has not been built");
598 #endif // GPU_TWISTED_MASS_DIRAC 603 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
605 using namespace linalg;
606 constexpr
int N =
nColor*nSpin/2;
611 int spinor_parity =
arg.nParity == 2 ?
parity : 0;
618 for (
int chirality=0; chirality<2; chirality++) {
620 const complex<RegType> j(0.0, chirality == 0 ? static_cast<RegType>(0.5) : -static_cast<RegType>(0.5));
624 HalfSpinor in_chi =
in.chiral_project(chirality);
625 HalfSpinor out_chi = A*in_chi + j*
arg.a*in_chi;
628 if (
arg.dynamic_clover) {
630 A2 +=
arg.a*
arg.a*
static_cast<RegType
>(0.25);
631 Cholesky<HMatrix,RegType,N> cholesky(A2);
632 out_chi =
static_cast<RegType
>(0.25)*cholesky.backward(cholesky.forward(out_chi));
635 out_chi =
static_cast<RegType
>(2.0)*(Ainv*out_chi);
639 out += (out_chi).chiral_reconstruct(chirality);
644 arg.out(x_cb, spinor_parity) =
out;
647 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
651 for (
int x_cb=0; x_cb<
arg.volumeCB; x_cb++) twistCloverApply<inverse,Float,nSpin,nColor>(
arg, x_cb,
parity);
655 template <
bool inverse,
typename Float,
int nSpin,
int nColor,
typename Arg>
657 int x_cb = blockIdx.x*
blockDim.x + threadIdx.x;
659 if (x_cb >=
arg.volumeCB)
return;
660 twistCloverApply<inverse,Float,nSpin,nColor>(
arg, x_cb,
parity);
663 template <
typename Float,
int nSpin,
int nColor,
typename Arg>
671 long long flops()
const {
return (
arg.inverse ? 1056ll : 552ll) *
arg.nParity*
arg.volumeCB; }
673 long long rtn =
arg.out.Bytes() +
arg.in.Bytes() +
arg.nParity*
arg.volumeCB*
arg.clover.Bytes();
675 rtn +=
arg.nParity*
arg.volumeCB*
arg.cloverInv.Bytes();
693 if (
arg.inverse) twistCloverCPU<true,Float,nSpin,nColor>(
arg);
694 else twistCloverCPU<false,Float,nSpin,nColor>(
arg);
707 template <
typename Float,
int nColor>
712 constexpr
int nSpin = 4;
715 #ifdef DYNAMIC_CLOVER 716 constexpr
bool dynamic_clover =
true;
718 constexpr
bool dynamic_clover =
false;
721 CloverArg<Float,nSpin,nColor,dynamic_clover> arg(
out,
in,
clover, inverse,
parity,
kappa,
mu, epsilon,
dagger,
twist);
729 template <
typename Float>
747 #ifdef GPU_CLOVER_DIRAC 758 errorQuda(
"Clover dslash has not been built");
759 #endif // GPU_TWISTED_MASS_DIRAC
__global__ void gammaGPU(Arg arg)
colorspinor_mapper< Float, 4, nColor >::type F
const ColorSpinorField & meta
const char * AuxString() const
QudaVerbosity getVerbosity()
const ColorSpinorField & meta
mapper< Float >::type RegType
#define checkPrecision(...)
void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
__device__ __host__ void twistCloverApply(Arg &arg, int x_cb, int parity)
unsigned int minThreads() const
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
static constexpr bool dynamic_clover
Main header file for host and device accessors to CloverFields.
unsigned int minThreads() const
cudaEvent_t scatterStart[Nstream]
GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Clover(Arg &arg, const ColorSpinorField &meta)
mapper< Float >::type RegType
VOLATILE spinorFloat kappa
void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
Apply the twisted-mass gamma operator to a color-spinor field.
Gamma(Arg &arg, const ColorSpinorField &meta)
void apply(const cudaStream_t &stream)
for(int s=0;s< param.dc.Ls;s++)
void createDslashEvents()
void twistCloverCPU(Arg &arg)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
clover_mapper< Float, length >::type C
#define checkLocation(...)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
void apply(const cudaStream_t &stream)
TwistClover(Arg &arg, const ColorSpinorField &meta)
const ColorSpinorField & meta
cudaEvent_t gatherEnd[Nstream]
double gamma(double) __attribute__((availability(macosx
unsigned int minThreads() const
const ColorSpinorField & meta
QudaFieldLocation Location() const
static constexpr int length
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
TwistGamma(Arg &arg, const ColorSpinorField &meta)
cpuColorSpinorField * out
__device__ __host__ void cloverApply(Arg &arg, int x_cb, int parity)
void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
Apply clover-matrix field to a color-spinor field.
Parameteter structure for driving the clover and twist-clover application kernels.
__global__ void twistCloverGPU(Arg arg)
colorspinor_mapper< Float, nSpin, nColor >::type F
unsigned int minThreads() const
void apply(const cudaStream_t &stream)
QudaTwistGamma5Type twist
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void setKernelPackT(bool pack)
void twistGammaCPU(Arg arg)
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
#define mapped_malloc(size)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const
cudaEvent_t scatterEnd[Nstream]
__global__ void twistGammaGPU(Arg arg)
void apply(const cudaStream_t &stream)
__global__ void cloverGPU(Arg arg)
void destroyDslashEvents()
cudaEvent_t dslashStart[2]
cudaEvent_t gatherStart[Nstream]
void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
Apply twisted clover-matrix field to a color-spinor field.
Parameter structure for driving the Gamma operator.
CUdeviceptr commsEnd_d[Nstream]