1 #include <gauge_field.h>
2 #include <color_spinor_field.h>
3 #include <clover_field.h>
7 #include <dslash_policy.cuh>
8 #include <kernels/dslash_wilson_clover_preconditioned.cuh>
11 This is the Wilson-clover preconditioned linear operator
// Driver class for the preconditioned Wilson-clover dslash. It derives from
// Dslash<wilsonCloverPreconditioned, Arg> and supplies the kernel dispatch
// (apply) plus flop and byte counts that add the clover-term cost on top of the
// base-class figures.
// NOTE(review): several lines of this class (braces, else branches, case
// labels, return statements) are not visible in this view; the comments below
// describe only the statements that are visible.
17 template <typename Arg> class WilsonCloverPreconditioned : public Dslash<wilsonCloverPreconditioned, Arg>
// Local alias so the base class can be named without repeating its template arguments.
19 using Dslash = Dslash<wilsonCloverPreconditioned, Arg>;
// Constructor: forwards the kernel argument struct and the output/input fields
// unchanged to the base-class constructor.
24 WilsonCloverPreconditioned(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) : Dslash(arg, out, in)
// Launches the kernel on `stream`: obtains launch parameters from tuneLaunch and
// then selects one Dslash::instantiate specialization from the run-time flags in arg.
28 void apply(const qudaStream_t &stream)
30 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
32 // specialize here to constrain the template instantiation
// Only nParity == 1 is dispatched; the errorQuda at the end of this function
// reports any other value.
33 if (arg.nParity == 1) {
// NOTE(review): the condition guarding the next two lines is not visible here;
// judging by the error message it is presumably the xpay flag - confirm.
// The dagger + xpay combination is rejected with an error.
35 if (arg.dagger) errorQuda("xpay operator only defined for not dagger");
// Template arguments <packShmem, 1, false, true>; presumably
// <pack policy, nParity, dagger, xpay> - TODO confirm against Dslash::instantiate.
36 Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
// Remaining path: the third template argument is true here and false on the
// next line, presumably mirroring arg.dagger (the selecting if/else is not visible).
39 Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
41 Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
44 errorQuda("Preconditioned Wilson-clover operator not defined nParity=%d", arg.nParity);
// Flop count for the current kernel type: starts from Dslash::flops() and adds
// clover_flops for each site the kernel is counted as touching.
48 long long flops() const
// Per-site cost attributed to the clover term (504; derivation not shown here).
// NOTE(review): this is an int, so the products below may be evaluated in int
// (depending on the return types of GhostFace()/Volume(), not visible here) and
// could overflow before being added to the long long total - consider widening.
50 int clover_flops = 504;
51 long long flops = Dslash::flops();
52 switch (arg.kernel_type) {
// Single-direction exterior kernels: 2 * GhostFace()[kernel_type] sites are
// counted. kernel_type is used directly as the dimension index.
53 case EXTERIOR_KERNEL_X:
54 case EXTERIOR_KERNEL_Y:
55 case EXTERIOR_KERNEL_Z:
56 case EXTERIOR_KERNEL_T: flops += clover_flops * 2 * in.GhostFace()[arg.kernel_type]; break;
// Fused exterior kernel: twice the sum of the ghost faces of all four dimensions.
57 case EXTERIOR_KERNEL_ALL:
58 flops += clover_flops * 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
// NOTE(review): the case label(s) for the branch below are not visible in this view.
// Full-volume contribution.
63 flops += clover_flops * in.Volume();
// For KERNEL_POLICY the full-volume figure is kept as is.
65 if (arg.kernel_type == KERNEL_POLICY) break;
66 // now correct for flops done by exterior kernel
// Sum 2 * GhostFace()[d] over the dimensions flagged in arg.commDim and
// subtract the corresponding clover flops from the total.
67 long long ghost_sites = 0;
68 for (int d = 0; d < 4; d++)
69 if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
70 flops -= clover_flops * ghost_sites;
// Byte count for the current kernel type; same structure as flops() but with a
// per-site clover byte figure added to Dslash::bytes().
77 long long bytes() const
// Per-site clover traffic: 72 * in.Precision(), plus 2 * sizeof(float) when
// Arg::Float is a fixed-point type according to isFixed.
// Presumably 72 real numbers per site and two float norms for fixed-point
// storage - TODO confirm.
// NOTE(review): int-typed, same potential overflow concern as clover_flops in flops().
79 int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
81 long long bytes = Dslash::bytes();
82 switch (arg.kernel_type) {
// Single-direction exterior kernels: 2 * GhostFace()[kernel_type] sites.
83 case EXTERIOR_KERNEL_X:
84 case EXTERIOR_KERNEL_Y:
85 case EXTERIOR_KERNEL_Z:
86 case EXTERIOR_KERNEL_T: bytes += clover_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
// Fused exterior kernel: twice the sum of the ghost faces of all four dimensions.
87 case EXTERIOR_KERNEL_ALL:
88 bytes += clover_bytes * 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
// NOTE(review): the case label(s) for the branch below are not visible in this view.
// Full-volume contribution.
93 bytes += clover_bytes * in.Volume();
// For KERNEL_POLICY the full-volume figure is kept as is.
95 if (arg.kernel_type == KERNEL_POLICY) break;
96 // now correct for bytes done by exterior kernel
// Subtract the clover bytes for 2 * GhostFace()[d] sites in every dimension
// flagged in arg.commDim.
97 long long ghost_sites = 0;
98 for (int d = 0; d < 4; d++)
99 if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
100 bytes -= clover_bytes * ghost_sites;
// Functor-style struct templated on the floating-point type, the number of
// colors and the gauge reconstruction type. Its constructor assembles the kernel
// argument struct, the WilsonCloverPreconditioned driver and a
// DslashPolicyTune object for it.
// NOTE(review): the rest of the constructor body (e.g. any call on `policy`) is
// not visible in this view.
110 template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverPreconditionedApply {
// Parameters are passed straight through to WilsonCloverArg (out, in, U, A, a,
// x, parity, dagger, comm_override); profile is handed to the policy tuner.
112 inline WilsonCloverPreconditionedApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
113 const CloverField &A, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
114 TimeProfile &profile)
// The number of dimensions is fixed at 4 for this operator.
116 constexpr int nDim = 4;
117 WilsonCloverArg<Float, nColor, nDim, recon> arg(out, in, U, A, a, x, parity, dagger, comm_override);
118 WilsonCloverPreconditioned<decltype(arg)> wilson(arg, out, in);
// The policy tuner receives the input field as a non-const cudaColorSpinorField
// pointer (constness is cast away here), together with in.VolumeCB(),
// in.GhostFaceCB() and the profile.
// NOTE(review): the static_cast assumes `in` really is a cudaColorSpinorField -
// confirm this is guaranteed by the caller.
120 dslash::DslashPolicyTune<decltype(wilson)> policy(wilson,
121 const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
122 in.GhostFaceCB(), profile);
127 // Apply the preconditioned Wilson-clover operator
128 // out(x) = M*in = a * A(x)^{-1} (\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
129 // Uses the kappa normalization for the Wilson operator.
// NOTE(review): the formula above does not mention the field `x`; presumably it
// is the accumulation field used by the xpay variant - confirm against the kernel.
// Public entry point: forwards every argument unchanged to
// instantiate<WilsonCloverPreconditionedApply>, which presumably selects the
// Float / nColor / recon template parameters at run time - TODO confirm.
130 void ApplyWilsonCloverPreconditioned(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
131 const CloverField &A, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
132 TimeProfile &profile)
// The operator is only compiled in when GPU_CLOVER_DIRAC is defined.
134 #ifdef GPU_CLOVER_DIRAC
135 instantiate<WilsonCloverPreconditionedApply>(out, in, U, A, a, x, parity, dagger, comm_override, profile);
// Presumably the #else branch (the directive itself is not visible in this
// view): report that clover support was not built.
137 errorQuda("Clover dslash has not been built");