quda-ref/v1.0.0/dslash__quda_8cu_source.html

 #include <cstdlib>
 #include <cstdio>
 #include <string>
 #include <iostream>
 #include <stack>

 #include <color_spinor_field.h>
 #include <clover_field.h>
 #include <dslash_quda.h>
 #include <color_spinor_field_order.h>
 #include <clover_field_order.h>
 #include <index_helper.cuh>
 #include <color_spinor.h>
 #include <linalg.cuh>
 #include <dslash_policy.cuh>

 namespace quda {

   // these should not be namespaced!!
   // Flag selecting how the temporal ghost zones are packed: with a gather
   // kernel (true), as opposed to multiple calls to cudaMemcpy() (false).
   static bool kernelPackT = false;

   // Overwrite the current kernelPackT setting.
   void setKernelPackT(bool packT)
   {
     kernelPackT = packT;
   }

   // Report the current kernelPackT setting.
   bool getKernelPackT()
   {
     return kernelPackT;
   }

   // Previously active kernelPackT values, most recently saved on top.
   static std::stack<bool> kptstack;

   // Save the current kernelPackT value on the stack, then switch to packT.
   void pushKernelPackT(bool packT)
   {
     const bool previous = getKernelPackT();
     kptstack.push(previous);
     setKernelPackT(packT);

     // a deep stack hints at pushes that were never matched by a pop
     const auto depth = kptstack.size();
     if (depth > 10) {
       warningQuda("KernelPackT stack contains %u elements.  Is there a missing popKernelPackT() somewhere?",
                   static_cast<unsigned int>(depth));
     }
   }

   // Restore the kernelPackT value saved by the most recent pushKernelPackT().
   // Reports an error if there is nothing on the stack to restore.
   void popKernelPackT()
   {
     if (kptstack.empty()) errorQuda("popKernelPackT() called with empty stack");

     const bool saved = kptstack.top();
     kptstack.pop();
     setKernelPackT(saved);
   }

   // File-level state shared by the dslash machinery.  The events, flags and
   // policy lists below are (re)initialised by createDslashEvents().
   namespace dslash {
     // NOTE(review): not referenced anywhere else in this file; purpose is not visible here
     int it = 0;

     // CUDA events, created with timing disabled in createDslashEvents() and
     // released in destroyDslashEvents().  The Nstream-sized arrays hold one
     // event per stream index; packEnd and dslashStart hold two events each.
     cudaEvent_t packEnd[2];
     cudaEvent_t gatherStart[Nstream];
     cudaEvent_t gatherEnd[Nstream];
     cudaEvent_t scatterStart[Nstream];
     cudaEvent_t scatterEnd[Nstream];
     cudaEvent_t dslashStart[2];

     // these variables are used for benchmarking the dslash components in isolation
     // (all of them are set to true by createDslashEvents())
     bool dslash_pack_compute;
     bool dslash_interior_compute;
     bool dslash_exterior_compute;
     bool dslash_comms;
     bool dslash_copy;

     // whether the dslash policy tuner has been enabled
     bool dslash_policy_init;

     // used to keep track of which policy to start the autotuning
     int first_active_policy;
     int first_active_p2p_policy;

     // list of dslash policies that are enabled
     std::vector<QudaDslashPolicy> policies;

     // list of p2p policies that are enabled
     std::vector<QudaP2PPolicy> p2p_policies;

     // string used as a tunekey to ensure we retune if the dslash policy env changes
     char policy_string[TuneKey::aux_n];

     // FIX this is a hack from hell
     // Auxiliary work that can be done while waiting on comms to finish
     Worker *aux_worker;

 #if CUDA_VERSION >= 8000
     // Host buffer of Nstream counters obtained from mapped_malloc(), and the
     // device-side address of each counter as returned by
     // cudaHostGetDevicePointer() in createDslashEvents().
     cuuint32_t *commsEnd_h;
     CUdeviceptr commsEnd_d[Nstream];
 #endif
   }

   // Create the CUDA events used by the dslash machinery and reset the
   // file-level dslash state (benchmark switches, policy-tuning state and the
   // auxiliary worker pointer).  Counterpart of destroyDslashEvents().
   void createDslashEvents()
   {
     using namespace dslash;
     // add cudaEventDisableTiming for lower sync overhead
     for (int i=0; i<Nstream; i++) {
       cudaEventCreateWithFlags(&gatherStart[i], cudaEventDisableTiming);
       cudaEventCreateWithFlags(&gatherEnd[i], cudaEventDisableTiming);
       cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
       cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
     }
     for (int i=0; i<2; i++) {
       cudaEventCreateWithFlags(&packEnd[i], cudaEventDisableTiming);
       cudaEventCreateWithFlags(&dslashStart[i], cudaEventDisableTiming);
     }

     aux_worker = NULL;

 #if CUDA_VERSION >= 8000
     // one host-mapped counter per stream; size the buffer with the element
     // type that is actually stored in it
     commsEnd_h = static_cast<cuuint32_t*>(mapped_malloc(Nstream*sizeof(cuuint32_t)));
     for (int i=0; i<Nstream; i++) {
       cudaHostGetDevicePointer((void**)&commsEnd_d[i], commsEnd_h+i, 0);
       commsEnd_h[i] = 0;
     }
 #endif

     // reports any error raised by the event creation / pointer queries above
     checkCudaError();

     dslash_pack_compute = true;
     dslash_interior_compute = true;
     dslash_exterior_compute = true;
     dslash_comms = true;
     dslash_copy = true;

     dslash_policy_init = false;
     first_active_policy = 0;
     first_active_p2p_policy = 0;

     // list of dslash policies that are enabled
     // (one slot per policy value below DISABLED, every slot starting out as DISABLED)
     policies = std::vector<QudaDslashPolicy>(
         static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED), QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED);

     // list of p2p policies that are enabled
     p2p_policies = std::vector<QudaP2PPolicy>(
         static_cast<int>(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED), QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED);

     // Start the policy tune-key string afresh.  The policy state is reset
     // above, so the string must not carry over what an earlier call (or any
     // later code appending to it) left behind: strcat() would keep growing
     // the fixed-size buffer on every repeated call.  On the first call the
     // zero-initialised global is empty and the result is the same.
     strcpy(policy_string, ",pol=");
   }


   // Release everything acquired by createDslashEvents(): the host-mapped
   // comms counters (CUDA >= 8) and all dslash events.
   void destroyDslashEvents()
   {
     using namespace dslash;

 #if CUDA_VERSION >= 8000
     host_free(commsEnd_h);
     commsEnd_h = nullptr;
 #endif

     // per-stream events
     for (int s = 0; s < Nstream; ++s) {
       cudaEventDestroy(gatherStart[s]);
       cudaEventDestroy(gatherEnd[s]);
       cudaEventDestroy(scatterStart[s]);
       cudaEventDestroy(scatterEnd[s]);
     }

     // the two pack/dslash event pairs
     for (int k = 0; k < 2; ++k) {
       cudaEventDestroy(packEnd[k]);
       cudaEventDestroy(dslashStart[k]);
     }

     checkCudaError();
   }

   // Kernel argument bundle for the gamma and twisted-gamma kernels: field
   // accessors, problem dimensions and the three coefficients a, b, c.
   template <typename Float, int nColor>
   struct GammaArg {
     typedef typename colorspinor_mapper<Float,4,nColor>::type F;
     typedef typename mapper<Float>::type RegType;

     F out;                // output vector field
     const F in;           // input vector field
     const int d;          // which gamma matrix are we applying
     const int nParity;    // number of parities we're working on
     bool doublet;         // whether we applying the operator to a doublet
     const int volumeCB;   // checkerboarded volume
     RegType a;            // scale factor
     RegType b;            // chiral twist
     RegType c;            // flavor twist

     // Validates the request (gamma index, spin, field order) and derives the
     // coefficients from kappa, mu, epsilon, dagger and the twist type.  With
     // the default twist (QUDA_TWIST_GAMMA5_INVALID) a, b and c remain zero.
     GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d,
              RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
              bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
       : out(out), in(in), d(d), nParity(in.SiteSubset()),
         doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
         volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0)
     {
       if (d < 0 || d > 4) errorQuda("Undefined gamma matrix %d", d);
       if (in.Nspin() != 4) errorQuda("Cannot apply gamma5 to nSpin=%d field", in.Nspin());
       if (!in.isNative() || !out.isNative()) errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder());

       const bool singlet = (in.TwistFlavor() == QUDA_TWIST_SINGLET);

       if (singlet) {
         switch (twist) {
         case QUDA_TWIST_GAMMA5_DIRECT:
           a = 1.0;
           b = 2.0 * kappa * mu;
           break;
         case QUDA_TWIST_GAMMA5_INVERSE:
           b = -2.0 * kappa * mu;
           a = 1.0 / (1.0 + b * b);
           break;
         default: break; // leave a and b at zero
         }
         c = 0.0;
         if (dagger) b *= -1.0;
       } else if (doublet) {
         switch (twist) {
         case QUDA_TWIST_GAMMA5_DIRECT:
           a = 1.0;
           b = 2.0 * kappa * mu;
           c = -2.0 * kappa * epsilon;
           break;
         case QUDA_TWIST_GAMMA5_INVERSE:
           b = -2.0 * kappa * mu;
           c = 2.0 * kappa * epsilon;
           a = 1.0 / (1.0 + b * b - c * c);
           if (a <= 0) errorQuda("Invalid twisted mass parameters (kappa=%e, mu=%e, epsilon=%e)\n", kappa, mu, epsilon);
           break;
         default: break; // leave a, b and c at zero
         }
         if (dagger) b *= -1.0;
       }
     }
   };

   // CPU kernel for applying the gamma matrix to a colorspinor
   template <typename Float, int nColor, typename Arg>
   void gammaCPU(Arg arg)
   {
     typedef typename mapper<Float>::type RegType;
     for (int parity= 0; parity < arg.nParity; parity++) {

       for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
   ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
   arg.out(x_cb, parity) = in.gamma(arg.d);
       } // 4-d volumeCB
     } // parity

   }

   // Device kernel applying gamma matrix d (a compile-time index) to a
   // colorspinor.  Thread x indexes the checkerboard site, thread y the
   // parity; threads outside either range do nothing.
   template <typename Float, int nColor, int d, typename Arg>
   __global__ void gammaGPU(Arg arg)
   {
     typedef typename mapper<Float>::type RegType;
     typedef ColorSpinor<RegType,nColor,4> Spinor;

     const int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
     const int parity = blockDim.y*blockIdx.y + threadIdx.y;

     if (x_cb >= arg.volumeCB || parity >= arg.nParity) return;

     Spinor psi = arg.in(x_cb, parity);
     arg.out(x_cb, parity) = psi.gamma(d);
   }

   // Launcher for the gamma kernels: runs the host loop for CPU fields and a
   // tuned GPU kernel launch otherwise.
   template <typename Float, int nColor, typename Arg>
   class Gamma : public TunableVectorY {

   protected:
     Arg &arg;
     const ColorSpinorField &meta;

     long long flops() const { return 0; }
     long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.volumeCB; }

   public:
     // The y dimension of the launch spans arg.nParity; the tuning key's aux
     // string is taken from the meta field.
     Gamma(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
     {
       strcpy(aux, meta.AuxString());
     }

     virtual ~Gamma() { }

     void apply(const cudaStream_t &stream)
     {
       // host fields are processed directly, without consulting the tuner
       if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
         gammaCPU<Float,nColor>(arg);
         return;
       }

       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if (arg.d == 4) {
         gammaGPU<Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       } else {
         // d = 4 is the only kernel instantiated here
         errorQuda("%d not instantiated", arg.d);
       }
     }

     TuneKey tuneKey() const
     {
       return TuneKey(meta.VolString(), typeid(*this).name(), aux);
     }

     // the output field is saved before tuning and restored afterwards
     void preTune() { arg.out.save(); }
     void postTune() { arg.out.load(); }
   };


   // Build the kernel argument and launcher for a fixed precision and colour
   // count, and apply gamma matrix d on the last stream (streams[Nstream-1]).
   template <typename Float, int nColor>
   void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
   {
     GammaArg<Float,nColor> arg(out, in, d);
     Gamma<Float,nColor,GammaArg<Float,nColor> > gamma(arg, in);
     gamma.apply(streams[Nstream-1]);

     // report launch errors here, as the other Apply* launchers in this file do
     checkCudaError();
   }

   // Dispatch on the number of colors of the input field (only 3 is supported).
   template <typename Float>
   void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
   {
     switch (in.Ncolor()) {
     case 3:
       ApplyGamma<Float,3>(out, in, d);
       break;
     default:
       errorQuda("Unsupported number of colors %d\n", in.Ncolor());
     }
   }

   // Apply the gamma matrix to a colorspinor field:
   //   out(x) = gamma_d * in(x)
   // Precisions and locations of out and in are checked first; the work is
   // then dispatched on the precision of the input field.
   void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
   {
     checkPrecision(out, in);    // check all precisions match
     checkLocation(out, in);     // check all locations match

     switch (in.Precision()) {
     case QUDA_DOUBLE_PRECISION:  ApplyGamma<double>(out, in, d); break;
     case QUDA_SINGLE_PRECISION:  ApplyGamma<float>(out, in, d); break;
     case QUDA_HALF_PRECISION:    ApplyGamma<short>(out, in, d); break;
     case QUDA_QUARTER_PRECISION: ApplyGamma<char>(out, in, d); break;
     default: errorQuda("Unsupported precision %d\n", in.Precision());
     }
   }

   // CPU kernel for applying the gamma matrix to a colorspinor
   template <bool doublet, typename Float, int nColor, typename Arg>
   void twistGammaCPU(Arg arg)
   {
     typedef typename mapper<Float>::type RegType;
     for (int parity= 0; parity < arg.nParity; parity++) {
       for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
   if (!doublet) {
     ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
           arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(arg.d));
         } else {
     ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
     ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
           arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(arg.d) + arg.c * in_2);
           arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(arg.d) + arg.c * in_1);
         }
       } // 4-d volumeCB
     } // parity

   }

   // Device kernel for the twisted gamma operator.  Thread x indexes the
   // checkerboard site and thread y the parity.  For a single flavor it writes
   //   out = a * (in + b * in.igamma(d))
   // and for a doublet it additionally mixes the two flavors, which sit
   // volumeCB sites apart, through the coefficient c.
   template <bool doublet, typename Float, int nColor, int d, typename Arg>
   __global__ void twistGammaGPU(Arg arg)
   {
     typedef typename mapper<Float>::type RegType;
     int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
     int parity = blockDim.y*blockIdx.y + threadIdx.y;

     if (x_cb >= arg.volumeCB) return;
     // same guard as gammaGPU: never touch a parity the field does not hold
     if (parity >= arg.nParity) return;

     if (!doublet) {
       ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
       arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(d));
     } else {
       // both flavors are read before either one is written
       ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
       ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
       arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(d) + arg.c * in_2);
       arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(d) + arg.c * in_1);
     }
   }

   // Launcher for the twisted gamma kernels: runs the host loop for CPU fields
   // and a tuned GPU kernel launch otherwise, choosing the doublet or
   // single-flavor variant from arg.doublet.
   template <typename Float, int nColor, typename Arg>
   class TwistGamma : public TunableVectorY {

   protected:
     Arg &arg;
     const ColorSpinorField &meta;

     long long flops() const { return 0; }
     long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.volumeCB; }

   public:
     // The y dimension of the launch spans arg.nParity; the tuning key's aux
     // string is taken from the meta field.
     TwistGamma(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
     {
       strcpy(aux, meta.AuxString());
     }
     virtual ~TwistGamma() { }

     void apply(const cudaStream_t &stream) {
       if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
         // Exactly one host variant may run.  Without the else, a doublet
         // field would be processed a second time by the single-flavor loop,
         // which overwrites the first flavor of the result.
         if (arg.doublet) twistGammaCPU<true,Float,nColor>(arg);
         else twistGammaCPU<false,Float,nColor>(arg);
       } else {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         if (arg.doublet) {
           switch (arg.d) {
           case 4: twistGammaGPU<true,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
           default: errorQuda("%d not instantiated", arg.d);
           }
         } else {
           switch (arg.d) {
           case 4: twistGammaGPU<false,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
           default: errorQuda("%d not instantiated", arg.d);
           }
         }
       }
     }

     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
     void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); }  // Need to save the out field if it aliases the in field
     void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
   };


   // Build the kernel argument and launcher for a fixed precision and colour
   // count, and run the twisted gamma operator on the last stream.
   template <typename Float, int nColor>
   void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
   {
     typedef GammaArg<Float,nColor> Arg;

     Arg arg(out, in, d, kappa, mu, epsilon, dagger, type);
     TwistGamma<Float,nColor,Arg> worker(arg, in);
     worker.apply(streams[Nstream-1]);

     checkCudaError();
   }

   // Dispatch on the number of colors of the input field (only 3 is supported).
   template <typename Float>
   void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
   {
     switch (in.Ncolor()) {
     case 3:
       ApplyTwistGamma<Float,3>(out, in, d, kappa, mu, epsilon, dagger, type);
       break;
     default:
       errorQuda("Unsupported number of colors %d\n", in.Ncolor());
     }
   }

   // Apply the twisted gamma operator to a colorspinor field.  Precisions and
   // locations of out and in are checked first; the work is then dispatched on
   // the precision of the input field.  Reports an error when twisted-mass
   // support (GPU_TWISTED_MASS_DIRAC) has not been compiled in.
   void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
   {
     checkPrecision(out, in);    // check all precisions match
     checkLocation(out, in);     // check all locations match

 #ifdef GPU_TWISTED_MASS_DIRAC
     switch (in.Precision()) {
     case QUDA_DOUBLE_PRECISION:  ApplyTwistGamma<double>(out, in, d, kappa, mu, epsilon, dagger, type); break;
     case QUDA_SINGLE_PRECISION:  ApplyTwistGamma<float>(out, in, d, kappa, mu, epsilon, dagger, type); break;
     case QUDA_HALF_PRECISION:    ApplyTwistGamma<short>(out, in, d, kappa, mu, epsilon, dagger, type); break;
     case QUDA_QUARTER_PRECISION: ApplyTwistGamma<char>(out, in, d, kappa, mu, epsilon, dagger, type); break;
     default: errorQuda("Unsupported precision %d\n", in.Precision());
     }
 #else
     errorQuda("Twisted mass dslash has not been built");
 #endif // GPU_TWISTED_MASS_DIRAC
   }

   // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma with d = 4)
   void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
   {
     const int gamma5_index = 4;
     ApplyGamma(out, in, gamma5_index);
   }

   // Kernel argument bundle for the clover and twisted-clover kernels: field
   // and clover accessors, problem dimensions and the twist coefficients.
   template <typename Float, int nSpin, int nColor, bool dynamic_clover_=false>
   struct CloverArg {
     static constexpr int length = (nSpin / (nSpin/2)) * 2 * nColor * nColor * (nSpin/2) * (nSpin/2) / 2;
     static constexpr bool dynamic_clover = dynamic_clover_;

     typedef typename colorspinor_mapper<Float,nSpin,nColor>::type F;
     typedef typename clover_mapper<Float,length>::type C;
     typedef typename mapper<Float>::type RegType;

     F out;                // output vector field
     const F in;           // input vector field
     const C clover;       // clover field
     const C cloverInv;    // inverse clover field (only set if not dynamic clover and doing twisted clover)
     const int nParity;    // number of parities we're working on
     const int parity;     // which parity we're acting on (if nParity=1)
     bool inverse;         // whether we are applying the inverse
     bool doublet;         // whether we applying the operator to a doublet
     const int volumeCB;   // checkerboarded volume
     RegType a;            // twist coefficient, +/- 2*kappa*mu for a twisted singlet and 0 otherwise
     RegType b;            // set alongside a; not read by the kernels in this file
     RegType c;            // always left at 0 by this constructor
     QudaTwistGamma5Type twist;

     // Without a twist the clover accessor follows the inverse flag; with a
     // twist it is always the direct clover term, and cloverInv is the inverse
     // one unless dynamic clover is in use.  Doublet flavors are rejected.
     CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
               bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
               bool dagger = false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
       : out(out), in(in),
         clover(clover, twist == QUDA_TWIST_GAMMA5_INVALID && inverse),
         cloverInv(clover, twist != QUDA_TWIST_GAMMA5_INVALID && !dynamic_clover),
         nParity(in.SiteSubset()), parity(parity), inverse(inverse),
         doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
         volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0), twist(twist)
     {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
         switch (twist) {
         case QUDA_TWIST_GAMMA5_DIRECT:
           b = 1.0;
           a = 2.0 * kappa * mu;
           break;
         case QUDA_TWIST_GAMMA5_INVERSE:
           a = -2.0 * kappa * mu;
           b = 1.0 / (1.0 + a*a);
           break;
         default: break; // leave a and b at zero
         }
         c = 0.0;
         if (dagger) a *= -1.0;
       } else if (doublet) {
         errorQuda("ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
       }
     }
   };

   // Apply the clover term to one site.  The spinor is moved to the chiral
   // basis, each chirality block is handled separately and the result is moved
   // back.  With dynamic clover the block is not multiplied directly; instead
   // a Cholesky forward/backward solve against it is used, scaled by 0.25.
   template <typename Float, int nSpin, int nColor, typename Arg>
   __device__ __host__ inline void cloverApply(Arg &arg, int x_cb, int parity) {
     using namespace linalg; // for Cholesky
     typedef typename mapper<Float>::type RegType;
     typedef ColorSpinor<RegType, nColor, nSpin> Spinor;
     typedef ColorSpinor<RegType, nColor, nSpin / 2> HalfSpinor;
     typedef HMatrix<RegType, nColor * nSpin / 2> Mat;

     // single-parity spinor fields are always indexed with parity 0
     const int spinor_parity = (arg.nParity == 2) ? parity : 0;

     Spinor psi = arg.in(x_cb, spinor_parity);
     psi.toRel(); // change to chiral basis here

     Spinor result;

 #pragma unroll
     for (int ch = 0; ch < 2; ch++) {

       Mat A = arg.clover(x_cb, parity, ch);
       HalfSpinor chi = psi.chiral_project(ch);

       if (!arg.dynamic_clover) {
         chi = A * chi;
       } else {
         Cholesky<HMatrix, RegType, nColor * nSpin / 2> cholesky(A);
         chi = static_cast<RegType>(0.25) * cholesky.backward(cholesky.forward(chi));
       }

       result += chi.chiral_reconstruct(ch);
     }

     result.toNonRel(); // change basis back

     arg.out(x_cb, spinor_parity) = result;
   }

   // Host loop applying the clover term to every site.  With two parities both
   // are visited in turn; with a single parity only arg.parity is processed.
   template <typename Float, int nSpin, int nColor, typename Arg>
   void cloverCPU(Arg &arg) {
     // Keep the loop counter separate from the parity handed to the kernel.
     // Overwriting the counter inside the loop made termination depend on the
     // value of arg.parity.
     for (int i=0; i<arg.nParity; i++) {
       const int parity = (arg.nParity == 2) ? i : arg.parity;
       for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
     }
   }

   // Device kernel applying the clover term.  Thread x indexes the
   // checkerboard site; thread y selects the parity only when the field holds
   // both parities, otherwise arg.parity is used.
   template <typename Float, int nSpin, int nColor, typename Arg>
   __global__ void cloverGPU(Arg arg) {
     const int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
     if (x_cb >= arg.volumeCB) return;

     int parity = arg.parity;
     if (arg.nParity == 2) parity = blockDim.y*blockIdx.y + threadIdx.y;

     cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
   }

   // Launcher for the clover kernels: runs the host loop for CPU fields and a
   // tuned GPU kernel launch otherwise.
   template <typename Float, int nSpin, int nColor, typename Arg>
   class Clover : public TunableVectorY {

   protected:
     Arg &arg;
     const ColorSpinorField &meta;

   protected:
     long long flops() const { return arg.nParity*arg.volumeCB*504ll; }
     long long bytes() const { return arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes(); }
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.volumeCB; }

   public:
     // The y dimension of the launch spans arg.nParity; the tuning key's aux
     // string is taken from the meta field.
     Clover(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
     {
       strcpy(aux, meta.AuxString());
     }
     virtual ~Clover() { }

     void apply(const cudaStream_t &stream)
     {
       if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
         // the host path never uses launch parameters, so the tuner is not
         // consulted for it (matching the Gamma and TwistGamma launchers)
         cloverCPU<Float,nSpin,nColor>(arg);
       } else {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         cloverGPU<Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       }
     }

     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
     void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); }  // Need to save the out field if it aliases the in field
     void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
   };


   // Build the kernel argument and launcher for a fixed precision and colour
   // count, and apply the clover term (or its inverse) on the last stream.
   // Only the inverse application can use the dynamic-clover variant.
   template <typename Float, int nColor>
   void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
   {
     if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
     constexpr int nSpin = 4;

     if (!inverse) {
       typedef CloverArg<Float, nSpin, nColor, false> Arg;
       Arg arg(out, in, clover, inverse, parity);
       Clover<Float, nSpin, nColor, Arg> worker(arg, in);
       worker.apply(streams[Nstream - 1]);
     } else {
 #ifdef DYNAMIC_CLOVER
       constexpr bool dynamic_clover = true;
 #else
       constexpr bool dynamic_clover = false;
 #endif
       typedef CloverArg<Float, nSpin, nColor, dynamic_clover> Arg;
       Arg arg(out, in, clover, inverse, parity);
       Clover<Float, nSpin, nColor, Arg> worker(arg, in);
       worker.apply(streams[Nstream - 1]);
     }

     checkCudaError();
   }

   // Dispatch on the number of colors of the input field (only 3 is supported).
   template <typename Float>
   void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
   {
     switch (in.Ncolor()) {
     case 3:
       ApplyClover<Float,3>(out, in, clover, inverse, parity);
       break;
     default:
       errorQuda("Unsupported number of colors %d\n", in.Ncolor());
     }
   }

   // Apply the clover matrix field to a colorspinor field:
   //   out(x) = clover * in(x)
   // Precisions and locations of out, clover and in are checked first; the
   // work is then dispatched on the precision of the input field.  Reports an
   // error when clover support (GPU_CLOVER_DIRAC) has not been compiled in.
   void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
   {
     checkPrecision(out, clover, in);    // check all precisions match
     checkLocation(out, clover, in);     // check all locations match

 #ifdef GPU_CLOVER_DIRAC
     switch (in.Precision()) {
     case QUDA_DOUBLE_PRECISION:  ApplyClover<double>(out, in, clover, inverse, parity); break;
     case QUDA_SINGLE_PRECISION:  ApplyClover<float>(out, in, clover, inverse, parity); break;
     case QUDA_HALF_PRECISION:    ApplyClover<short>(out, in, clover, inverse, parity); break;
     case QUDA_QUARTER_PRECISION: ApplyClover<char>(out, in, clover, inverse, parity); break;
     default: errorQuda("Unsupported precision %d\n", in.Precision());
     }
 #else
     errorQuda("Clover dslash has not been built");
 #endif // GPU_CLOVER_DIRAC
   }

   // if (!inverse) apply (Clover + i*a*gamma_5) to the input spinor
   // else apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
   // The spinor is moved to the chiral basis, each chirality block is handled
   // separately and the result is moved back.
   template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
   __device__ __host__ inline void twistCloverApply(Arg &arg, int x_cb, int parity) {
     using namespace linalg; // for Cholesky
     constexpr int N = nColor*nSpin/2;
     typedef typename mapper<Float>::type RegType;
     typedef ColorSpinor<RegType,nColor,nSpin> Spinor;
     typedef ColorSpinor<RegType,nColor,nSpin/2> HalfSpinor;
     typedef HMatrix<RegType,N> Mat;

     // single-parity spinor fields are always indexed with parity 0
     const int spinor_parity = (arg.nParity == 2) ? parity : 0;

     Spinor psi = arg.in(x_cb, spinor_parity);
     psi.toRel(); // change to chiral basis here

     Spinor result;

 #pragma unroll
     for (int ch = 0; ch < 2; ch++) {
       // factor of 2 comes from clover normalization we need to correct for;
       // the sign of the imaginary unit flips between the two chiralities
       const RegType half_sign = (ch == 0) ? static_cast<RegType>(0.5) : -static_cast<RegType>(0.5);
       const complex<RegType> j(0.0, half_sign);

       Mat A = arg.clover(x_cb, parity, ch);

       HalfSpinor in_chi = psi.chiral_project(ch);
       HalfSpinor out_chi = A*in_chi + j*arg.a*in_chi;

       if (inverse) {
         if (!arg.dynamic_clover) {
           // precomputed inverse block
           Mat Ainv = arg.cloverInv(x_cb, parity, ch);
           out_chi = static_cast<RegType>(2.0)*(Ainv*out_chi);
         } else {
           // build A^2 + a^2/4 and solve against it with Cholesky
           Mat A2 = A.square();
           A2 += arg.a*arg.a*static_cast<RegType>(0.25);
           Cholesky<HMatrix,RegType,N> cholesky(A2);
           out_chi = static_cast<RegType>(0.25)*cholesky.backward(cholesky.forward(out_chi));
         }
       }

       result += (out_chi).chiral_reconstruct(ch);
     }

     result.toNonRel(); // change basis back

     arg.out(x_cb, spinor_parity) = result;
   }

   // Host loop applying the twisted-clover term to every site.  With two
   // parities both are visited in turn; with a single parity only arg.parity
   // is processed.
   template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
   void twistCloverCPU(Arg &arg) {
     // Keep the loop counter separate from the parity handed to the kernel.
     // Overwriting the counter inside the loop made termination depend on the
     // value of arg.parity.
     for (int i=0; i<arg.nParity; i++) {
       const int parity = (arg.nParity == 2) ? i : arg.parity;
       for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
     }
   }

   // Device kernel applying the twisted-clover term.  Thread x indexes the
   // checkerboard site; thread y selects the parity only when the field holds
   // both parities, otherwise arg.parity is used.
   template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
   __global__ void twistCloverGPU(Arg arg) {
     const int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
     if (x_cb >= arg.volumeCB) return;

     int parity = arg.parity;
     if (arg.nParity == 2) parity = blockDim.y*blockIdx.y + threadIdx.y;

     twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
   }

   // Launcher for the twisted-clover kernels: runs the host loop for CPU fields
   // and a tuned GPU kernel launch otherwise, choosing the direct or inverse
   // variant from arg.inverse.
   template <typename Float, int nSpin, int nColor, typename Arg>
   class TwistClover : public TunableVectorY {

   protected:
     Arg &arg;
     const ColorSpinorField &meta;

   protected:
     long long flops() const { return (arg.inverse ? 1056ll : 552ll) * arg.nParity*arg.volumeCB; }
     long long bytes() const {
       long long rtn = arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes();
       // the inverse clover field is only read when it is not computed on the fly
       if (arg.twist == QUDA_TWIST_GAMMA5_INVERSE && !arg.dynamic_clover)
         rtn += arg.nParity*arg.volumeCB*arg.cloverInv.Bytes();
       return rtn;
     }
     bool tuneGridDim() const { return false; }
     unsigned int minThreads() const { return arg.volumeCB; }

   public:
     // The y dimension of the launch spans arg.nParity; the tuning key's aux
     // string is the meta field's string plus an inverse/direct tag.
     TwistClover(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
     {
       strcpy(aux, meta.AuxString());
       strcat(aux, arg.inverse ? ",inverse" : ",direct");
     }
     virtual ~TwistClover() { }

     void apply(const cudaStream_t &stream)
     {
       if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
         // the host path never uses launch parameters, so the tuner is not
         // consulted for it (matching the Gamma and TwistGamma launchers)
         if (arg.inverse) twistCloverCPU<true,Float,nSpin,nColor>(arg);
         else twistCloverCPU<false,Float,nSpin,nColor>(arg);
       } else {
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         if (arg.inverse) twistCloverGPU<true,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
         else twistCloverGPU<false,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       }
     }

     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
     void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); }  // Need to save the out field if it aliases the in field
     void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
   };


   // Build the kernel argument and launcher for a fixed precision and colour
   // count, and run the twisted-clover operator on the last stream.  Any twist
   // type other than QUDA_TWIST_GAMMA5_DIRECT selects the inverse application.
   template <typename Float, int nColor>
   void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
                         double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
   {
     if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
     constexpr int nSpin = 4;

 #ifdef DYNAMIC_CLOVER
     constexpr bool dynamic_clover = true;
 #else
     constexpr bool dynamic_clover = false;
 #endif
     typedef CloverArg<Float,nSpin,nColor,dynamic_clover> Arg;

     const bool inverse = (twist != QUDA_TWIST_GAMMA5_DIRECT);

     Arg arg(out, in, clover, inverse, parity, kappa, mu, epsilon, dagger, twist);
     TwistClover<Float,nSpin,nColor,Arg> worker(arg, in);
     worker.apply(streams[Nstream-1]);

     checkCudaError();
   }

   // Dispatch on the number of colors of the input field (only 3 is supported).
   template <typename Float>
   void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
                         double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
   {
     switch (in.Ncolor()) {
     case 3:
       ApplyTwistClover<Float,3>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
       break;
     default:
       errorQuda("Unsupported number of colors %d\n", in.Ncolor());
     }
   }

   // Apply the twisted-clover matrix field to a colorspinor field.  Precisions
   // and locations of out, clover and in are checked first; the work is then
   // dispatched on the precision of the input field.  Reports an error when
   // clover support (GPU_CLOVER_DIRAC) has not been compiled in.
   void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
                         double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
   {
     checkPrecision(out, clover, in);    // check all precisions match
     checkLocation(out, clover, in);     // check all locations match

 #ifdef GPU_CLOVER_DIRAC
     switch (in.Precision()) {
     case QUDA_DOUBLE_PRECISION:  ApplyTwistClover<double>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist); break;
     case QUDA_SINGLE_PRECISION:  ApplyTwistClover<float>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist); break;
     case QUDA_HALF_PRECISION:    ApplyTwistClover<short>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist); break;
     case QUDA_QUARTER_PRECISION: ApplyTwistClover<char>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist); break;
     default: errorQuda("Unsupported precision %d\n", in.Precision());
     }
 #else
     errorQuda("Clover dslash has not been built");
 #endif // GPU_CLOVER_DIRAC
   }

 } // namespace quda
quda::dslash::dslash_exterior_compute
bool dslash_exterior_compute
Definition: dslash_quda.cu:65

linalg.cuh

quda::gammaGPU
__global__ void gammaGPU(Arg arg)
Definition: dslash_quda.cu:240

quda::ColorSpinorField::Nspin
int Nspin() const
Definition: color_spinor_field.h:406

quda::GammaArg::F
colorspinor_mapper< Float, 4, nColor >::type F
Definition: dslash_quda.cu:173

quda::TuneParam
Definition: tune_quda.h:17

quda::CloverArg::c
RegType c
Definition: dslash_quda.cu:490

quda::dslash::dslash_interior_compute
bool dslash_interior_compute
Definition: dslash_quda.cu:64

quda::dslash::scatterStart
cudaEvent_t scatterStart[Nstream]
Definition: dslash_quda.cu:58

quda::TwistGamma
Definition: dslash_quda.cu:372

quda::dslash::gatherStart
cudaEvent_t gatherStart[Nstream]
Definition: dslash_quda.cu:56

mu
double mu
Definition: test_util.cpp:1648

dslash_policy.cuh

quda::TwistClover
Definition: dslash_quda.cu:725

quda::CloverArg::volumeCB
const int volumeCB
Definition: dslash_quda.cu:487

quda::ColorSpinorField
Definition: color_spinor_field.h:311

quda::TwistGamma::meta
const ColorSpinorField & meta
Definition: dslash_quda.cu:376

quda::LatticeField::AuxString
const char * AuxString() const
Definition: lattice_field.h:627

quda::GammaArg::doublet
bool doublet
Definition: dslash_quda.cu:180

quda::Worker
Definition: worker.h:5

quda::getKernelPackT
bool getKernelPackT()
Definition: dslash_quda.cu:26

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21

kappa
double kappa
Definition: test_util.cpp:1647

quda::dslash::QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED

quda::Gamma::meta
const ColorSpinorField & meta
Definition: dslash_quda.cu:258

QUDA_TWIST_GAMMA5_INVALID
Definition: enum_quda.h:424

quda::CloverArg::RegType
mapper< Float >::type RegType
Definition: dslash_quda.cu:477

checkPrecision
#define checkPrecision(...)
Definition: lattice_field.h:695

errorQuda
#define errorQuda(...)
Definition: util_quda.h:121

quda::ApplyGamma
void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
Definition: dslash_quda.cu:292

color_spinor_field.h

quda::twistCloverApply
__device__ __host__ void twistCloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:665

quda::ColorSpinor
Definition: color_spinor.h:24

host_free
#define host_free(ptr)
Definition: malloc_quda.h:71

QUDA_QUARTER_PRECISION
Definition: enum_quda.h:59

quda::TwistGamma::minThreads
unsigned int minThreads() const
Definition: dslash_quda.cu:381

QUDA_HALF_PRECISION
Definition: enum_quda.h:60

quda::CloverField
Definition: clover_field.h:45

epsilon
double epsilon
Definition: test_util.cpp:1649

streams
cudaStream_t * streams
Definition: interface_quda.cpp:157

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cpp:897

quda::CloverArg::in
const F in
Definition: dslash_quda.cu:480

quda::TwistClover::postTune
void postTune()
Definition: dslash_quda.cu:764

quda::Nstream
const int Nstream
Definition: quda_internal.h:83

quda::Gamma::apply
__device__ __host__ complex< ValueType > apply(int row, const complex< ValueType > &a) const
Definition: gamma.cuh:221

quda::LatticeField::VolString
const char * VolString() const
Definition: lattice_field.h:624

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:22

quda::Clover::postTune
void postTune()
Definition: dslash_quda.cu:599

Mat
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
Definition: covdev_reference.cpp:99

QUDA_TWIST_SINGLET
Definition: enum_quda.h:399

clover_field_order.h
Main header file for host and device accessors to CloverFields.

quda::CloverArg::inverse
bool inverse
Definition: dslash_quda.cu:485

quda::TwistClover::minThreads
unsigned int minThreads() const
Definition: dslash_quda.cu:740

QUDA_TWIST_NONDEG_DOUBLET
Definition: enum_quda.h:400

quda::ColorSpinorField::Ncolor
int Ncolor() const
Definition: color_spinor_field.h:405

quda::TwistClover::bytes
long long bytes() const
Definition: dslash_quda.cu:733

length
int length[]
Definition: gauge_force_test.cpp:34

quda::dslash::dslash_policy_init
bool dslash_policy_init
Definition: dslash_quda.cu:70

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:21

QUDA_TWIST_GAMMA5_DIRECT
Definition: enum_quda.h:422

quda::dslash::dslashStart
cudaEvent_t dslashStart[2]
Definition: dslash_quda.cu:60

quda
Definition: blas_cublas.h:5

quda::Clover::~Clover
virtual ~Clover()
Definition: dslash_quda.cu:585

quda::CloverArg::cloverInv
const C cloverInv
Definition: dslash_quda.cu:482

quda::TwistClover::~TwistClover
virtual ~TwistClover()
Definition: dslash_quda.cu:748

quda::Gamma::postTune
void postTune()
Definition: dslash_quda.cu:287

quda::dslash::p2p_policies
std::vector< QudaP2PPolicy > p2p_policies
Definition: dslash_quda.cu:80

quda::dslash::policy_string
char policy_string[TuneKey::aux_n]
Definition: dslash_quda.cu:83

QUDA_TWIST_DEG_DOUBLET
Definition: enum_quda.h:401

quda::popKernelPackT
void popKernelPackT()
Definition: dslash_quda.cu:42

quda::TwistGamma::tuneGridDim
bool tuneGridDim() const
Definition: dslash_quda.cu:380

quda::GammaArg::nParity
const int nParity
Definition: dslash_quda.cu:179

quda::dslash::policies
std::vector< QudaDslashPolicy > policies
Definition: dslash_quda.cu:77

quda::dslash::dslash_comms
bool dslash_comms
Definition: dslash_quda.cu:66

quda::CloverArg::clover
const C clover
Definition: dslash_quda.cu:481

quda::dslash::dslash_copy
bool dslash_copy
Definition: dslash_quda.cu:67

quda::CloverArg::b
RegType b
Definition: dslash_quda.cu:489

quda::GammaArg::GammaArg
GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:186

quda::Clover::Clover
Clover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:581

quda::GammaArg::out
F out
Definition: dslash_quda.cu:176

quda::GammaArg::RegType
mapper< Float >::type RegType
Definition: dslash_quda.cu:174

quda::GammaArg::d
const int d
Definition: dslash_quda.cu:178

quda::ApplyTwistGamma
void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
Apply the twisted-mass gamma operator to a color-spinor field.
Definition: dslash_quda.cu:416

quda::Gamma::preTune
void preTune()
Definition: dslash_quda.cu:286

quda::Gamma::Gamma
Gamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:266

quda::dslash::aux_worker
Worker * aux_worker
Definition: dslash_quda.cu:87

quda::TwistClover::preTune
void preTune()
Definition: dslash_quda.cu:763

nColor
const int nColor
Definition: covdev_test.cpp:75

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

quda::TwistGamma::tuneKey
TuneKey tuneKey() const
Definition: dslash_quda.cu:409

quda::TunableVectorY
Definition: tune_quda.h:426

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:98

quda::TwistGamma::postTune
void postTune()
Definition: dslash_quda.cu:411

quda::Clover::apply
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:587

quda::createDslashEvents
void createDslashEvents()
Definition: dslash_quda.cu:95

quda::twistCloverCPU
void twistCloverCPU(Arg &arg)
Definition: dslash_quda.cu:709

quda::kptstack
static std::stack< bool > kptstack
Definition: dslash_quda.cu:28

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643

quda::CloverArg::C
clover_mapper< Float, length >::type C
Definition: dslash_quda.cu:476

warningQuda
#define warningQuda(...)
Definition: util_quda.h:133

checkLocation
#define checkLocation(...)
Definition: lattice_field.h:664

quda::HMatrix
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:61

quda::dslash::packEnd
cudaEvent_t packEnd[2]
Definition: dslash_quda.cu:55

quda::TwistClover::apply
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:750

quda::kernelPackT
static bool kernelPackT
Definition: dslash_quda.cu:22

quda::CloverArg::a
RegType a
Definition: dslash_quda.cu:488

quda::TwistClover::TwistClover
TwistClover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:743

quda::TwistClover::meta
const ColorSpinorField & meta
Definition: dslash_quda.cu:729

quda::TwistClover::flops
long long flops() const
Definition: dslash_quda.cu:732

quda::CloverArg::out
F out
Definition: dslash_quda.cu:479

quda::TwistClover::tuneKey
TuneKey tuneKey() const
Definition: dslash_quda.cu:762

quda::GammaArg::volumeCB
const int volumeCB
Definition: dslash_quda.cu:181

quda::Gamma
Definition: gamma.cuh:9

clover_field.h

quda::CloverArg::doublet
bool doublet
Definition: dslash_quda.cu:486

quda::Clover
Definition: dslash_quda.cu:568

quda::Gamma::minThreads
unsigned int minThreads() const
Definition: dslash_quda.cu:263

quda::dslash::first_active_p2p_policy
int first_active_p2p_policy
Definition: dslash_quda.cu:74

quda::TwistGamma::arg
Arg & arg
Definition: dslash_quda.cu:375

quda::Gamma::bytes
long long bytes() const
Definition: dslash_quda.cu:261

quda::Arg
Definition: spinor_noise.cu:22

quda::ColorSpinorField::isNative
bool isNative() const
Definition: color_spinor_field.cpp:568

quda::Clover::preTune
void preTune()
Definition: dslash_quda.cu:598

quda::Clover::meta
const ColorSpinorField & meta
Definition: dslash_quda.cu:572

quda::GammaArg::in
const F in
Definition: dslash_quda.cu:177

quda::LatticeField::Location
QudaFieldLocation Location() const
Definition: lattice_field.cpp:660

quda::CloverArg::CloverArg
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:493

quda::cloverCPU
void cloverCPU(Arg &arg)
Definition: dslash_quda.cu:552

quda::TwistClover::arg
Arg & arg
Definition: dslash_quda.cu:728

quda::TwistGamma::TwistGamma
TwistGamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:384

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:62

quda::Gamma::tuneKey
TuneKey tuneKey() const
Definition: dslash_quda.cu:284

index_helper.cuh

quda::Clover::arg
Arg & arg
Definition: dslash_quda.cu:571

quda::inverse
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:99

quda::Arg::nParity
const int nParity
Definition: spinor_noise.cu:25

quda::Gamma::flops
long long flops() const
Definition: dslash_quda.cu:260

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:61

quda::cloverApply
__device__ __host__ void cloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:519

quda::ApplyClover
void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
Apply clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:604

quda::mapper
Definition: register_traits.h:43

quda::CloverArg
Parameter structure for driving the clover and twist-clover application kernels.
Definition: dslash_quda.cu:471

color_spinor_field_order.h

quda::twistCloverGPU
__global__ void twistCloverGPU(Arg arg)
Definition: dslash_quda.cu:717

quda::CloverArg::F
colorspinor_mapper< Float, nSpin, nColor >::type F
Definition: dslash_quda.cu:475

quda::Clover::minThreads
unsigned int minThreads() const
Definition: dslash_quda.cu:578

dslash_quda.h

quda::TuneKey::aux_n
static const int aux_n
Definition: tune_key.h:12

quda::TwistGamma::apply
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:390

quda::colorspinor_mapper
Definition: color_spinor_field_order.h:1602

quda::dslash::dslash_pack_compute
bool dslash_pack_compute
Definition: dslash_quda.cu:63

quda::dslash::scatterEnd
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:59

quda::Gamma::arg
Arg & arg
Definition: dslash_quda.cu:257

quda::clover_mapper
Definition: clover_field_order.h:975

quda::ColorSpinorField::TwistFlavor
QudaTwistFlavorType TwistFlavor() const
Definition: color_spinor_field.h:408

quda::CloverArg::twist
QudaTwistGamma5Type twist
Definition: dslash_quda.cu:491

quda::CloverArg::parity
const int parity
Definition: dslash_quda.cu:484

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:1076

quda::TwistGamma::~TwistGamma
virtual ~TwistGamma()
Definition: dslash_quda.cu:388

quda::GammaArg::c
RegType c
Definition: dslash_quda.cu:184

quda::Arg::volumeCB
const int volumeCB
Definition: spinor_noise.cu:26

quda::TwistClover::tuneGridDim
bool tuneGridDim() const
Definition: dslash_quda.cu:739

quda::setKernelPackT
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:24

quda::Clover::tuneKey
TuneKey tuneKey() const
Definition: dslash_quda.cu:597

quda::gammaCPU
void gammaCPU(Arg arg)
Definition: dslash_quda.cu:225

quda::twistGammaCPU
void twistGammaCPU(Arg arg)
Definition: dslash_quda.cu:332

quda::GammaArg::a
RegType a
Definition: dslash_quda.cu:182

quda::dslash::it
int it
Definition: dslash_quda.cu:53

quda::gamma5
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:461

Spinor
Definition: texture.h:288

quda::TwistGamma::flops
long long flops() const
Definition: dslash_quda.cu:378

quda::CloverArg::nParity
const int nParity
Definition: dslash_quda.cu:483

QudaTwistGamma5Type
enum QudaTwistGamma5Type_s QudaTwistGamma5Type

quda::pushKernelPackT
void pushKernelPackT(bool pack)
Definition: dslash_quda.cu:30

quda::dslash::QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED

checkCudaError
#define checkCudaError()
Definition: util_quda.h:161

quda::GammaArg::b
RegType b
Definition: dslash_quda.cu:183

mapped_malloc
#define mapped_malloc(size)
Definition: malloc_quda.h:68

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:546

quda::Gamma::~Gamma
virtual ~Gamma()
Definition: dslash_quda.cu:270

dagger
QudaDagType dagger
Definition: test_util.cpp:1620

quda::Clover::tuneGridDim
bool tuneGridDim() const
Definition: dslash_quda.cu:577

parity
QudaParity parity
Definition: covdev_test.cpp:54

quda::twistGammaGPU
__global__ void twistGammaGPU(Arg arg)
Definition: dslash_quda.cu:353

quda::Gamma::apply
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:272

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325

quda::cloverGPU
__global__ void cloverGPU(Arg arg)
Definition: dslash_quda.cu:560

quda::TwistGamma::preTune
void preTune()
Definition: dslash_quda.cu:410

quda::destroyDslashEvents
void destroyDslashEvents()
Definition: dslash_quda.cu:144

quda::ColorSpinorField::FieldOrder
QudaFieldOrder FieldOrder() const
Definition: color_spinor_field.h:483

quda::TuneKey
Definition: tune_key.h:8

quda::dslash::first_active_policy
int first_active_policy
Definition: dslash_quda.cu:73

quda::TwistGamma::bytes
long long bytes() const
Definition: dslash_quda.cu:379

QUDA_TWIST_GAMMA5_INVERSE
Definition: enum_quda.h:423

quda::dslash::gatherEnd
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:57

quda::Clover::flops
long long flops() const
Definition: dslash_quda.cu:575

quda::ApplyTwistClover
void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
Apply twisted clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:769

quda::Clover::bytes
long long bytes() const
Definition: dslash_quda.cu:576

quda::GammaArg
Parameter structure for driving the Gamma operator.
Definition: dslash_quda.cu:172

color_spinor.h

quda::Gamma::tuneGridDim
bool tuneGridDim() const
Definition: dslash_quda.cu:262