quda-ref/v1.0.0/cov_dev_8cu_source.html

 #include <dslash.h>
 #include <worker.h>
 #include <dslash_helper.cuh>
 #include <color_spinor_field_order.h>
 #include <gauge_field_order.h>
 #include <color_spinor.h>
 #include <dslash_helper.cuh>
 #include <index_helper.cuh>
 #include <gauge_field.h>
 #include <uint_to_char.h>

 #include <dslash_policy.cuh>
 #include <kernels/covDev.cuh>

 namespace quda
 {

 #ifdef GPU_COVDEV

   /**
      Launch shim for the covariant derivative kernel.  The template
      parameters select one concrete instantiation of covDevGPU, which
      is then handed to the launch() method of the supplied dslash
      object.
    */
   template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
   struct CovDevLaunch {

     // name of the kernel (as a string), used for jit compilation
     static constexpr const char *kernel = "quda::covDevGPU";

     /**
        @param dslash Object whose launch() method is invoked
        @param tp Launch parameters forwarded to dslash.launch
        @param arg Kernel argument struct forwarded to dslash.launch
        @param stream Stream forwarded to dslash.launch
      */
     template <typename Dslash>
     static inline void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
     {
       // reject unsupported instantiations at compile time
       static_assert(!xpay, "Covariant derivative operator only defined without xpay");
       static_assert(nParity == 2, "Covariant derivative operator only defined for full field");

       // pick the matching kernel instantiation, then launch it
       auto kernel_fn = covDevGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>;
       dslash.launch(kernel_fn, tp, arg, stream);
     }
   };

   /**
      Host-side driver for the covariant derivative kernel.  It holds
      the kernel argument struct and the input field, launches the
      kernel through the Dslash<Float> base class, and reports flop and
      byte counts together with a tune key that includes the direction
      arg.mu.
    */
   template <typename Float, int nDim, int nColor, typename Arg> class CovDev : public Dslash<Float>
   {

 protected:
     Arg &arg;                   // kernel argument struct (kernel_type, mu, commDim, ... are read below)
     const ColorSpinorField &in; // input field, used for the cost model and the tune key

 public:
     /**
        @param arg Kernel argument struct (held by reference)
        @param out Output field (forwarded to the Dslash base class)
        @param in Input field (held by reference)
      */
     CovDev(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
       Dslash<Float>(arg, out, in, "kernels/covDev.cuh"),
       arg(arg),
       in(in)
     {
     }

     virtual ~CovDev() {}

     /**
        Fetch the launch parameters via tuneLaunch and launch the kernel
        on the given stream.  Raises errorQuda if arg requests xpay or
        anything other than nParity == 2.
        @param stream Stream the kernel is launched on
      */
     void apply(const cudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash<Float>::setParam(arg);
       if (arg.xpay) errorQuda("Covariant derivative operator only defined without xpay");
       if (arg.nParity != 2) errorQuda("Covariant derivative operator only defined for full field");

       // only this single (nParity, xpay) combination is instantiated
       constexpr bool xpay = false;
       constexpr int nParity = 2;
       Dslash<Float>::template instantiate<CovDevLaunch, nDim, nColor, nParity, xpay>(tp, arg, stream);
     }

     /**
        @return Flop count attributed to the kernel type currently
        selected in arg.kernel_type
      */
     long long flops() const
     {
       // All per-site costs are held as long long so that the products
       // with site counts below are evaluated in 64-bit arithmetic
       // (an int * int product could overflow before being widened).
       const long long mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
       const long long num_mv_multiply = in.Nspin();
       const long long ghost_flops = num_mv_multiply * mv_flops;
       const int dim = arg.mu % 4; // dimension of the derivative (mu and mu+4 share a dimension)
       long long flops_ = 0;

       switch (arg.kernel_type) {
       case EXTERIOR_KERNEL_X:
       case EXTERIOR_KERNEL_Y:
       case EXTERIOR_KERNEL_Z:
       case EXTERIOR_KERNEL_T:
         // an exterior kernel for a dimension other than dim is counted as zero flops
         if (arg.kernel_type != dim) break;
         flops_ = ghost_flops * in.GhostFace()[dim];
         break;
       case EXTERIOR_KERNEL_ALL: {
         long long ghost_sites = in.GhostFace()[dim];
         flops_ = ghost_flops * ghost_sites;
         break;
       }
       case INTERIOR_KERNEL:
       case KERNEL_POLICY: {
         long long sites = in.Volume();
         flops_ = num_mv_multiply * mv_flops * sites; // SU(3) matrix-vector multiplies

         if (arg.kernel_type == KERNEL_POLICY) break;
         // now correct for flops done by exterior kernel
         long long ghost_sites = arg.commDim[dim] ? in.GhostFace()[dim] : 0;
         flops_ -= ghost_flops * ghost_sites;

         break;
       }
       }

       return flops_;
     }

     /**
        @return Byte count attributed to the kernel type currently
        selected in arg.kernel_type
      */
     long long bytes() const
     {
       // As in flops(), per-site costs are long long so that the
       // products with site counts cannot overflow int.
       const long long gauge_bytes = arg.reconstruct * in.Precision();
       const bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char));
       // one extra float per site is counted for the short/char precisions
       const long long spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
       const long long ghost_bytes = gauge_bytes + 3 * spinor_bytes; // 3 since we have to load the partial
       const int dim = arg.mu % 4;
       long long bytes_ = 0;

       switch (arg.kernel_type) {
       case EXTERIOR_KERNEL_X:
       case EXTERIOR_KERNEL_Y:
       case EXTERIOR_KERNEL_Z:
       case EXTERIOR_KERNEL_T:
         // an exterior kernel for a dimension other than dim is counted as zero bytes
         if (arg.kernel_type != dim) break;
         bytes_ = ghost_bytes * in.GhostFace()[dim];
         break;
       case EXTERIOR_KERNEL_ALL: {
         long long ghost_sites = in.GhostFace()[dim];
         bytes_ = ghost_bytes * ghost_sites;
         break;
       }
       case INTERIOR_KERNEL:
       case KERNEL_POLICY: {
         long long sites = in.Volume();
         bytes_ = (gauge_bytes + 2 * spinor_bytes) * sites;

         if (arg.kernel_type == KERNEL_POLICY) break;
         // now correct for bytes done by exterior kernel
         long long ghost_sites = arg.commDim[dim] ? in.GhostFace()[dim] : 0;
         bytes_ -= ghost_bytes * ghost_sites;

         break;
       }
       }
       return bytes_;
     }

     /**
        @return Tune key built from the volume string, the dynamic type
        name of this object, and the base-class aux string for the
        current kernel type with ",mu=<mu>" appended
      */
     TuneKey tuneKey() const
     {
       // add mu to the key
       char aux[TuneKey::aux_n];
       strcpy(aux, Dslash<Float>::aux[arg.kernel_type]);
       strcat(aux, ",mu=");
       // u32toa takes a uint32_t, which can need up to 10 decimal digits
       // plus a terminator, so the buffer is sized for the worst case
       char mu[16];
       u32toa(mu, arg.mu);
       strcat(aux, mu);
       return TuneKey(in.VolString(), typeid(*this).name(), aux);
     }
   };

   /**
      Functor whose constructor builds the kernel argument struct and
      the CovDev driver for one (Float, nColor, recon) combination and
      then runs it through the dslash policy tuner.
    */
   template <typename Float, int nColor, QudaReconstructType recon> struct CovDevApply {

     inline CovDevApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity,
                        bool dagger, const int *comm_override, TimeProfile &profile)

     {
       constexpr int n_dim = 4;
       using ArgType = CovDevArg<Float, nColor, recon>;
       using DslashType = CovDev<Float, n_dim, nColor, ArgType>;

       ArgType arg(out, in, U, mu, parity, dagger, comm_override);
       DslashType covDev(arg, out, in);

       // the policy takes a non-const cudaColorSpinorField pointer to the input field
       auto *in_cuda = const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in));

       dslash::DslashPolicyTune<DslashType> policy(covDev, in_cuda, in.VolumeCB(), in.GhostFaceCB(), profile);
       policy.apply(0);

       checkCudaError();
     }
   };

 #endif

   // Apply the covariant derivative operator
   // out(x) = U_{\mu}(x)in(x+mu) for mu = 0...3
   // out(x) = U^\dagger_mu'(x-mu')in(x-mu') for mu = 4...7 and we set mu' = mu-4
   //
   // out           - result field; must not alias in
   // in            - input field; must have the same field order as out
   // U             - gauge field
   // mu            - direction, must lie in [0, 7] (see above)
   // parity        - forwarded unchanged to the kernel argument struct
   // dagger        - forwarded unchanged to the kernel argument struct
   // comm_override - forwarded unchanged to the kernel argument struct
   // profile       - time profile handed to the dslash policy
   //
   // Raises errorQuda if the library was built without GPU_COVDEV, if mu is
   // out of range, if in and out alias, or if their field orders differ.
   void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity,
                    bool dagger, const int *comm_override, TimeProfile &profile)
   {
 #ifdef GPU_COVDEV
     // mu % 4 is later used to index per-dimension arrays, so an out-of-range
     // (in particular negative) mu must be rejected before any work is done
     if (mu < 0 || mu > 7) errorQuda("Invalid direction mu = %d (expected 0 <= mu <= 7)", mu);

     if (in.V() == out.V()) errorQuda("Aliasing pointers");
     if (in.FieldOrder() != out.FieldOrder())
       errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());

     // check all precisions match
     checkPrecision(out, in, U);

     // check all locations match
     checkLocation(out, in, U);

     pushKernelPackT(true); // non-spin projection requires kernel packing

     instantiate<CovDevApply>(out, in, U, mu, parity, dagger, comm_override, profile);

     popKernelPackT();
 #else
     errorQuda("Covariant derivative kernels have not been built");
 #endif
   }
 } // namespace quda
mu
double mu
Definition: test_util.cpp:1648

dslash_policy.cuh

worker.h

quda::EXTERIOR_KERNEL_ALL
Definition: index_helper.cuh:466

quda::Dslash::setParam
void setParam(Arg &arg)
Definition: dslash.h:66

quda::ColorSpinorField
Definition: color_spinor_field.h:311

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21

checkPrecision
#define checkPrecision(...)
Definition: lattice_field.h:695

errorQuda
#define errorQuda(...)
Definition: util_quda.h:121

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cpp:897

quda::EXTERIOR_KERNEL_T
Definition: index_helper.cuh:470

quda::ApplyCovDev
void ApplyCovDev(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the covariant derivative.
Definition: covDev.cu:185

quda
Definition: blas_cublas.h:5

quda::KERNEL_POLICY
Definition: index_helper.cuh:471

quda::blas::xpay
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37

quda::EXTERIOR_KERNEL_X
Definition: index_helper.cuh:467

quda::popKernelPackT
void popKernelPackT()
Definition: dslash_quda.cu:42

quda::EXTERIOR_KERNEL_Y
Definition: index_helper.cuh:468

covDev.cuh

quda::INTERIOR_KERNEL
Definition: index_helper.cuh:465

quda::covDev
__device__ __host__ void covDev(Arg &arg, int idx, int parity)
Definition: covDev.cuh:119

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:98

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643

checkLocation
#define checkLocation(...)
Definition: lattice_field.h:664

dslash.h

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

dslash_helper.cuh

quda::EXTERIOR_KERNEL_Z
Definition: index_helper.cuh:469

index_helper.cuh

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:99

quda::u32toa
void u32toa(char *buffer, uint32_t value)
Definition: uint_to_char.h:45

color_spinor_field_order.h

quda::TuneKey::aux_n
static const int aux_n
Definition: tune_key.h:12

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:22

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:1076

quda::TimeProfile
Definition: timer.h:171

quda::pushKernelPackT
void pushKernelPackT(bool pack)
Definition: dslash_quda.cu:30

checkCudaError
#define checkCudaError()
Definition: util_quda.h:161

uint_to_char.h

quda::Dslash::aux
char aux[8][TuneKey::aux_n]
Definition: dslash.h:23

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52

dagger
QudaDagType dagger
Definition: test_util.cpp:1620

parity
QudaParity parity
Definition: covdev_test.cpp:54

quda::ColorSpinorField::FieldOrder
QudaFieldOrder FieldOrder() const
Definition: color_spinor_field.h:483

gauge_field.h

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:23

quda::ColorSpinorField::V
void * V()
Definition: color_spinor_field.h:424

color_spinor.h

quda::GaugeField
Definition: gauge_field.h:164