dslash_improved_staggered.cu
#include <dslash.h>
#include <worker.h>
#include <dslash_helper.cuh>
#include <color_spinor_field_order.h>
#include <gauge_field_order.h>
#include <color_spinor.h>
#include <index_helper.cuh>
#include <gauge_field.h>

#include <dslash_policy.cuh>
#include <kernels/dslash_staggered.cuh>

namespace quda
{

  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
  struct StaggeredLaunch {
    static constexpr const char *kernel = "quda::staggeredGPU"; // kernel name for jit compilation
    template <typename Dslash>
    inline static void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
    {
      dslash.launch(staggeredGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>, tp, arg, stream);
    }
  };
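  // Note (added commentary): StaggeredLaunch is the host-side dispatch hook
  // consumed by Dslash<Float>::instantiate in Staggered::apply below: the base
  // class maps the runtime parity / dagger / xpay / kernel-type values onto
  // this struct's template parameters and then calls launch() with the fully
  // specialized staggeredGPU kernel.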

  template <typename Float, int nDim, int nColor, typename Arg> class Staggered : public Dslash<Float>
  {

protected:
    Arg &arg;
    const ColorSpinorField &in;

public:
    Staggered(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
      Dslash<Float>(arg, out, in, "kernels/dslash_staggered.cuh"),
      arg(arg),
      in(in)
    {
    }

    virtual ~Staggered() {}

    void apply(const cudaStream_t &stream)
    {
      if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
        errorQuda("Staggered Dslash not implemented on CPU");
      } else {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        Dslash<Float>::setParam(arg);
        Dslash<Float>::template instantiate<StaggeredLaunch, nDim, nColor>(tp, arg, stream);
      }
    }

    /*
      per direction / dimension flops
        SU(3) matrix-vector flops = (8 Nc - 2) * Nc
        xpay = 2 * 2 * Nc * Ns

      So for the full dslash we have
        flops = (2 * 2 * Nd * (8 * Nc - 2) * Nc) + ((2 * 2 * Nd - 1) * 2 * Nc * Ns)
        flops_xpay = flops + 2 * 2 * Nc * Ns

      For Asqtad this gives 1146 for Nc = 3, Ns = 1 (staggered fields carry a
      single spin component) and 1158 for the xpay equivalent.
    */
    long long flops() const
    {
      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
      int ghost_flops = (3 + 1) * (mv_flops + 2 * in.Ncolor() * in.Nspin()); // three long-link hops plus one fat-link hop per face site
      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
      int num_dir = 2 * 4; // hard code factor of 4 in direction since fields may be 5-d

      long long flops_ = 0;

      switch (arg.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T: flops_ = ghost_flops * 2 * in.GhostFace()[arg.kernel_type]; break;
      case EXTERIOR_KERNEL_ALL: {
        long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
        flops_ = ghost_flops * ghost_sites;
        break;
      }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY: {
        long long sites = in.Volume();
        flops_ = (2 * num_dir * mv_flops + // SU(3) matrix-vector multiplies
                  (2 * num_dir - 1) * 2 * in.Ncolor() * in.Nspin()) // accumulation
          * sites;
        if (arg.xpay) flops_ += xpay_flops * sites; // axpy is always on interior

        if (arg.kernel_type == KERNEL_POLICY) break;
        // now correct for flops done by exterior kernel
        long long ghost_sites = 0;
        for (int d = 0; d < 4; d++)
          if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
        flops_ -= ghost_flops * ghost_sites;

        break;
      }
      }
      return flops_;
    }
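    // Worked check (added commentary): for the interior kernel with Nc = 3,
    // Ns = 1 and num_dir = 8, the per-site count is
    //   2 * 8 * 66 + (2 * 8 - 1) * 2 * 3 * 1 = 1056 + 90 = 1146 flops,
    // matching the Asqtad figure quoted above; xpay adds 2 * 2 * 3 * 1 = 12
    // flops per site, for 1158 total.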

    long long bytes() const
    {
      int gauge_bytes_fat = QUDA_RECONSTRUCT_NO * in.Precision();
      int gauge_bytes_long = arg.reconstruct * in.Precision();
      bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)); // fixed-point fields carry a float norm
      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
      int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat)
        + 3 * 2 * spinor_bytes; // last term is the accumulator load/store through the face
      int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

      long long bytes_ = 0;

      switch (arg.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
      case EXTERIOR_KERNEL_ALL: {
        long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
        bytes_ = ghost_bytes * ghost_sites;
        break;
      }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY: {
        long long sites = in.Volume();
        bytes_ = (num_dir * (gauge_bytes_fat + gauge_bytes_long) + // gauge reads
                  num_dir * 2 * spinor_bytes + // spinor reads
                  spinor_bytes) // spinor write
          * sites;
        if (arg.xpay) bytes_ += spinor_bytes * sites; // xpay reads the accumulator once per interior site

        if (arg.kernel_type == KERNEL_POLICY) break;
        // now correct for bytes done by exterior kernel
        long long ghost_sites = 0;
        for (int d = 0; d < 4; d++)
          if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
        bytes_ -= ghost_bytes * ghost_sites;

        break;
      }
      }
      return bytes_;
    }
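    // Worked check (added commentary): in double precision (8 bytes per real)
    // with no long-link reconstruction (QUDA_RECONSTRUCT_NO = 18 reals),
    // gauge_bytes_fat = gauge_bytes_long = 18 * 8 = 144 and spinor_bytes =
    // 2 * 3 * 1 * 8 = 48, so each interior site moves
    //   8 * (144 + 144) + 8 * 2 * 48 + 48 = 3120 bytes
    // and each ghost site 3 * (48 + 144) + (48 + 144) + 3 * 2 * 48 = 1056 bytes.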

    TuneKey tuneKey() const
    {
      return TuneKey(in.VolString(), typeid(*this).name(), Dslash<Float>::aux[arg.kernel_type]);
    }
  };

  template <typename Float, int nColor, QudaReconstructType recon_l> struct ImprovedStaggeredApply {

    inline ImprovedStaggeredApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &L,
                                  const GaugeField &U, double a, const ColorSpinorField &x, int parity, bool dagger,
                                  const int *comm_override, TimeProfile &profile)
    {
      constexpr int nDim = 4; // MWTODO: this probably should be 5 for mrhs Dslash
      constexpr bool improved = true;
      constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO;
      StaggeredArg<Float, nColor, recon_u, recon_l, improved> arg(out, in, U, L, a, x, parity, dagger, comm_override);
      Staggered<Float, nDim, nColor, decltype(arg)> staggered(arg, out, in);

      dslash::DslashPolicyTune<decltype(staggered)> policy(
        staggered, const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
        in.GhostFaceCB(), profile);
      policy.apply(0);

      checkCudaError();
    }
  };

  void ApplyImprovedStaggered(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
                              const GaugeField &L, double a, const ColorSpinorField &x, int parity, bool dagger,
                              const int *comm_override, TimeProfile &profile)
  {

#ifdef GPU_STAGGERED_DIRAC
    if (in.V() == out.V()) errorQuda("Aliasing pointers");
    if (in.FieldOrder() != out.FieldOrder())
      errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());

    // check all precisions match
    checkPrecision(out, in, U, L);

    // check all locations match
    checkLocation(out, in, U, L);

    // long links are three-hop, so a partitioned dimension needs a local extent
    // of at least 6 to keep the depth-3 forward and backward halos from overlapping
    for (int i = 0; i < 4; i++) {
      if (comm_dim_partitioned(i) && (U.X()[i] < 6)) {
        errorQuda("Partitioned dimension with local size less than 6 is not supported in improved staggered dslash");
      }
    }

    // L must be first gauge field argument since we template on long reconstruct
    instantiate<ImprovedStaggeredApply, StaggeredReconstruct>(out, in, L, U, a, x, parity, dagger, comm_override,
                                                              profile);
#else
    errorQuda("Staggered dslash has not been built");
#endif
  }
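  // Usage sketch (added commentary; the gauge-field and profile names are
  // hypothetical): the Dirac operator wrappers drive this entry point with the
  // fat and long links passed separately, and with a = 0.0 the xpay
  // accumulation is disabled, so x is conventionally just set to in:
  //
  //   ApplyImprovedStaggered(out, in, fatGauge, longGauge, 0.0, in, parity,
  //                          false /* dagger */, commDim, profile);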

} // namespace quda