QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <color_spinor_field.h>
4 #include <tune_quda.h>
5 #include <dslash_quda.h>
6 #include <dslash_helper.cuh>
7 #include <jitify_helper.cuh>
8 
9 namespace quda
10 {
11 
12  template <typename Float> class Dslash : public TunableVectorYZ
13  {
14 
15 protected:
19 
20  const int nDimComms;
21 
23  char aux[8][TuneKey::aux_n];
24 
25 #ifdef JITIFY
26  // local copy of the static program pointer - this is a work
27  // around for issues with the static program pointer when
28  // HOSTDEBUG compilation is targeted (more precisely -fno-inline)
29  jitify::Program *program_;
30 #endif
31 
36  inline void fillAuxBase()
37  {
38  char comm[5];
39  comm[0] = (arg.commDim[0] ? '1' : '0');
40  comm[1] = (arg.commDim[1] ? '1' : '0');
41  comm[2] = (arg.commDim[2] ? '1' : '0');
42  comm[3] = (arg.commDim[3] ? '1' : '0');
43  comm[4] = '\0';
44  strcpy(aux_base, ",commDim=");
45  strcat(aux_base, comm);
46 
47  if (arg.xpay) strcat(aux_base, ",xpay");
48  if (arg.dagger) strcat(aux_base, ",dagger");
49  }
50 
56  inline void fillAux(KernelType kernel_type, const char *kernel_str)
57  {
58  strcpy(aux[kernel_type], kernel_str);
59  if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type], comm_dim_partitioned_string());
60  strncat(aux[kernel_type], aux_base, TuneKey::aux_n-1);
61  }
62 
  bool tuneGridDim() const { return false; } // grid size is derived from minThreads, not autotuned
  unsigned int minThreads() const { return arg.threads; } // number of active threads required by this dslash
65 
  /**
     @brief Set the dslash argument's per-call state: the temporal
     projection scale and the ghost-zone pointers the accessors read.
     Must be called before every kernel launch.
     @param[in,out] arg Kernel argument struct to update
  */
  template <typename Arg> inline void setParam(Arg &arg)
  {
    // when packing in time the t-projector normalization differs by a factor of 2
    arg.t_proj_scale = getKernelPackT() ? 1.0 : 2.0;

    // Need to reset ghost pointers prior to every call since the
    // ghost buffer may have been changed during policy tuning.
    // Also, the accessor constructor calls Ghost(), which uses
    // ghost_buf, but this is only presently set with the
    // synchronous exchangeGhost.
    static void *ghost[8] = {}; // needs to be persistent across interior and exterior calls
    for (int dim = 0; dim < 4; dim++) {

      for (int dir = 0; dir < 2; dir++) {
        // if doing interior kernel, then this is the initial call,
        // so we set all ghost pointers else if doing exterior
        // kernel, then we only have to update the non-p2p ghosts,
        // since these may have been assigned to zero-copy memory
        if (!comm_peer2peer_enabled(dir, dim) || arg.kernel_type == INTERIOR_KERNEL) {
          // byte-offset into the ghost buffer for this dimension/direction
          ghost[2 * dim + dir] = (Float *)((char *)in.Ghost2() + in.GhostOffset(dim, dir) * in.GhostPrecision());
        }
      }
    }

    arg.in.resetGhost(in, ghost);
  }
91 
  // average over 10 kernel launches per candidate when autotuning
  virtual int tuningIter() const { return 10; }

  // autotune the block size in steps of 16 threads, starting from 16
  int blockStep() const { return 16; }
  int blockMin() const { return 16; }

  // allow the autotuner to use up to the device's maximum dynamic shared memory
  unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
98 
99 public:
100  template <typename T, typename Arg>
101  inline void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
102  {
103  if (deviceProp.major >= 7) { // should test whether this is always optimal on Volta
105  }
106  void *args[] = {&arg};
107  qudaLaunchKernel((const void *)f, tp.grid, tp.block, args, tp.shared_bytes, stream);
108  }
109 
  /**
     @brief Instantiate the KernelType template and dispatch the launch
     according to the runtime kernel type (interior vs the various
     exterior/halo kernels).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct (arg.kernel_type selects the case)
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      int nParity, bool dagger, bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {

    if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
      errorQuda("Not implemented");
    } else {
      switch (arg.kernel_type) {
      case INTERIOR_KERNEL:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, INTERIOR_KERNEL, Arg>::launch(*this, tp, arg, stream);
        break;
#ifdef MULTI_GPU
      // exterior (halo-update) kernels only exist in multi-GPU builds
      case EXTERIOR_KERNEL_X:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_X, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_Y:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Y, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_Z:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_Z, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_T:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_T, Arg>::launch(*this, tp, arg, stream);
        break;
      case EXTERIOR_KERNEL_ALL:
        Launch<Float, nDim, nColor, nParity, dagger, xpay, EXTERIOR_KERNEL_ALL, Arg>::launch(*this, tp, arg, stream);
        break;
      default: errorQuda("Unexpected kernel type %d", arg.kernel_type);
#else
      default: errorQuda("Unexpected kernel type %d for single-GPU build", arg.kernel_type);
#endif
      }
    }
  }
152 
  /**
     @brief Instantiate the dagger template: dispatch on the runtime
     arg.dagger flag to the compile-time dagger parameter (or hand the
     runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      int nParity, bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // NOTE(review): the assignment target (presumably Tunable::jitify_error,
    // as in the xpay overload) appears to have been lost in extraction —
    // confirm against the pristine header
    = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    if (arg.dagger)
      instantiate<Launch, nDim, nColor, nParity, true, xpay>(tp, arg, stream);
    else
      instantiate<Launch, nDim, nColor, nParity, false, xpay>(tp, arg, stream);
#endif
  }
179 
  /**
     @brief Instantiate the nParity template: dispatch on the runtime
     arg.nParity value (1 or 2) to the compile-time nParity parameter
     (or hand the runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      bool xpay, typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // NOTE(review): the assignment target (presumably Tunable::jitify_error,
    // as in the xpay overload) appears to have been lost in extraction —
    // confirm against the pristine header
    = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, arg.nParity, arg.dagger, xpay, arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    switch (arg.nParity) {
    case 1: instantiate<Launch, nDim, nColor, 1, xpay>(tp, arg, stream); break;
    case 2: instantiate<Launch, nDim, nColor, 2, xpay>(tp, arg, stream); break;
    default: errorQuda("nParity = %d undefined\n", arg.nParity);
    }
#endif
  }
207 
  /**
     @brief Instantiate the xpay template: dispatch on the runtime
     arg.xpay flag to the compile-time xpay parameter (or hand the
     runtime value straight to jitify when JITIFY is enabled).
     @param[in] tp Tuning parameters
     @param[in,out] arg Kernel argument struct
     @param[in] stream Stream on which the kernel is launched
  */
  template <template <typename, int, int, int, bool, bool, KernelType, typename> class Launch, int nDim, int nColor,
      typename Arg>
  inline void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
  {
#ifdef JITIFY
    using namespace jitify::reflection;
    const auto kernel = Launch<void, 0, 0, 0, false, false, INTERIOR_KERNEL, Arg>::kernel;
    // record any jitify failure so the autotuner can skip this configuration
    Tunable::jitify_error = program_->kernel(kernel)
    .instantiate(Type<Float>(), nDim, nColor, arg.nParity, arg.dagger, arg.xpay,
        arg.kernel_type, Type<Arg>())
    .configure(tp.grid, tp.block, tp.shared_bytes, stream)
    .launch(arg);
#else
    if (arg.xpay)
      instantiate<Launch, nDim, nColor, true>(tp, arg, stream);
    else
      instantiate<Launch, nDim, nColor, false>(tp, arg, stream);
#endif
  }
234 
235  DslashArg<Float> &dslashParam; // temporary addition for policy compatibility
236 
  /**
     @brief Constructor: records the fields and argument struct, sets
     the packing-kernel communication pattern, and fills the autotuning
     aux strings for every kernel type.
     @param[in,out] arg Dslash argument struct
     @param[in] out Output field (used for location checks and tuning save/restore)
     @param[in] in Input field
     @param[in] src Source-code string used to build the jitify program (JITIFY only)
  */
  Dslash(DslashArg<Float> &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src) :
    TunableVectorYZ(1, arg.nParity),
    arg(arg),
    out(out),
    in(in),
    nDimComms(4),
    dslashParam(arg)
  {
    if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION)
      errorQuda("CPU Fields not supported in Dslash framework yet");

    // this sets the communications pattern for the packing kernel
    setPackComms(arg.commDim);

    // strcpy(aux, in.AuxString());
    fillAuxBase();
#ifdef MULTI_GPU
    fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
    fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
    fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
    fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
    fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
    fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
#else
    fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
#endif // MULTI_GPU
    fillAux(KERNEL_POLICY, "policy");

#ifdef JITIFY
    // keep a local copy of the static program pointer (HOSTDEBUG workaround)
    create_jitify_program(src);
    program_ = program;
#endif
  }
270 
  // number of faces exchanged per dimension
  int Nface() const
  {
    return 2 * arg.nFace;
  } // factor of 2 is for forwards/backwards (convention used in dslash policy)
  // whether the dagger (Hermitian-conjugate) operator is applied
  int Dagger() const { return arg.dagger; }

  // read the autotuning aux string for the given kernel type
  const char *getAux(KernelType type) const { return aux[type]; }

  // overwrite the autotuning aux string for the given kernel type
  void setAux(KernelType type, const char *aux_) { strcpy(aux[type], aux_); }

  // append to the autotuning aux string for the given kernel type
  // NOTE(review): unbounded strcat — assumes extra fits in the remaining
  // TuneKey::aux_n space; confirm callers pass short strings
  void augmentAux(KernelType type, const char *extra) { strcat(aux[type], extra); }
282 
287  virtual void preTune()
288  {
289  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != KERNEL_POLICY) out.backup();
290  }
291 
295  virtual void postTune()
296  {
297  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != KERNEL_POLICY) out.restore();
298  }
299 
300  /*
301  per direction / dimension flops
302  spin project flops = Nc * Ns
303  SU(3) matrix-vector flops = (8 Nc - 2) * Nc
304  spin reconstruction flops = 2 * Nc * Ns (just an accumulation to all components)
305  xpay = 2 * 2 * Nc * Ns
306 
307  So for the full dslash we have, where for the final spin
308  reconstruct we have -1 since the first direction does not
309  require any accumulation.
310 
311  flops = (2 * Nd * Nc * Ns) + (2 * Nd * (Ns/2) * (8*Nc-2) * Nc) + ((2 * Nd - 1) * 2 * Nc * Ns)
312  flops_xpay = flops + 2 * 2 * Nc * Ns
313 
314  For Wilson this should give 1344 for Nc=3,Ns=2 and 1368 for the xpay equivalent
315  */
  /**
     @brief Floating-point operation count for the current kernel type,
     following the per-direction accounting in the comment block above.
     @return Total flops for this kernel invocation
  */
  virtual long long flops() const
  {
    int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
    int num_mv_multiply = in.Nspin() == 4 ? 2 : 1; // two half-spinor multiplies when spin-projecting
    int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
    int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
    int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

    long long flops_ = 0;

    // FIXME - should we count the xpay flops in the derived kernels
    // since some kernels require the xpay in the exterior (preconditiond clover)

    switch (arg.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T:
      // assumes EXTERIOR_KERNEL_{X,Y,Z,T} enumerate to 0..3 so they can
      // index GhostFace() directly — TODO confirm against the enum definition
      flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.GhostFace()[arg.kernel_type];
      break;
    case EXTERIOR_KERNEL_ALL: {
      // sum over all four halo faces, forwards and backwards
      long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
      flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL:
    case KERNEL_POLICY: {
      long long sites = in.Volume();
      flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
          num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies
          ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
        * sites; // accumulation
      if (arg.xpay) flops_ += xpay_flops * sites;

      // KERNEL_POLICY accounts for the full operator, so no halo correction
      if (arg.kernel_type == KERNEL_POLICY) break;
      // now correct for flops done by exterior kernel
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
      flops_ -= ghost_flops * ghost_sites;

      break;
    }
    }

    return flops_;
  }
363 
  /**
     @brief Byte traffic estimate for the current kernel type
     (gauge-field loads, spinor loads/stores, and ghost traffic).
     @return Total bytes moved by this kernel invocation
  */
  virtual long long bytes() const
  {
    int gauge_bytes = arg.reconstruct * in.Precision(); // bytes per gauge link (reconstruct = reals stored)
    // assumes QudaPrecision enum values equal the byte width of the type —
    // TODO confirm; true for half (2) and quarter (1) as compared here
    bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)) ? true : false;
    int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
    int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes; // spin-projected halves for Wilson-like
    int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
    int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

    long long bytes_ = 0;

    switch (arg.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
    case EXTERIOR_KERNEL_ALL: {
      long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
      bytes_ = ghost_bytes * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL:
    case KERNEL_POLICY: {
      long long sites = in.Volume();
      bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
      // NOTE(review): flops() scales its xpay term by sites, but here only a
      // single spinor's bytes are added — looks like a missing "* sites";
      // confirm against the upstream source before changing
      if (arg.xpay) bytes_ += spinor_bytes;

      // KERNEL_POLICY accounts for the full operator, so no halo correction
      if (arg.kernel_type == KERNEL_POLICY) break;
      // now correct for bytes done by exterior kernel
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
      bytes_ -= ghost_bytes * ghost_sites;

      break;
    }
    }
    return bytes_;
  }
403  };
404 
406  static constexpr QudaReconstructType recon0 = QUDA_RECONSTRUCT_NO;
407  static constexpr QudaReconstructType recon1 = QUDA_RECONSTRUCT_12;
408  static constexpr QudaReconstructType recon2 = QUDA_RECONSTRUCT_8;
409  };
410 
412  static constexpr QudaReconstructType recon0 = QUDA_RECONSTRUCT_NO;
413  static constexpr QudaReconstructType recon1 = QUDA_RECONSTRUCT_13;
414  static constexpr QudaReconstructType recon2 = QUDA_RECONSTRUCT_9;
415  };
416 
424  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, int nColor,
425  typename... Args>
426  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
427  {
428  if (U.Reconstruct() == Recon::recon0) {
429 #if QUDA_RECONSTRUCT & 4
430  Apply<Float, nColor, Recon::recon0>(out, in, U, args...);
431 #else
432  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-18", QUDA_RECONSTRUCT);
433 #endif
434  } else if (U.Reconstruct() == Recon::recon1) {
435 #if QUDA_RECONSTRUCT & 2
436  Apply<Float, nColor, Recon::recon1>(out, in, U, args...);
437 #else
438  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
439 #endif
440  } else if (U.Reconstruct() == Recon::recon2) {
441 #if QUDA_RECONSTRUCT & 1
442  Apply<Float, nColor, Recon::recon2>(out, in, U, args...);
443 #else
444  errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
445 #endif
446  } else {
447  errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct());
448  }
449  }
450 
458  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, typename... Args>
459  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
460  {
461  if (in.Ncolor() == 3) {
462  instantiate<Apply, Recon, Float, 3>(out, in, U, args...);
463  } else {
464  errorQuda("Unsupported number of colors %d\n", U.Ncolor());
465  }
466  }
467 
  /**
     @brief Instantiate the precision template: dispatch on the runtime
     gauge-field precision to the compile-time Float type, respecting the
     QUDA_PRECISION build mask (8 = double, 4 = single, 2 = half, 1 = quarter).
     @param[out] out Output result field
     @param[in] in Input field
     @param[in] U Gauge field whose precision is dispatched on
     @param[in] args Additional arguments forwarded to the Apply functor
  */
  template <template <typename, int, QudaReconstructType> class Apply, typename Recon = WilsonReconstruct, typename... Args>
  inline void instantiate(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, Args &&... args)
  {
    if (U.Precision() == QUDA_DOUBLE_PRECISION) {
#if QUDA_PRECISION & 8
      instantiate<Apply, Recon, double>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable double precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 4
      instantiate<Apply, Recon, float>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
      instantiate<Apply, Recon, short>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else if (U.Precision() == QUDA_QUARTER_PRECISION) {
#if QUDA_PRECISION & 1
      instantiate<Apply, Recon, char>(out, in, U, args...);
#else
      errorQuda("QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
#endif
    } else {
      errorQuda("Unsupported precision %d\n", U.Precision());
    }
  }
506 
507 } // namespace quda
virtual void postTune()
Restore the output field if doing exterior kernel.
Definition: dslash.h:295
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
Definition: dslash.h:101
KernelType kernel_type
unsigned int minThreads() const
Definition: dslash.h:64
void setParam(Arg &arg)
Definition: dslash.h:66
cudaDeviceProp deviceProp
bool getKernelPackT()
Definition: dslash_quda.cu:26
#define errorQuda(...)
Definition: util_quda.h:121
int Dagger() const
Definition: dslash.h:275
Helper file when using jitify run-time compilation. This file should be included in source code...
QudaPrecision GhostPrecision() const
const int nDimComms
Definition: dslash.h:20
cudaStream_t * stream
unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
Definition: dslash.h:97
void augmentAux(KernelType type, const char *extra)
Definition: dslash.h:281
const QudaReconstructType reconstruct
const char * comm_dim_partitioned_string(const int *comm_dim_override=0)
Return a string that defines the comm partitioning (used as a tuneKey)
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
Dslash(DslashArg< Float > &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src)
Definition: dslash.h:237
int_fastdiv threads
void setAux(KernelType type, const char *aux_)
Definition: dslash.h:279
int Ncolor() const
Definition: gauge_field.h:249
virtual long long bytes() const
Definition: dslash.h:364
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
Definition: tune_quda.h:181
void fillAux(KernelType kernel_type, const char *kernel_str)
Specialize the auxiliary strings for each kernel type.
Definition: dslash.h:56
const ColorSpinorField & in
Definition: dslash.h:18
const int nColor
Definition: covdev_test.cpp:75
void fillAuxBase()
Set the base strings used by the different dslash kernel types for autotuning.
Definition: dslash.h:36
bool tuneGridDim() const
Definition: dslash.h:63
CUresult jitify_error
Definition: tune_quda.h:276
#define checkLocation(...)
virtual void backup() const
Backs up the LatticeField.
DslashArg< Float > & dslashParam
Definition: dslash.h:235
const ColorSpinorField & out
Definition: dslash.h:17
int GhostOffset(const int i) const
bool comm_peer2peer_enabled(int dir, int dim)
QudaFieldLocation Location() const
int blockMin() const
Definition: dslash.h:95
const int nParity
Definition: spinor_noise.cu:25
int Nface() const
Definition: dslash.h:271
enum QudaReconstructType_s QudaReconstructType
DslashArg< Float > & arg
Definition: dslash.h:16
static const int aux_n
Definition: tune_key.h:12
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the KernelType template required for the multi-G...
Definition: dslash.h:119
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the nParity template.
Definition: dslash.h:189
const int * GhostFace() const
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
char aux_base[TuneKey::aux_n - 32]
Definition: dslash.h:22
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the xpay template.
Definition: dslash.h:217
virtual const void * Ghost2() const
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
Definition: tune_quda.h:198
void instantiate(TuneParam &tp, Arg &arg, const cudaStream_t &stream)
This instantiate function is used to instantiate the the dagger template.
Definition: dslash.h:162
char aux[8][TuneKey::aux_n]
Definition: dslash.h:23
virtual long long flops() const
Definition: dslash.h:316
QudaPrecision Precision() const
virtual void preTune()
Save the output field since the output field is both read from and written to in the exterior kernels...
Definition: dslash.h:287
QudaDagType dagger
Definition: test_util.cpp:1620
const char * getAux(KernelType type) const
Definition: dslash.h:277
int blockStep() const
Definition: dslash.h:94
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual void restore() const
Restores the LatticeField.
virtual int tuningIter() const
Definition: dslash.h:92
void setPackComms(const int *dim_pack)
Helper function that sets which dimensions the packing kernel should be packing for.
Definition: dslash_pack2.cu:14