33 long long Ls = meta.
X(4);
34 long long bulk = (Ls - 2) * (meta.
Volume() /
Ls);
35 long long wall = 2 * meta.
Volume() /
Ls;
40 case DSLASH5_DWF: flops_ = n * (8ll * bulk + 10ll * wall + (arg.xpay ? 4ll * meta.
Volume() : 0));
break;
42 flops_ = n * (8ll * bulk + 10ll * wall + 14ll * meta.
Volume() + (arg.xpay ? 8ll * meta.
Volume() : 0));
45 flops_ = n * (8ll * bulk + 10ll * wall + 8ll * meta.
Volume() + (arg.xpay ? 8ll * meta.
Volume() : 0));
50 flops_ = (144 * Ls + (arg.xpay ? 4ll : 0)) * meta.
Volume();
54 flops_ = (144 * Ls + (arg.xpay ? 8ll : 0)) * meta.
Volume();
56 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
64 long long Ls = meta.
X(4);
66 case DSLASH5_DWF:
return arg.out.Bytes() + 2 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
67 case DSLASH5_MOBIUS_PRE:
return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
68 case DSLASH5_MOBIUS:
return arg.out.Bytes() + 3 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
69 case M5_INV_DWF:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
70 case M5_INV_MOBIUS:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
71 case M5_INV_ZMOBIUS:
return arg.out.Bytes() + Ls * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
72 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
78 unsigned int minThreads()
const {
return arg.volume_4d_cb; }
85 int nSpin = var_inverse ? meta.
Nspin() / 2 : meta.
Nspin();
106 if (arg.dagger) strcat(
aux,
",Dagger");
107 if (arg.xpay) strcat(
aux,
",xpay");
115 default:
errorQuda(
"Unknown Dslash5Type %d", arg.type);
126 void *args[] = {&arg};
133 errorQuda(
"CPU variant not instantiated");
138 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_DWF, Arg>, tp, arg, stream) :
139 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_DWF, Arg>, tp, arg, stream);
141 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_DWF, Arg>, tp, arg, stream) :
142 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_DWF, Arg>, tp, arg, stream);
145 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
146 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
148 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream) :
149 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS_PRE, Arg>, tp, arg, stream);
152 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
153 launch(dslash5GPU<Float, nColor, false, true, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
155 arg.dagger ?
launch(dslash5GPU<Float, nColor, true, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream) :
156 launch(dslash5GPU<Float, nColor, false, false, DSLASH5_MOBIUS, Arg>, tp, arg, stream);
160 launch(dslash5invGPU<Float, nColor, true, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
161 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
164 launch(dslash5invGPU<Float, nColor, true, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream) :
165 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_DWF, shared, var_inverse, Arg>, tp, arg, stream);
169 dslash5invGPU<Float, nColor, true, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
170 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp,
174 dslash5invGPU<Float, nColor, true, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
175 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_MOBIUS, shared, var_inverse, Arg>, tp,
180 dslash5invGPU<Float, nColor, true, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
181 launch(dslash5invGPU<Float, nColor, false, true, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp,
185 dslash5invGPU<Float, nColor, true, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>, tp, arg, stream) :
186 launch(dslash5invGPU<Float, nColor, false, false, M5_INV_ZMOBIUS, shared, var_inverse, Arg>,
196 param.
block.y = arg.Ls;
206 param.
block.y = arg.Ls;
215 template <
typename Float,
int nColor>
219 Dslash5Arg<Float, nColor> arg(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type);
225 template <
typename Float>
230 case 3: ApplyDslash5<Float, 3>(
out,
in, x, m_f, m_5, b_5, c_5, a,
dagger, type);
break;
231 default:
errorQuda(
"Unsupported number of colors %d\n", in.Ncolor());
240 #ifdef GPU_DOMAIN_WALL_DIRAC 249 default:
errorQuda(
"Unsupported precision %d\n", in.Precision());
252 errorQuda(
"Domain wall dslash has not been built");
const char * AuxString() const
QudaVerbosity getVerbosity()
static constexpr bool var_inverse
#define checkPrecision(...)
static constexpr bool shared
void defaultTuneParam(TuneParam &param) const
const char * VolString() const
void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type)
Apply either the domain-wall / mobius Dslash5 operator or the M5 inverse operator. In the current implementation, it is expected that the color-spinor fields are 4-d preconditioned.
void apply(const cudaStream_t &stream)
Dslash5(Arg &arg, const ColorSpinorField &meta)
const ColorSpinorField & meta
unsigned int sharedBytesPerThread() const
void setMaxDynamicSharedBytesPerBlock(F *func) const
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesP...
unsigned int minThreads() const
QudaPCType PCType() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
std::complex< double > Complex
void initTuneParam(TuneParam &param) const
void initTuneParam(TuneParam &param) const
QudaFieldLocation Location() const
cpuColorSpinorField * out
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
unsigned int maxDynamicSharedBytesPerBlock() const
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability).
Parameter structure for applying the Dslash.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
virtual unsigned int maxSharedBytesPerBlock() const
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily t...
void defaultTuneParam(TuneParam &param) const