QUDA
1.0.0
|
#include <dslash.h>
Public Member Functions | |
template<typename T , typename Arg > | |
void | launch (T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream) |
template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool dagger, bool xpay, typename Arg > | |
void | instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream) |
This instantiate function is used to instantiate the KernelType template required for the multi-GPU dslash kernels. More... | |
template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, int nParity, bool xpay, typename Arg > | |
void | instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream) |
This instantiate function is used to instantiate the dagger template. More... | |
template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, bool xpay, typename Arg > | |
void | instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream) |
This instantiate function is used to instantiate the nParity template. More... | |
template<template< typename, int, int, int, bool, bool, KernelType, typename > class Launch, int nDim, int nColor, typename Arg > | |
void | instantiate (TuneParam &tp, Arg &arg, const cudaStream_t &stream) |
This instantiate function is used to instantiate the xpay template. More... | |
Dslash (DslashArg< Float > &arg, const ColorSpinorField &out, const ColorSpinorField &in, const char *src) | |
int | Nface () const |
int | Dagger () const |
const char * | getAux (KernelType type) const |
void | setAux (KernelType type, const char *aux_) |
void | augmentAux (KernelType type, const char *extra) |
virtual void | preTune () |
Save the output field since the output field is both read from and written to in the exterior kernels. More... | |
virtual void | postTune () |
Restore the output field if doing exterior kernel. More... | |
virtual long long | flops () const |
virtual long long | bytes () const |
![]() | |
TunableVectorYZ (unsigned int vector_length_y, unsigned int vector_length_z) | |
bool | advanceBlockDim (TuneParam &param) const |
void | initTuneParam (TuneParam &param) const |
void | defaultTuneParam (TuneParam &param) const |
void | resizeVector (int y, int z) const |
void | resizeStep (int y, int z) const |
![]() | |
TunableVectorY (unsigned int vector_length_y) | |
void | resizeVector (int y) const |
void | resizeStep (int y) const |
![]() | |
Tunable () | |
virtual | ~Tunable () |
virtual TuneKey | tuneKey () const =0 |
virtual void | apply (const cudaStream_t &stream)=0 |
virtual std::string | paramString (const TuneParam &param) const |
virtual std::string | perfString (float time) const |
virtual bool | advanceTuneParam (TuneParam &param) const |
void | checkLaunchParam (TuneParam &param) |
CUresult | jitifyError () const |
CUresult & | jitifyError () |
Public Attributes | |
DslashArg< Float > & | dslashParam |
Protected Member Functions | |
void | fillAuxBase () |
Set the base strings used by the different dslash kernel types for autotuning. More... | |
void | fillAux (KernelType kernel_type, const char *kernel_str) |
Specialize the auxiliary strings for each kernel type. More... | |
bool | tuneGridDim () const |
unsigned int | minThreads () const |
template<typename Arg > | |
void | setParam (Arg &arg) |
virtual int | tuningIter () const |
int | blockStep () const |
int | blockMin () const |
unsigned int | maxSharedBytesPerBlock () const |
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need an explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overridden to return maxDynamicSharedBytesPerBlock. More... | |
![]() | |
virtual unsigned int | sharedBytesPerThread () const |
virtual unsigned int | sharedBytesPerBlock (const TuneParam &param) const |
![]() | |
virtual bool | tuneAuxDim () const |
virtual bool | tuneSharedBytes () const |
virtual bool | advanceGridDim (TuneParam &param) const |
virtual unsigned int | maxBlockSize (const TuneParam &param) const |
virtual unsigned int | maxGridSize () const |
virtual unsigned int | minGridSize () const |
virtual int | gridStep () const |
gridStep sets the step size when iterating the grid size in advanceGridDim. More... | |
virtual void | resetBlockDim (TuneParam &param) const |
unsigned int | maxBlocksPerSM () const |
For some reason this can't be queried from the device properties, so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More... | |
template<typename F > | |
void | setMaxDynamicSharedBytesPerBlock (F *func) const |
Enable the maximum dynamic shared bytes for the kernel "func" (values given by maxDynamicSharedBytesPerBlock()). More... | |
unsigned int | maxDynamicSharedBytesPerBlock () const |
This can't be correctly queried in CUDA for all architectures so here we set this. Based on Table 14 of the CUDA Programming Guide 10.0 (Technical Specifications per Compute Capability). More... | |
virtual bool | advanceSharedBytes (TuneParam &param) const |
virtual bool | advanceAux (TuneParam &param) const |
int | writeAuxString (const char *format,...) |
Protected Attributes | |
DslashArg< Float > & | arg |
const ColorSpinorField & | out |
const ColorSpinorField & | in |
const int | nDimComms |
char | aux_base [TuneKey::aux_n - 32] |
char | aux [8][TuneKey::aux_n] |
![]() | |
unsigned int | vector_length_y |
unsigned int | step_y |
bool | tune_block_x |
![]() | |
char | aux [TuneKey::aux_n] |
CUresult | jitify_error |
|
inline |
Definition at line 237 of file dslash.h.
References checkLocation, quda::DslashArg< Float >::commDim, errorQuda, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::Dslash< Float >::fillAux(), quda::Dslash< Float >::fillAuxBase(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, QUDA_CPU_FIELD_LOCATION, and quda::setPackComms().
|
inline |
Definition at line 281 of file dslash.h.
Referenced by quda::dslash::setMappedGhost().
|
inline protected virtual |
Reimplemented from quda::Tunable.
|
inline protected virtual |
Reimplemented from quda::Tunable.
|
inline virtual |
Reimplemented from quda::Tunable.
Reimplemented in quda::Laplace< Float, nDim, nColor, Arg >, quda::Staggered< Float, nDim, nColor, Arg >, quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >, quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >, quda::DomainWall5D< Float, nDim, nColor, Arg >, quda::TwistedClover< Float, nDim, nColor, Arg >, and quda::WilsonClover< Float, nDim, nColor, Arg >.
Definition at line 364 of file dslash.h.
References quda::DslashArg< Float >::commDim, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::ColorSpinorField::GhostFace(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, quda::DslashArg< Float >::kernel_type, quda::ColorSpinorField::Ncolor(), quda::ColorSpinorField::Nspin(), quda::LatticeField::Precision(), quda::DslashArg< Float >::reconstruct, quda::ColorSpinorField::Volume(), and quda::DslashArg< Float >::xpay.
Referenced by quda::DomainWall5D< Float, nDim, nColor, Arg >::bytes(), quda::WilsonClover< Float, nDim, nColor, Arg >::bytes(), quda::TwistedClover< Float, nDim, nColor, Arg >::bytes(), quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >::bytes(), quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >::bytes(), and quda::dslash::DslashPolicyTune< Dslash >::bytes().
|
inline |
Definition at line 275 of file dslash.h.
References quda::DslashArg< Float >::dagger.
Referenced by quda::dslash::commsComplete(), quda::dslash::issueGather(), quda::dslash::issuePack(), quda::dslash::issueRecv(), quda::dslash::DslashBasic< Dslash >::operator()(), quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashGDR< Dslash >::operator()(), quda::dslash::DslashFusedGDR< Dslash >::operator()(), quda::dslash::DslashGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopy< Dslash >::operator()(), and quda::dslash::DslashFusedZeroCopy< Dslash >::operator()().
|
inline protected |
Specialize the auxiliary strings for each kernel type.
[in] | kernel_type | The kernel_type we are generating the string for |
[in] | kernel_str | String corresponding to the kernel type |
Definition at line 56 of file dslash.h.
References quda::TuneKey::aux_n, comm_dim_partitioned_string(), and quda::INTERIOR_KERNEL.
Referenced by quda::Dslash< Float >::Dslash().
|
inline protected |
Set the base strings used by the different dslash kernel types for autotuning.
Definition at line 36 of file dslash.h.
References quda::DslashArg< Float >::commDim, quda::DslashArg< Float >::dagger, and quda::DslashArg< Float >::xpay.
Referenced by quda::Dslash< Float >::Dslash().
|
inline virtual |
Implements quda::Tunable.
Reimplemented in quda::NdegTwistedMassPreconditioned< Float, nDim, nColor, Arg >, quda::Staggered< Float, nDim, nColor, Arg >, quda::TwistedMassPreconditioned< Float, nDim, nColor, Arg >, quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >, quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >, quda::Laplace< Float, nDim, nColor, Arg >, quda::NdegTwistedMass< Float, nDim, nColor, Arg >, quda::TwistedClover< Float, nDim, nColor, Arg >, quda::WilsonClover< Float, nDim, nColor, Arg >, quda::TwistedMass< Float, nDim, nColor, Arg >, and quda::DomainWall5D< Float, nDim, nColor, Arg >.
Definition at line 316 of file dslash.h.
References quda::DslashArg< Float >::commDim, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::ColorSpinorField::GhostFace(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, quda::DslashArg< Float >::kernel_type, quda::ColorSpinorField::Ncolor(), quda::ColorSpinorField::Nspin(), quda::ColorSpinorField::Volume(), and quda::DslashArg< Float >::xpay.
Referenced by quda::DomainWall5D< Float, nDim, nColor, Arg >::flops(), quda::TwistedMass< Float, nDim, nColor, Arg >::flops(), quda::TwistedClover< Float, nDim, nColor, Arg >::flops(), quda::WilsonClover< Float, nDim, nColor, Arg >::flops(), quda::NdegTwistedMass< Float, nDim, nColor, Arg >::flops(), quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >::flops(), quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >::flops(), quda::TwistedMassPreconditioned< Float, nDim, nColor, Arg >::flops(), quda::NdegTwistedMassPreconditioned< Float, nDim, nColor, Arg >::flops(), and quda::dslash::DslashPolicyTune< Dslash >::flops().
|
inline |
Definition at line 277 of file dslash.h.
Referenced by quda::dslash::setMappedGhost().
|
inline |
This instantiate function is used to instantiate the KernelType template required for the multi-GPU dslash kernels.
[in] | tp | The tuning parameters to use for this kernel |
[in,out] | arg | The argument struct for the kernel |
[in] | stream | The cudaStream_t where the kernel will run |
Definition at line 119 of file dslash.h.
References errorQuda, quda::EXTERIOR_KERNEL_ALL, quda::EXTERIOR_KERNEL_T, quda::EXTERIOR_KERNEL_X, quda::EXTERIOR_KERNEL_Y, quda::EXTERIOR_KERNEL_Z, quda::INTERIOR_KERNEL, quda::LatticeField::Location(), nColor, QUDA_CPU_FIELD_LOCATION, and quda::blas::xpay().
Referenced by quda::DomainWall4D< Float, nDim, nColor, Arg >::apply().
|
inline |
This instantiate function is used to instantiate the dagger template.
[in] | tp | The tuning parameters to use for this kernel |
[in,out] | arg | The argument struct for the kernel |
[in] | stream | The cudaStream_t where the kernel will run |
Definition at line 162 of file dslash.h.
References quda::Dslash< Float >::arg, quda::TuneParam::block, quda::TuneParam::grid, quda::Tunable::jitify_error, nColor, quda::TuneParam::shared_bytes, quda::stream, and quda::blas::xpay().
|
inline |
This instantiate function is used to instantiate the nParity template.
[in] | tp | The tuning parameters to use for this kernel |
[in,out] | arg | The argument struct for the kernel |
[in] | stream | The cudaStream_t where the kernel will run |
Definition at line 189 of file dslash.h.
References quda::Dslash< Float >::arg, quda::TuneParam::block, errorQuda, quda::TuneParam::grid, quda::Tunable::jitify_error, nColor, quda::Arg< real, Ns, Nc, order >::nParity, quda::TuneParam::shared_bytes, and quda::stream.
|
inline |
This instantiate function is used to instantiate the xpay template.
[in] | tp | The tuning parameters to use for this kernel |
[in,out] | arg | The argument struct for the kernel |
[in] | stream | The cudaStream_t where the kernel will run |
Definition at line 217 of file dslash.h.
References quda::Dslash< Float >::arg, quda::TuneParam::block, quda::TuneParam::grid, quda::Tunable::jitify_error, quda::Arg< real, Ns, Nc, order >::nParity, quda::TuneParam::shared_bytes, and quda::stream.
|
inline |
Definition at line 101 of file dslash.h.
References quda::TuneParam::block, dagger, deviceProp, quda::TuneParam::grid, nColor, quda::qudaLaunchKernel(), quda::Tunable::setMaxDynamicSharedBytesPerBlock(), quda::TuneParam::shared_bytes, and quda::blas::xpay().
Referenced by quda::DomainWall5DLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::TwistedMassLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::TwistedMassPreconditionedLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::WilsonCloverLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::WilsonCloverPreconditionedLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::NdegTwistedMassLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::NdegTwistedMassPreconditionedLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::TwistedCloverPreconditionedLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::TwistedCloverLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::StaggeredLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::DomainWall4DLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), quda::WilsonLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch(), and quda::LaplaceLaunch< Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg >::launch().
|
inline protected virtual |
The maximum shared memory that a CUDA thread block can use in the autotuner. This isn't necessarily the same as maxDynamicSharedBytesPerBlock since that may need an explicit opt-in to enable (by calling setMaxDynamicSharedBytesPerBlock for the kernel in question). If the CUDA kernel in question does this opt-in then this function can be overridden to return maxDynamicSharedBytesPerBlock.
Reimplemented from quda::Tunable.
Definition at line 97 of file dslash.h.
References quda::Tunable::maxDynamicSharedBytesPerBlock().
|
inline protected virtual |
Reimplemented from quda::Tunable.
Definition at line 64 of file dslash.h.
References quda::DslashArg< Float >::threads.
|
inline |
Definition at line 271 of file dslash.h.
References quda::DslashArg< Float >::nFace.
Referenced by quda::dslash::commsComplete(), quda::dslash::issueGather(), quda::dslash::issuePack(), quda::dslash::issueRecv(), quda::dslash::DslashBasic< Dslash >::operator()(), quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashGDR< Dslash >::operator()(), quda::dslash::DslashFusedGDR< Dslash >::operator()(), quda::dslash::DslashGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopy< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopy< Dslash >::operator()(), and quda::dslash::setFusedParam().
|
inline virtual |
Restore the output field if doing exterior kernel.
Reimplemented from quda::Tunable.
Definition at line 295 of file dslash.h.
References quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, quda::DslashArg< Float >::kernel_type, and quda::LatticeField::restore().
Referenced by quda::dslash::DslashPolicyTune< Dslash >::postTune().
|
inline virtual |
Save the output field since the output field is both read from and written to in the exterior kernels.
Reimplemented from quda::Tunable.
Definition at line 287 of file dslash.h.
References quda::LatticeField::backup(), quda::INTERIOR_KERNEL, quda::KERNEL_POLICY, and quda::DslashArg< Float >::kernel_type.
Referenced by quda::dslash::DslashPolicyTune< Dslash >::preTune().
|
inline |
Definition at line 279 of file dslash.h.
Referenced by quda::dslash::setMappedGhost().
|
inline protected |
Definition at line 66 of file dslash.h.
References comm_peer2peer_enabled(), quda::getKernelPackT(), quda::ColorSpinorField::Ghost2(), quda::ColorSpinorField::GhostOffset(), quda::LatticeField::GhostPrecision(), and quda::INTERIOR_KERNEL.
Referenced by quda::DomainWall5D< Float, nDim, nColor, Arg >::apply(), quda::TwistedMass< Float, nDim, nColor, Arg >::apply(), quda::Staggered< Float, nDim, nColor, Arg >::apply(), quda::WilsonClover< Float, nDim, nColor, Arg >::apply(), quda::TwistedClover< Float, nDim, nColor, Arg >::apply(), quda::NdegTwistedMass< Float, nDim, nColor, Arg >::apply(), quda::DomainWall4D< Float, nDim, nColor, Arg >::apply(), quda::TwistedCloverPreconditioned< Float, nDim, nColor, Arg >::apply(), quda::WilsonCloverPreconditioned< Float, nDim, nColor, Arg >::apply(), quda::Wilson< Float, nDim, nColor, Arg >::apply(), quda::TwistedMassPreconditioned< Float, nDim, nColor, Arg >::apply(), quda::Laplace< Float, nDim, nColor, Arg >::apply(), and quda::NdegTwistedMassPreconditioned< Float, nDim, nColor, Arg >::apply().
|
inline protected virtual |
Reimplemented from quda::Tunable.
|
inline protected virtual |
Reimplemented from quda::Tunable.
|
protected |
Definition at line 16 of file dslash.h.
Referenced by quda::Dslash< Float >::instantiate().
|
protected |
|
protected |
DslashArg<Float>& quda::Dslash< Float >::dslashParam |
Definition at line 235 of file dslash.h.
Referenced by quda::dslash::issueGather(), quda::dslash::issuePack(), quda::dslash::issueRecv(), quda::dslash::DslashBasic< Dslash >::operator()(), quda::dslash::DslashFusedExterior< Dslash >::operator()(), quda::dslash::DslashGDR< Dslash >::operator()(), quda::dslash::DslashFusedGDR< Dslash >::operator()(), quda::dslash::DslashGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPack< Dslash >::operator()(), quda::dslash::DslashZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopyPackGDRRecv< Dslash >::operator()(), quda::dslash::DslashZeroCopy< Dslash >::operator()(), quda::dslash::DslashFusedZeroCopy< Dslash >::operator()(), quda::dslash::DslashNC< Dslash >::operator()(), quda::dslash::setFusedParam(), and quda::dslash::setMappedGhost().
|
protected |
Definition at line 18 of file dslash.h.
Referenced by quda::instantiate().
|
protected |
|
protected |
Definition at line 17 of file dslash.h.
Referenced by quda::instantiate().