// dslash_pack2.cu (QUDA 1.0.0)
#include <color_spinor_field.h>

// STRIPED - spread the blocks throughout the workload to ensure we
// work on all directions/dimensions simultaneously to maximize NVLink saturation
#define STRIPED
// if not STRIPED then this means we assign one thread block per direction / dimension

#include <dslash_quda.h>
#include <kernels/dslash_pack.cuh>

namespace quda
{

  void setPackComms(const int *comm_dim)
  {
    for (int i = 0; i < 4; i++) commDim[i] = comm_dim[i];
    for (int i = 4; i < QUDA_MAX_DIM; i++) commDim[i] = 0;
  }
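
  // Example usage (a hedged sketch, not part of this file): callers are
  // expected to seed commDim with the partitioned communication dimensions
  // before any packing kernel is launched, e.g.
  //
  //   int comms[QUDA_MAX_DIM];
  //   for (int d = 0; d < 4; d++) comms[d] = comm_dim_partitioned(d);
  //   setPackComms(comms);
  //
  // where comm_dim_partitioned(d) returns 1 if dimension d is split across
  // ranks and 0 otherwise.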

  template <typename Float, int nSpin, int nColor, bool spin_project>
  std::ostream &operator<<(std::ostream &out, const PackArg<Float, nSpin, nColor, spin_project> &arg)
  {
    out << "parity = " << arg.parity << std::endl;
    out << "nParity = " << arg.nParity << std::endl;
    out << "pc_type = " << arg.pc_type << std::endl;
    out << "nFace = " << arg.nFace << std::endl;
    out << "dagger = " << arg.dagger << std::endl;
    out << "a = " << arg.a << std::endl;
    out << "b = " << arg.b << std::endl;
    out << "c = " << arg.c << std::endl;
    out << "twist = " << arg.twist << std::endl;
    out << "threads = " << arg.threads << std::endl;
    out << "threadDimMapLower = { ";
    for (int i = 0; i < 4; i++) out << arg.threadDimMapLower[i] << (i < 3 ? ", " : " }");
    out << std::endl;
    out << "threadDimMapUpper = { ";
    for (int i = 0; i < 4; i++) out << arg.threadDimMapUpper[i] << (i < 3 ? ", " : " }");
    out << std::endl;
    out << "sites_per_block = " << arg.sites_per_block << std::endl;
    return out;
  }

  // FIXME - add CPU variant

  template <typename Float, int nColor, bool spin_project> class Pack : TunableVectorYZ
  {

  protected:
    void **ghost;
    const ColorSpinorField &in;
    MemoryLocation location;
    const int nFace;
    const bool dagger; // only has meaning for nSpin=4
    const int parity;
    const int nParity;
    int threads;
    const double a;
    const double b;
    const double c;
    int twist; // only has meaning for nSpin=4

    bool tuneGridDim() const { return true; } // If striping, always tune grid dimension

    unsigned int maxGridSize() const
    {
      if (location & Host) {
#ifdef STRIPED
        // if zero-copy policy then set a maximum number of blocks to be
        // the 3 * number of dimensions we are communicating
        int max = 3;
#else
        // if zero-copy policy then assign exactly up to four thread blocks
        // per direction per dimension (effectively no grid-size tuning)
        int max = 2 * 4;
#endif
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return max * nDimComms;
      } else {
        return TunableVectorYZ::maxGridSize();
      }
    } // use no more than a quarter of the GPU

    unsigned int minGridSize() const
    {
      if (location & Host) {
#ifdef STRIPED
        // if zero-copy policy then set a minimum number of blocks to be
        // the 3 * number of dimensions we are communicating
        int min = 3;
#else
        // if zero-copy policy then assign exactly one thread block
        // per direction per dimension (effectively no grid-size tuning)
        int min = 2;
#endif
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return min * nDimComms;
      } else {
        return TunableVectorYZ::minGridSize();
      }
    }

    int gridStep() const
    {
#ifdef STRIPED
      return TunableVectorYZ::gridStep();
#else
      if (location & Host) {
        // the shmem kernel must ensure the grid size autotuner
        // increments in steps of 2 * the number of partitioned dimensions
        // for equal division of blocks to each direction/dimension
        int nDimComms = 0;
        for (int d = 0; d < in.Ndim(); d++) nDimComms += commDim[d];
        return 2 * nDimComms;
      } else {
        return TunableVectorYZ::gridStep();
      }
#endif
    }

    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
    unsigned int minThreads() const { return threads; }

    void fillAux()
    {
      strcpy(aux, "policy_kernel,");
      strcat(aux, in.AuxString());
      char comm[5];
      for (int i = 0; i < 4; i++) comm[i] = (commDim[i] ? '1' : '0');
      comm[4] = '\0';
      strcat(aux, ",comm=");
      strcat(aux, comm);
      strcat(aux, comm_dim_topology_string());
      if (in.PCType() == QUDA_5D_PC) { strcat(aux, ",5D_pc"); }
      if (dagger && in.Nspin() == 4) { strcat(aux, ",dagger"); }
      if (getKernelPackT()) { strcat(aux, ",kernelPackT"); }
      switch (nFace) {
      case 1: strcat(aux, ",nFace=1"); break;
      case 3: strcat(aux, ",nFace=3"); break;
      default: errorQuda("Number of faces not supported");
      }

      twist = ((b != 0.0) ? (c != 0.0 ? 2 : 1) : 0);
      if (twist && a == 0.0) errorQuda("Twisted packing requires non-zero scale factor a");
      if (twist) strcat(aux, twist == 2 ? ",twist-doublet" : ",twist-singlet");

#ifndef STRIPED
      if (location & Host) strcat(aux, ",shmem");
#endif

      // label the locations we are packing to
      // location label is nonp2p-p2p
      switch ((int)location) {
      case Device | Remote: strcat(aux, ",device-remote"); break;
      case Host | Remote: strcat(aux, ",host-remote"); break;
      case Device: strcat(aux, ",device-device"); break;
      case Host: strcat(aux, comm_peer2peer_enabled_global() ? ",host-device" : ",host-host"); break;
      default: errorQuda("Unknown pack target location %d\n", location);
      }
    }
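
    // For illustration (a hedged example, not emitted by the code): with all
    // four dimensions partitioned, nFace=1, and a device-to-device pack, the
    // aux string built above might look like
    //   policy_kernel,<field-aux>,comm=1111,<topology>,nFace=1,device-device
    // where <field-aux> and <topology> stand in for in.AuxString() and
    // comm_dim_topology_string() respectively.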

  public:
    Pack(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
         double a, double b, double c) :
      TunableVectorYZ((in.Ndim() == 5 ? in.X(4) : 1), in.SiteSubset()),
      ghost(ghost),
      in(in),
      location(location),
      nFace(nFace),
      dagger(dagger),
      parity(parity),
      nParity(in.SiteSubset()),
      threads(0),
      a(a),
      b(b),
      c(c)
    {
      fillAux();

      // compute number of threads - really number of active work items we have to do
      for (int i = 0; i < 4; i++) {
        if (!commDim[i]) continue;
        if (i == 3 && !getKernelPackT()) continue;
        threads += 2 * nFace * in.getDslashConstant().ghostFaceCB[i]; // 2 for forwards and backwards faces
      }
    }
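
    // A hedged worked example (assuming a 32^4 local volume, all four
    // dimensions partitioned, nFace = 1, and kernel T-packing enabled): each
    // checkerboarded face has 32^3 / 2 = 16384 sites, so the loop above gives
    //   threads = 4 dims * 2 directions * 1 face * 16384 = 131072
    // work items for the packing kernel.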

    virtual ~Pack() {}

    template <typename T, typename Arg>
    inline void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
    {
      if (deviceProp.major >= 7) { // enable max shared memory mode on GPUs that support it
        this->setMaxDynamicSharedBytesPerBlock(f);
      }

      void *args[] = {&arg};
      qudaLaunchKernel((const void *)f, tp.grid, tp.block, args, tp.shared_bytes, stream);
    }

    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if (in.Nspin() == 4) {
        using Arg = PackArg<Float, 4, nColor, spin_project>;
        Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
        arg.swizzle = tp.aux.x;
        arg.sites_per_block = (arg.threads + tp.grid.x - 1) / tp.grid.x;
        arg.blocks_per_dir = tp.grid.x / (2 * arg.active_dims); // set number of blocks per direction

#ifdef STRIPED
        if (in.PCType() == QUDA_4D_PC) {
          if (arg.dagger) {
            switch (arg.twist) {
            case 0: launch(packKernel<true, 0, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            case 1: launch(packKernel<true, 1, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            case 2: launch(packKernel<true, 2, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            }
          } else {
            switch (arg.twist) {
            case 0: launch(packKernel<false, 0, QUDA_4D_PC, Arg>, tp, arg, stream); break;
            default: errorQuda("Twisted packing only for dagger");
            }
          }
        } else if (arg.pc_type == QUDA_5D_PC) {
          if (arg.twist) errorQuda("Twist packing not defined");
          if (arg.dagger) {
            launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          } else {
            launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          }
        } else {
          errorQuda("Unexpected preconditioning type %d", in.PCType());
        }
#else
        if (in.PCType() == QUDA_4D_PC) {
          if (arg.dagger) {
            switch (arg.twist) {
            case 0:
              launch(location & Host ? packShmemKernel<true, 0, QUDA_4D_PC, Arg> : packKernel<true, 0, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            case 1:
              launch(location & Host ? packShmemKernel<true, 1, QUDA_4D_PC, Arg> : packKernel<true, 1, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            case 2:
              launch(location & Host ? packShmemKernel<true, 2, QUDA_4D_PC, Arg> : packKernel<true, 2, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            }
          } else {
            switch (arg.twist) {
            case 0:
              launch(location & Host ? packShmemKernel<false, 0, QUDA_4D_PC, Arg> : packKernel<false, 0, QUDA_4D_PC, Arg>,
                     tp, arg, stream);
              break;
            default: errorQuda("Twisted packing only for dagger");
            }
          }
        } else if (arg.pc_type == QUDA_5D_PC) {
          if (arg.twist) errorQuda("Twist packing not defined");
          if (arg.dagger) {
            launch(packKernel<true, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          } else {
            launch(packKernel<false, 0, QUDA_5D_PC, Arg>, tp, arg, stream);
          }
        }
#endif
      } else if (in.Nspin() == 1) {
        using Arg = PackArg<Float, 1, nColor, false>;
        Arg arg(ghost, in, nFace, dagger, parity, threads, a, b, c);
        arg.swizzle = tp.aux.x;
        arg.sites_per_block = (arg.threads + tp.grid.x - 1) / tp.grid.x;
        arg.blocks_per_dir = tp.grid.x / (2 * arg.active_dims); // set number of blocks per direction

#ifdef STRIPED
        launch(packStaggeredKernel<Arg>, tp, arg, stream);
#else
        launch(location & Host ? packStaggeredShmemKernel<Arg> : packStaggeredKernel<Arg>, tp, arg, stream);
#endif
      } else {
        errorQuda("Unsupported nSpin = %d\n", in.Nspin());
      }
    }

    bool tuneSharedBytes() const { return false; }

#if 0
    // not used at present, but if tuneSharedBytes is enabled then
    // this allows tuning up the full dynamic shared memory if needed
    unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
#endif

    void initTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::initTuneParam(param);
      // if doing a zero-copy policy then ensure that each thread block
      // runs exclusively on a given SM - this is to ensure quality of
      // service for the packing kernel when running concurrently.
      if (location & Host) param.shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1;
#ifndef STRIPED
      if (location & Host) param.grid.x = minGridSize();
#endif
    }
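
    // Why shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1: requesting
    // just over half of the maximum dynamic shared memory means two such
    // blocks cannot co-reside on one SM. A hedged example: if the device
    // exposes 96 KiB of dynamic shared memory per SM, each block requests
    // 48 KiB + 1 byte, limiting occupancy to one block per SM.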

    void defaultTuneParam(TuneParam &param) const
    {
      TunableVectorYZ::defaultTuneParam(param);
      // if doing a zero-copy policy then ensure that each thread block
      // runs exclusively on a given SM - this is to ensure quality of
      // service for the packing kernel when running concurrently.
      if (location & Host) param.shared_bytes = maxDynamicSharedBytesPerBlock() / 2 + 1;
#ifndef STRIPED
      if (location & Host) param.grid.x = minGridSize();
#endif
    }

    TuneKey tuneKey() const { return TuneKey(in.VolString(), typeid(*this).name(), aux); }

    int tuningIter() const { return 3; }

    long long flops() const
    {
      // unless we are spin projecting (nSpin = 4), there are no flops to do
      return in.Nspin() == 4 ? 2 * in.Nspin() / 2 * nColor * nParity * in.getDslashConstant().Ls * threads : 0;
    }

    long long bytes() const
    {
      size_t precision = sizeof(Float);
      size_t faceBytes = 2 * ((in.Nspin() == 4 ? in.Nspin() / 2 : in.Nspin()) + in.Nspin()) * nColor * precision;
      // comparing sizeof(Float) against QudaPrecision works because the enum
      // values are defined to equal the precision's size in bytes
      if (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION)
        faceBytes += 2 * sizeof(float); // 2 is from input and output
      return faceBytes * nParity * in.getDslashConstant().Ls * threads;
    }
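
    // A hedged worked example for bytes(): nSpin = 4, nColor = 3, single
    // precision (4 bytes). The kernel reads a full 4-spin spinor and writes
    // a spin-projected 2-spin spinor, so per work item
    //   faceBytes = 2 (complex) * (2 + 4) * 3 * 4 = 144 bytes,
    // which is then multiplied by nParity * Ls * threads.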
  };

  template <typename Float, int nColor>
  void PackGhost(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
                 bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    if (spin_project) {
      Pack<Float, nColor, true> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
      pack.apply(stream);
    } else {
      Pack<Float, nColor, false> pack(ghost, in, location, nFace, dagger, parity, a, b, c);
      pack.apply(stream);
    }
  }

  // template on the number of colors
  template <typename Float>
  void PackGhost(void *ghost[], const ColorSpinorField &in, MemoryLocation location, int nFace, bool dagger, int parity,
                 bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    if (in.Ncolor() == 3) {
      PackGhost<Float, 3>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else {
      errorQuda("Unsupported number of colors %d\n", in.Ncolor());
    }
  }

  // Pack the ghost for the Dslash operator
  void PackGhost(void *ghost[2 * QUDA_MAX_DIM], const ColorSpinorField &in, MemoryLocation location, int nFace,
                 bool dagger, int parity, bool spin_project, double a, double b, double c, const cudaStream_t &stream)
  {
    int nDimPack = 0;
    for (int d = 0; d < 4; d++) {
      if (!commDim[d]) continue;
      if (d != 3 || getKernelPackT()) nDimPack++;
    }

    if (!nDimPack) return; // if zero then we have nothing to pack

    if (in.Precision() == QUDA_DOUBLE_PRECISION) {
      PackGhost<double>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
      PackGhost<float>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_HALF_PRECISION) {
      PackGhost<short>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
      PackGhost<char>(ghost, in, location, nFace, dagger, parity, spin_project, a, b, c, stream);
    } else {
      errorQuda("Unsupported precision %d\n", in.Precision());
    }
  }

} // namespace quda
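
// Example call site (a hedged sketch, not part of this file): the dslash
// policy code is expected to invoke the packing routine along the lines of
//
//   void *packBuffer[2 * QUDA_MAX_DIM] = { /* per-direction ghost buffers */ };
//   quda::PackGhost(packBuffer, in, quda::Device, 1 /*nFace*/, dagger, parity,
//                   true /*spin_project*/, 1.0, 0.0, 0.0, stream);
//
// The MemoryLocation value, the coefficients a/b/c, and the buffer setup here
// are illustrative assumptions; consult dslash_quda.h for the real interface.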