QUDA  v1.1.0
A library for QCD on GPUs
dslash.h
1 #pragma once
2 
3 #include <typeinfo>
4 
5 #include <color_spinor_field.h>
6 #include <tune_quda.h>
7 #include <dslash_quda.h>
8 #include <dslash_helper.cuh>
9 #include <jitify_helper.cuh>
10 #include <instantiate.h>
11 #include <instantiate_dslash.h>
12 
13 namespace quda
14 {
15 
31  template <template <int, bool, bool, KernelType, typename> class D, typename Arg>
32  class Dslash : public TunableVectorYZ
33  {
34 
35  protected:
36  Arg &arg;
37  const ColorSpinorField &out;
38  const ColorSpinorField &in;
39 
40  const int nDimComms;
41 
42  char aux_base[TuneKey::aux_n - 32];
43  char aux[8][TuneKey::aux_n];
44  char aux_pack[TuneKey::aux_n];
45  char aux_barrier[TuneKey::aux_n];
46 
47  // pointers to ghost buffers we are packing to
48  void *packBuffer[4 * QUDA_MAX_DIM];
49 
50  std::string kernel_file;
55  inline void fillAuxBase()
56  {
57  char comm[5];
58  comm[0] = (arg.commDim[0] ? '1' : '0');
59  comm[1] = (arg.commDim[1] ? '1' : '0');
60  comm[2] = (arg.commDim[2] ? '1' : '0');
61  comm[3] = (arg.commDim[3] ? '1' : '0');
62  comm[4] = '\0';
63  strcpy(aux_base, ",commDim=");
64  strcat(aux_base, comm);
65 
66  if (arg.xpay) strcat(aux_base, ",xpay");
67  if (arg.dagger) strcat(aux_base, ",dagger");
68  }
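  // for example, with all four dimensions partitioned and an xpay, dagger
  // operator, the routine above yields aux_base = ",commDim=1111,xpay,dagger"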
69 
75  inline void fillAux(KernelType kernel_type, const char *kernel_str)
76  {
77  strcpy(aux[kernel_type], kernel_str);
78  if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type], comm_dim_partitioned_string());
79  strncat(aux[kernel_type], aux_base, TuneKey::aux_n - 1);
80  }
81 
82  virtual bool tuneGridDim() const { return arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0; }
83  virtual unsigned int minThreads() const { return arg.threads; }
84 
85  virtual unsigned int minGridSize() const
86  {
87  /* when using nvshmem we perform the exterior Dslash using a grid-strided loop, uniquely assign communication
88  * directions to CUDA blocks, and keep all communication directions resident. We therefore determine the number of
89  * communicating dimensions and ensure that the number of blocks is a multiple of the number of communicating directions (2 * nDimComms)
90  */
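  // for example, with 80 multiprocessors and all four dimensions partitioned
  // (2 * nDimComms = 8 directions) this yields (80 / 8) * 8 = 80 blocks, while
  // with three partitioned dimensions it yields (80 / 6) * 6 = 78 blocks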
91  if (arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0) {
92  int nDimComms = 0;
93  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
94  return ((deviceProp.multiProcessorCount) / (2 * nDimComms)) * (2 * nDimComms);
95  } else {
96  return TunableVectorYZ::minGridSize();
97  }
98  }
99 
100  virtual int gridStep() const
101  {
102  /* see comment for minGridSize above for gridStep choice when using nvshmem */
103  if (arg.kernel_type == EXTERIOR_KERNEL_ALL && arg.shmem > 0) {
104  int nDimComms = 0;
105  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
106  return ((deviceProp.multiProcessorCount) / (2 * nDimComms)) * (2 * nDimComms);
107  } else {
108  return TunableVectorYZ::gridStep();
109  }
110  }
111 
112  inline void setParam(TuneParam &tp)
113  {
114  arg.t_proj_scale = getKernelPackT() ? 1.0 : 2.0;
115 
116  // Need to reset ghost pointers prior to every call since the
117  // ghost buffer may have been changed during policy tuning.
118  // Also, the accessor constructor calls Ghost(), which uses
119  // ghost_buf, but this is only presently set with the
120  // synchronous exchangeGhost.
121  static void *ghost[8] = {}; // needs to be persistent across interior and exterior calls
122  for (int dim = 0; dim < 4; dim++) {
123 
124  for (int dir = 0; dir < 2; dir++) {
125  // if doing the interior kernel, then this is the initial call,
126  // so we set all ghost pointers; if doing an exterior
127  // kernel, we only have to update the non-p2p ghosts,
128  // since these may have been assigned to zero-copy memory
129  if (!comm_peer2peer_enabled(dir, dim) || arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL) {
130  ghost[2 * dim + dir] = (typename Arg::Float *)((char *)in.Ghost2() + in.GhostOffset(dim, dir));
131  }
132  }
133  }
134 
135  arg.in.resetGhost(in, ghost);
136 
137  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
138  arg.blocks_per_dir = tp.aux.x;
139  arg.setPack(true, this->packBuffer); // need to recompute for the updated blocks_per_dir
140  arg.in_pack.resetGhost(in, this->packBuffer);
141  tp.grid.x += arg.pack_blocks;
143  }
144  if (arg.shmem > 0 && arg.kernel_type == EXTERIOR_KERNEL_ALL) {
145  // if we are doing tuning we should not wait on the sync_arr to be set.
146  arg.counter = (activeTuning() && !policyTuning()) ? 2 : dslash::get_shmem_sync_counter();
147  }
148  if (arg.shmem > 0 && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
149  arg.counter = activeTuning() ?
152  arg.exterior_blocks = ((arg.shmem & 64) && arg.exterior_dims > 0) ?
153  ((deviceProp.multiProcessorCount) / (2 * arg.exterior_dims)) * (2 * arg.exterior_dims * tp.aux.y) :
154  0;
155  tp.grid.x += arg.exterior_blocks;
156  }
157  }
158 
159  virtual int tuningIter() const { return 10; }
160 
161  virtual int blockStep() const { return 16; }
162  virtual int blockMin() const { return 16; }
163 
164  unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); }
165 
166  virtual bool advanceAux(TuneParam &param) const
167  {
168  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) {
169 
170  int max_threads_per_dir = 0;
171  for (int i = 0; i < 4; ++i) {
172  max_threads_per_dir = std::max(max_threads_per_dir, (arg.threadDimMapUpper[i] - arg.threadDimMapLower[i]) / 2);
173  }
174  int nDimComms = 0;
175  for (int d = 0; d < 4; d++) nDimComms += arg.commDim[d];
176 
177  /* if doing the fused packing + interior kernel we tune how many blocks to use for communication */
178  // use up to a quarter of the GPU for packing (but always allow at least 4 blocks per dir)
179  const int max_blocks_per_dir = std::max((deviceProp.multiProcessorCount) / (8 * nDimComms), 4);
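  // e.g. with 80 multiprocessors and all four dimensions partitioned,
  // 80 / (8 * 4) = 2 blocks per direction (roughly a quarter of the device
  // across all 8 directions), so the floor of 4 blocks per direction applies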
180  if (param.aux.x + 1 <= max_blocks_per_dir
181  && (param.aux.x + 1) * param.block.x < (max_threads_per_dir + param.block.x - 1)) {
182  param.aux.x++;
183  return true;
184  } else {
185  param.aux.x = 1;
186  if (arg.exterior_dims > 0 && arg.shmem & 64) {
187  /* if doing a fused interior+exterior kernel we use aux.y to control the number of blocks we add for the
188  * exterior. We make sure to use multiple blocks per communication direction.
189  */
190  auto maxgridsize = TunableVectorYZ::maxGridSize();
191  if (param.aux.y < 4) {
192  param.aux.y++;
193  return true;
194  } else {
195  param.aux.y = 1;
196  return false;
197  }
198  }
199  return false;
200  }
201  } else {
202  return false;
203  }
204  }
205 
206  virtual bool advanceTuneParam(TuneParam &param) const
207  {
209  }
210 
211  virtual void initTuneParam(TuneParam &param) const
212  {
213  /* for nvshmem uber kernels the current synchronization requires us to keep the y and z dimensions local to the
214  * block. This can be removed when we introduce a finer-grained synchronization which takes the y and
215  * z components into account explicitly */
216  if (arg.shmem & 64) {
219  }
221  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL))
222  param.aux.x = 1; // packing blocks per direction
223  if (arg.exterior_dims && arg.kernel_type == UBER_KERNEL) param.aux.y = 1; // exterior blocks
224  }
225 
226  virtual void defaultTuneParam(TuneParam &param) const
227  {
228  /* for nvshmem uber kernels the current synchronization requires us to keep the y and z dimensions local to the
229  * block. This can be removed when we introduce a finer-grained synchronization which takes the y and
230  * z components into account explicitly. */
231  if (arg.shmem & 64) {
234  }
236  if (arg.pack_threads && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL))
237  param.aux.x = 1; // packing blocks per direction
238  if (arg.exterior_dims && arg.kernel_type == UBER_KERNEL) param.aux.y = 1; // exterior blocks
239  }
240 
247  template <template <bool, QudaPCType, typename> class P, int nParity, bool dagger, bool xpay, KernelType kernel_type>
248  inline void launch(TuneParam &tp, const qudaStream_t &stream)
249  {
250  if (deviceProp.major >= 7) { // should test whether this is always optimal on Volta
251  tp.set_max_shared_bytes = true;
252  }
253  qudaLaunchKernel(dslashGPU<D, P, nParity, dagger, xpay, kernel_type, Arg>, tp, stream, arg);
254  }
255 
256 #ifdef JITIFY
260  template <template <bool, QudaPCType, typename> class P> auto kernel_instance()
261  {
262  if (!program) errorQuda("Jitify program has not been created");
263  using namespace jitify::reflection;
264  const auto kernel = "quda::dslashGPU";
265 
266  // we need this hackery to get the naked unbound template class parameters
267  auto D_instance = reflect<D<0, false, false, INTERIOR_KERNEL, Arg>>();
268  auto D_naked = D_instance.substr(0, D_instance.find("<"));
269  auto P_instance = reflect<P<false, QUDA_4D_PC, Arg>>();
270  auto P_naked = P_instance.substr(0, P_instance.find("<"));
271 
272  // Since we pass the operator and packer classes as strings to
273  // jitify, we need to handle the reflection for all other
274  // template parameters here as well, rather than leaving this
275  // to jitify.
276  auto instance = program->kernel(kernel).instantiate({D_naked, P_naked, reflect(arg.nParity), reflect(arg.dagger),
277  reflect(arg.xpay), reflect(arg.kernel_type), reflect<Arg>()});
278 
279  return instance;
280  }
281 #endif
282 
283  public:
290  template <template <bool, QudaPCType, typename> class P, int nParity, bool dagger, bool xpay>
291  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
292  {
293  if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
294  errorQuda("Not implemented");
295  } else {
296 #ifdef JITIFY
297  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
298 #else
299  switch (arg.kernel_type) {
300  case INTERIOR_KERNEL: launch<P, nParity, dagger, xpay, INTERIOR_KERNEL>(tp, stream); break;
301 #ifdef MULTI_GPU
302 #ifdef NVSHMEM_COMMS
303  case UBER_KERNEL: launch<P, nParity, dagger, xpay, UBER_KERNEL>(tp, stream); break;
304 #endif
305  case EXTERIOR_KERNEL_X: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_X>(tp, stream); break;
306  case EXTERIOR_KERNEL_Y: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_Y>(tp, stream); break;
307  case EXTERIOR_KERNEL_Z: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_Z>(tp, stream); break;
308  case EXTERIOR_KERNEL_T: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_T>(tp, stream); break;
309  case EXTERIOR_KERNEL_ALL: launch<P, nParity, dagger, xpay, EXTERIOR_KERNEL_ALL>(tp, stream); break;
310  default: errorQuda("Unexpected kernel type %d", arg.kernel_type);
311 #else
312  default: errorQuda("Unexpected kernel type %d for single-GPU build", arg.kernel_type);
313 #endif
314  }
315 #endif // JITIFY
316  }
317  }
318 
325  template <template <bool, QudaPCType, typename> class P, int nParity, bool xpay>
326  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
327  {
328 #ifdef JITIFY
329  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
330 #else
331  if (arg.dagger)
332  instantiate<P, nParity, true, xpay>(tp, stream);
333  else
334  instantiate<P, nParity, false, xpay>(tp, stream);
335 #endif
336  }
337 
344  template <template <bool, QudaPCType, typename> class P, bool xpay>
345  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
346  {
347 #ifdef JITIFY
348  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
349 #else
350  switch (arg.nParity) {
351  case 1: instantiate<P, 1, xpay>(tp, stream); break;
352  case 2: instantiate<P, 2, xpay>(tp, stream); break;
353  default: errorQuda("nParity = %d undefined\n", arg.nParity);
354  }
355 #endif
356  }
357 
364  template <template <bool, QudaPCType, typename> class P>
365  inline void instantiate(TuneParam &tp, const qudaStream_t &stream)
366  {
367 #ifdef JITIFY
368  Tunable::jitify_error = kernel_instance<P>().configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
369 #else
370  if (arg.xpay)
371  instantiate<P, true>(tp, stream);
372  else
373  instantiate<P, false>(tp, stream);
374 #endif
375  }
376 
377  Arg &dslashParam; // temporary addition for policy compatibility
378 
379  Dslash(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
380  TunableVectorYZ(1, arg.nParity),
381  arg(arg),
382  out(out),
383  in(in),
384  nDimComms(4),
385  dslashParam(arg)
386  {
387  if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION)
388  errorQuda("CPU Fields not supported in Dslash framework yet");
389 
390  // this sets the communications pattern for the packing kernel
391  setPackComms(arg.commDim);
392  // strcpy(aux, in.AuxString());
393  fillAuxBase();
394 #ifdef MULTI_GPU
395  fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
396  fillAux(UBER_KERNEL, "policy_kernel=uber");
397  fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
398  fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
399  fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
400  fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
401  fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
402 #else
403  fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
404 #endif // MULTI_GPU
405  fillAux(KERNEL_POLICY, "policy");
406 
407 #ifdef NVSHMEM_COMMS
408  strcpy(aux_barrier, aux[EXTERIOR_KERNEL_ALL]);
409  strcat(aux_barrier, ",shmem");
410 #endif
411 
412  // extract the filename from the template template class (do
413  // this regardless of jitify to ensure a build error if filename
414  // helper isn't defined)
415  using D_ = D<0, false, false, INTERIOR_KERNEL, Arg>;
416  kernel_file = std::string("kernels/") + D_::filename();
417 #ifdef JITIFY
418  create_jitify_program(kernel_file);
419 #endif
420  }
421 
422  void setShmem(int shmem)
423  {
424 #ifdef NVSHMEM_COMMS
425  arg.shmem = shmem;
426 #endif
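  // bit 64 of shmem selects the fused interior + exterior ("uber") kernel
  // variant (see its use in setParam and advanceAux above), so enable uber
  // tuning whenever that bit is set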
427  setUberTuning(arg.shmem & 64);
428  }
429 
430  void setPack(bool pack, MemoryLocation location)
431  {
432  if (!pack) {
433  arg.setPack(pack, packBuffer);
434  return;
435  }
436 
437  for (int dim = 0; dim < 4; dim++) {
438  for (int dir = 0; dir < 2; dir++) {
439  if ((location & Remote) && comm_peer2peer_enabled(dir, dim)) { // pack to p2p remote
440  packBuffer[2 * dim + dir] = static_cast<char *>(in.remoteFace_d(dir, dim)) + in.GhostOffset(dim, 1 - dir);
441  } else if (location & Host && !comm_peer2peer_enabled(dir, dim)) { // pack to cpu memory
442  packBuffer[2 * dim + dir] = in.myFace_hd(dir, dim);
443  } else if (location & Shmem) {
444  // we check whether we can directly pack into the in.remoteFace_d(dir, dim) buffer on the remote GPU
445  // pack directly into remote or local memory
446  packBuffer[2 * dim + dir] = in.remoteFace_d(dir, dim) ?
447  static_cast<char *>(in.remoteFace_d(dir, dim)) + in.GhostOffset(dim, 1 - dir) :
448  in.myFace_d(dir, dim);
449  // whether we need to shmem_putmem into the receiving buffer
450  packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir] = in.remoteFace_d(dir, dim) ?
451  nullptr :
452  static_cast<char *>(in.remoteFace_r()) + in.GhostOffset(dim, 1 - dir);
453  } else { // pack to local gpu memory
454  packBuffer[2 * dim + dir] = in.myFace_d(dir, dim);
455  }
456  }
457  }
458 
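  // at this point packBuffer[2 * dim + dir] holds the pack destination for each
  // direction, and, for nvshmem, packBuffer[2 * QUDA_MAX_DIM + 2 * dim + dir]
  // holds the remote receive address that still requires an explicit
  // shmem_putmem (nullptr if we could pack directly into the remote buffer)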
459  arg.setPack(pack, packBuffer);
460  // set the tuning string for the fused interior + packer kernel
461  strcpy(aux_pack, aux[arg.kernel_type]);
462  strcat(aux_pack, "");
463 
464  // label the locations we are packing to
465  // location label is nonp2p-p2p
466  switch ((int)location) {
467  case Device | Remote: strcat(aux_pack, ",device-remote"); break;
468  case Host | Remote: strcat(aux_pack, ",host-remote"); break;
469  case Device: strcat(aux_pack, ",device-device"); break;
470  case Host: strcat(aux_pack, comm_peer2peer_enabled_global() ? ",host-device" : ",host-host"); break;
471  case Shmem:
472  strcat(aux_pack, arg.exterior_dims > 0 ? ",shmemuber" : ",shmem");
473  strcat(aux_pack, (arg.shmem & 1 && arg.shmem & 2) ? "3" : "1");
474  break;
475 
476  default: errorQuda("Unknown pack target location %d\n", location);
477  }
478  }
479 
480  int Nface() const
481  {
482  return 2 * arg.nFace;
483  } // factor of 2 is for forwards/backwards (convention used in dslash policy)
484  int Dagger() const { return arg.dagger; }
485 
486  const char *getAux(KernelType type) const { return aux[type]; }
487 
488  void setAux(KernelType type, const char *aux_) { strcpy(aux[type], aux_); }
489 
490  void augmentAux(KernelType type, const char *extra) { strcat(aux[type], extra); }
491 
492  virtual TuneKey tuneKey() const
493  {
494  auto aux_ = (arg.pack_blocks > 0 && (arg.kernel_type == INTERIOR_KERNEL || arg.kernel_type == UBER_KERNEL)) ?
495  aux_pack :
496  ((arg.shmem > 0 && arg.kernel_type == EXTERIOR_KERNEL_ALL) ? aux_barrier : aux[arg.kernel_type]);
497  return TuneKey(in.VolString(), typeid(*this).name(), aux_);
498  }
499 
504  virtual void preTune()
505  {
506  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != UBER_KERNEL && arg.kernel_type != KERNEL_POLICY)
507  out.backup();
508  }
509 
513  virtual void postTune()
514  {
515  if (arg.kernel_type != INTERIOR_KERNEL && arg.kernel_type != UBER_KERNEL && arg.kernel_type != KERNEL_POLICY)
516  out.restore();
517  }
518 
519  /*
520  per direction / dimension flops
521  spin project flops = Nc * Ns
522  SU(3) matrix-vector flops = (8 Nc - 2) * Nc
523  spin reconstruction flops = 2 * Nc * Ns (just an accumulation to all components)
524  xpay = 2 * 2 * Nc * Ns
525 
526  So for the full dslash we have, where for the final spin
527  reconstruct we have -1 since the first direction does not
528  require any accumulation.
529 
530  flops = (2 * Nd * Nc * Ns) + (2 * Nd * (Ns/2) * (8*Nc-2) * Nc) + ((2 * Nd - 1) * 2 * Nc * Ns)
531  flops_xpay = flops + 2 * 2 * Nc * Ns
532 
533  For Wilson this should give 1320 for Nc=3, Ns=4 and 1368 for the xpay equivalent
534  */
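  // worked example for Wilson (Nd = 4, Nc = 3, Ns = 4):
  //   spin project: 2 * 4 * 3 * 4 = 96
  //   matrix-vector: 2 * 4 * (4 / 2) * (8 * 3 - 2) * 3 = 1056
  //   spin reconstruct: (2 * 4 - 1) * 2 * 3 * 4 = 168
  //   total = 1320, plus 2 * 2 * 3 * 4 = 48 for xpay, giving 1368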
535  virtual long long flops() const
536  {
537  int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
538  int num_mv_multiply = in.Nspin() == 4 ? 2 : 1;
539  int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
540  int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
541  int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary
542  int pack_flops = (in.Nspin() == 4 ? 2 * in.Nspin() / 2 * in.Ncolor() : 0); // only flops if spin projecting
543 
544  long long flops_ = 0;
545 
546  // FIXME - should we count the xpay flops in the derived kernels
547  // since some kernels require the xpay in the exterior (preconditioned clover)
548 
549  switch (arg.kernel_type) {
550  case EXTERIOR_KERNEL_X:
551  case EXTERIOR_KERNEL_Y:
552  case EXTERIOR_KERNEL_Z:
553  case EXTERIOR_KERNEL_T:
554  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.GhostFace()[arg.kernel_type];
555  break;
556  case EXTERIOR_KERNEL_ALL: {
557  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
558  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
559  break;
560  }
561  case INTERIOR_KERNEL:
562  case UBER_KERNEL:
563  if (arg.pack_threads) { flops_ += pack_flops * arg.nParity * in.getDslashConstant().Ls * arg.pack_threads; }
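  // intentional fall-through into the KERNEL_POLICY case below, which counts
  // the bulk (full-volume) flops and then subtracts the exterior contribution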
564  case KERNEL_POLICY: {
565  long long sites = in.Volume();
566  flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
567  num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies
568  ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
569  * sites; // accumulation
570  if (arg.xpay) flops_ += xpay_flops * sites;
571 
572  if (arg.kernel_type == KERNEL_POLICY) break;
573  // now correct for flops done by exterior kernel
574  long long ghost_sites = 0;
575  for (int d = 0; d < 4; d++)
576  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
577  flops_ -= ghost_flops * ghost_sites;
578 
579  break;
580  }
581  }
582 
583  return flops_;
584  }
585 
586  virtual long long bytes() const
587  {
588  int gauge_bytes = arg.reconstruct * in.Precision();
589  bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)) ? true : false;
590  int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
591  int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
592  int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
593  int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary
594  int pack_bytes = 2 * ((in.Nspin() == 4 ? in.Nspin() / 2 : in.Nspin()) + in.Nspin()) * in.Ncolor() * in.Precision();
595 
596  long long bytes_ = 0;
597 
598  switch (arg.kernel_type) {
599  case EXTERIOR_KERNEL_X:
600  case EXTERIOR_KERNEL_Y:
601  case EXTERIOR_KERNEL_Z:
602  case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
603  case EXTERIOR_KERNEL_ALL: {
604  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
605  bytes_ = ghost_bytes * ghost_sites;
606  break;
607  }
608  case INTERIOR_KERNEL:
609  case UBER_KERNEL:
610  if (arg.pack_threads) { bytes_ += pack_bytes * arg.nParity * in.getDslashConstant().Ls * arg.pack_threads; }
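  // intentional fall-through into the KERNEL_POLICY case below, which counts
  // the bulk (full-volume) traffic and then subtracts the exterior contribution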
611  case KERNEL_POLICY: {
612  long long sites = in.Volume();
613  bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
614  if (arg.xpay) bytes_ += spinor_bytes;
615 
616  if (arg.kernel_type == KERNEL_POLICY) break;
617  // now correct for bytes done by exterior kernel
618  long long ghost_sites = 0;
619  for (int d = 0; d < 4; d++)
620  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
621  bytes_ -= ghost_bytes * ghost_sites;
622 
623  break;
624  }
625  }
626  return bytes_;
627  }
628  };
629 
630 } // namespace quda
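As a usage illustration, the sketch below shows how a derived operator typically drives this class: it autotunes the launch, calls setParam() to reset the ghost and packing pointers for the chosen configuration, and then walks the instantiate() chain to resolve the runtime parameters. The functor my_dslash, the packer MyPack, and the wrapper class are placeholders assumed for this sketch rather than names from dslash.h; the real kernel functors live in the kernels/ headers, while tuneLaunch(), getTuning() and getVerbosity() come from the tuning framework.

namespace quda
{
  // illustrative only: my_dslash and MyPack stand in for real dslash and packer functors
  template <typename Arg> class MyDslashApply : public Dslash<my_dslash, Arg>
  {
    using Base = Dslash<my_dslash, Arg>;

  public:
    MyDslashApply(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) : Base(arg, out, in) {}

    void apply(const qudaStream_t &stream)
    {
      // autotune block / grid / aux dimensions for this operator and volume
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      // reset ghost and packing pointers for the tuned configuration
      Base::setParam(tp);
      // resolve parity, dagger, xpay and kernel type at run time
      Base::template instantiate<MyPack>(tp, stream);
    }
  };
} // namespace quda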