QUDA  1.0.0
dslash_coarse.cu
#include <gauge_field.h>
#include <color_spinor_field.h>
#include <uint_to_char.h>
#include <worker.h>
#include <tune_quda.h>

#include <jitify_helper.cuh>
#include <kernels/dslash_coarse.cuh> // kernel bodies and argument-struct definitions

namespace quda {

#ifdef GPU_MULTIGRID

  template <typename Float, typename yFloat, typename ghostFloat, int nDim, int Ns, int Nc, int Mc, bool dslash, bool clover, bool dagger, DslashType type>
  class DslashCoarse : public TunableVectorY {

  protected:
    ColorSpinorField &out;
    const ColorSpinorField &inA;
    const ColorSpinorField &inB;
    const GaugeField &Y;
    const GaugeField &X;
    const double kappa;
    const int parity;
    const int nParity;
    const int nSrc;

    const int max_color_col_stride = 8;
    mutable int color_col_stride;
    mutable int dim_threads;
    char *saveOut;

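    // Flop-counting convention (a reading of the expression below, not a
    // separate source of truth): applying one coarse link is a complex
    // matrix-vector product of dimension Ns*Nc, i.e. 8*(Ns*Nc)^2 real ops
    // (6 per complex multiply plus 2 per accumulate); the stencil applies
    // 2*nDim such links (forward and backward hops in each dimension) plus
    // one more for the clover/X term, and the subtracted 2*Ns*Nc removes the
    // accumulation that the first product does not perform. For example, with
    // Ns=2, Nc=24, nDim=4 and both terms enabled this gives
    // (8+1)*8*48*48 - 2*48 = 165,792 flops per site.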
    long long flops() const
    {
      return ((dslash*2*nDim+clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*(long long)out.VolumeCB();
    }
    long long bytes() const
    {
      return (dslash||clover) * out.Bytes() + dslash*8*inA.Bytes() + clover*inB.Bytes() +
        nSrc*nParity*(dslash*Y.Bytes()*Y.VolumeCB()/(2*Y.Stride()) + clover*X.Bytes()/2);
    }
    unsigned int sharedBytesPerThread() const { return (sizeof(complex<Float>) * Mc); }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions
    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions
    unsigned int minThreads() const { return color_col_stride * X.VolumeCB(); } // 4-d volume since this x threads only

    bool advanceBlockDim(TuneParam &param) const
    {
      dim3 grid = param.grid;
      bool ret = TunableVectorY::advanceBlockDim(param);
      param.grid.z = grid.z;

      if (ret) { // we advanced the block.x so we're done
        return true;
      } else { // block.x (spacetime) was reset

        // let's try to advance spin/block-color
        while (param.block.z <= (unsigned int)(dim_threads * 2 * 2 * (Nc/Mc))) {
          param.block.z += dim_threads * 2;
          if ( (dim_threads*2*2*(Nc/Mc)) % param.block.z == 0) {
            param.grid.z = (dim_threads * 2 * 2 * (Nc/Mc)) / param.block.z;
            break;
          }
        }

        // we can advance spin/block-color since this is valid
        if (param.block.z <= (unsigned int)(dim_threads * 2 * 2 * (Nc/Mc)) &&
            param.block.z <= (unsigned int)deviceProp.maxThreadsDim[2] &&
            param.block.x*param.block.y*param.block.z <= (unsigned int)deviceProp.maxThreadsPerBlock ) {
          return true;
        } else { // we have run off the end so let's reset
          param.block.z = dim_threads * 2;
          param.grid.z = 2 * (Nc/Mc);
          return false;
        }
      }
    }
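    // The z-axis work carved up here has a total extent of
    // dim_threads * 2 * 2 * (Nc/Mc) threads (the initial block.z = dim_threads*2
    // times the initial grid.z = 2*(Nc/Mc)); the loop above walks block.z
    // through the multiples of dim_threads*2 that evenly divide this extent,
    // subject to the device block-size limits checked above. For example, with
    // dim_threads=1, Nc=24, Mc=1 the extent is 96 and block.z visits
    // 2, 4, 6, 8, 12, 16, 24, 32, 48, 96.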

    // FIXME: understand why this leads to slower perf and variable correctness
    //int blockStep() const { return deviceProp.warpSize/4; }
    //int blockMin() const { return deviceProp.warpSize/4; }

    // Experimental autotuning of the color column stride
    bool advanceAux(TuneParam &param) const
    {
#ifdef DOT_PRODUCT_SPLIT
      // we can only split the dot product on Kepler and later since we need the __shfl instruction
      if (2*param.aux.x <= max_color_col_stride && Nc % (2*param.aux.x) == 0 &&
          param.block.x % deviceProp.warpSize == 0) {
        // An x-dimension block size that is not a multiple of the
        // warp size is incompatible with splitting the dot product
        // across the warp so we must skip this

        param.aux.x *= 2; // safe to advance
        color_col_stride = param.aux.x;

        // recompute grid size since minThreads() has now been updated
        param.grid.x = (minThreads()+param.block.x-1)/param.block.x;

        // check this grid size is valid before returning
        if (param.grid.x < (unsigned int)deviceProp.maxGridSize[0]) return true;
      }
#endif

      // reset color column stride if too large or not divisible
      param.aux.x = 1;
      color_col_stride = param.aux.x;

      // recompute grid size since minThreads() has now been updated
      param.grid.x = (minThreads()+param.block.x-1)/param.block.x;

      if (2*param.aux.y <= nDim &&
          param.block.x*param.block.y*dim_threads*2 <= (unsigned int)deviceProp.maxThreadsPerBlock) {
        param.aux.y *= 2;
        dim_threads = param.aux.y;

        // need to reset z-block/grid size/shared_bytes since dim_threads has changed
        param.block.z = dim_threads * 2;
        param.grid.z = 2 * (Nc / Mc);

        param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
          sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);

        return true;
      } else {
        param.aux.y = 1;
        dim_threads = param.aux.y;

        // need to reset z-block/grid size/shared_bytes since
        // dim_threads has changed. Strictly speaking this isn't needed
        // since this is the outer dimension to tune, but would be
        // needed if we added an aux.z tuning dimension
        param.block.z = dim_threads * 2;
        param.grid.z = 2 * (Nc / Mc);

        param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
          sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);

        return false;
      }
    }
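    // Two aux dimensions are tuned here: aux.x is the color column stride,
    // doubling through {1,2,4,8} (bounded by max_color_col_stride and, when
    // DOT_PRODUCT_SPLIT is enabled, requiring a warp-aligned block.x and a
    // valid grid.x), and aux.y is dim_threads, doubling through {1,2,4}
    // (bounded by 2*aux.y <= nDim). These correspond exactly to the template
    // instantiations dispatched in apply() below.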
144 
145  virtual void initTuneParam(TuneParam &param) const
146  {
147  param.aux = make_int4(1,1,1,1);
148  color_col_stride = param.aux.x;
149  dim_threads = param.aux.y;
150 
152  param.block.z = dim_threads * 2;
153  param.grid.z = 2*(Nc/Mc);
154  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
155  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
156  }
157 
159  virtual void defaultTuneParam(TuneParam &param) const
160  {
161  param.aux = make_int4(1,1,1,1);
162  color_col_stride = param.aux.x;
163  dim_threads = param.aux.y;
164 
166  // ensure that the default x block size is divisible by the warpSize
167  param.block.x = deviceProp.warpSize;
168  param.grid.x = (minThreads()+param.block.x-1)/param.block.x;
169  param.block.z = dim_threads * 2;
170  param.grid.z = 2*(Nc/Mc);
171  param.shared_bytes = sharedBytesPerThread()*param.block.x*param.block.y*param.block.z > sharedBytesPerBlock(param) ?
172  sharedBytesPerThread()*param.block.x*param.block.y*param.block.z : sharedBytesPerBlock(param);
173  }
174 
  public:
    inline DslashCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                        const GaugeField &Y, const GaugeField &X, double kappa, int parity,
                        MemoryLocation *halo_location)
      : TunableVectorY(out.SiteSubset() * (out.Ndim()==5 ? out.X(4) : 1)),
        out(out), inA(inA), inB(inB), Y(Y), X(X), kappa(kappa), parity(parity),
        nParity(out.SiteSubset()), nSrc(out.Ndim()==5 ? out.X(4) : 1)
    {
      strcpy(aux, "policy_kernel,");
      if (out.Location() == QUDA_CUDA_FIELD_LOCATION) {
#ifdef JITIFY
        create_jitify_program("kernels/dslash_coarse.cuh");
#endif
      }
      strcat(aux, compile_type_str(out));
      strcat(aux, out.AuxString());
      strcat(aux, comm_dim_partitioned_string());

      switch (type) {
      case DSLASH_INTERIOR: strcat(aux,",interior"); break;
      case DSLASH_EXTERIOR: strcat(aux,",exterior"); break;
      case DSLASH_FULL:     strcat(aux,",full"); break;
      }

      // record the location of where each pack buffer is in [2*dim+dir] ordering
      // 0 - no packing
      // 1 - pack to local GPU memory
      // 2 - pack to local mapped CPU memory
      // 3 - pack to remote mapped GPU memory
      if (doHalo<type>()) {
        char label[15] = ",halo=";
        for (int dim=0; dim<4; dim++) {
          for (int dir=0; dir<2; dir++) {
            label[2*dim+dir+6] = !comm_dim_partitioned(dim) ? '0' :
              halo_location[2*dim+dir] == Device ? '1' :
              halo_location[2*dim+dir] == Host ? '2' : '3';
          }
        }
        label[14] = '\0';
        strcat(aux, label);
      }
    }
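    // For example, with all four dimensions partitioned and every halo read
    // from local GPU memory, the string ",halo=11111111" is appended to the
    // aux tuning key; an unpartitioned dimension contributes '0' for both of
    // its directions.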
    virtual ~DslashCoarse() { }

    inline void apply(const cudaStream_t &stream) {

      if (out.Location() == QUDA_CPU_FIELD_LOCATION) {

        if (out.FieldOrder() != QUDA_SPACE_SPIN_COLOR_FIELD_ORDER || Y.FieldOrder() != QUDA_QDP_GAUGE_ORDER)
          errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());

        DslashCoarseArg<Float,yFloat,ghostFloat,Ns,Nc,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,QUDA_QDP_GAUGE_ORDER> arg(out, inA, inB, Y, X, (Float)kappa, parity);
        coarseDslash<Float,nDim,Ns,Nc,Mc,dslash,clover,dagger,type>(arg);
      } else {

        const TuneParam &tp = tuneLaunch(*this, getTuning(), getVerbosity());

        if (out.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER || Y.FieldOrder() != QUDA_FLOAT2_GAUGE_ORDER)
          errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());

        typedef DslashCoarseArg<Float,yFloat,ghostFloat,Ns,Nc,QUDA_FLOAT2_FIELD_ORDER,QUDA_FLOAT2_GAUGE_ORDER> Arg;
        Arg arg(out, inA, inB, Y, X, (Float)kappa, parity);

#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::coarseDslashKernel")
          .instantiate(Type<Float>(),nDim,Ns,Nc,Mc,(int)tp.aux.x,(int)tp.aux.y,dslash,clover,dagger,type,Type<Arg>())
          .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else
        switch (tp.aux.y) { // dimension gather parallelisation
        case 1:
          switch (tp.aux.x) { // this is color_col_stride
          case 1:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#ifdef DOT_PRODUCT_SPLIT
          case 2:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 4:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 8:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#endif // DOT_PRODUCT_SPLIT
          default:
            errorQuda("Color column stride %d not valid", tp.aux.x);
          }
          break;
        case 2:
          switch (tp.aux.x) { // this is color_col_stride
          case 1:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#ifdef DOT_PRODUCT_SPLIT
          case 2:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 4:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 8:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#endif // DOT_PRODUCT_SPLIT
          default:
            errorQuda("Color column stride %d not valid", tp.aux.x);
          }
          break;
        case 4:
          switch (tp.aux.x) { // this is color_col_stride
          case 1:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#ifdef DOT_PRODUCT_SPLIT
          case 2:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 4:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
          case 8:
            coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
            break;
#endif // DOT_PRODUCT_SPLIT
          default:
            errorQuda("Color column stride %d not valid", tp.aux.x);
          }
          break;
        default:
          errorQuda("Invalid dimension thread splitting %d", tp.aux.y);
        }
#endif
      }
    }
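    // Each case above is a distinct compile-time instantiation: the tuned
    // values aux.x (color_col_stride in {1,2,4,8}, the latter three only with
    // DOT_PRODUCT_SPLIT) and aux.y (dim_threads in {1,2,4}) select among the
    // precompiled kernels, while the JITIFY path instead compiles the
    // requested combination at run time.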

    TuneKey tuneKey() const {
      return TuneKey(out.VolString(), typeid(*this).name(), aux);
    }

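    // The kernel writes to the output field, so the repeated launches made
    // while autotuning must save and restore its contents to leave the
    // caller-visible result unchanged.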
    void preTune() {
      saveOut = new char[out.Bytes()];
      cudaMemcpy(saveOut, out.V(), out.Bytes(), cudaMemcpyDeviceToHost);
    }

    void postTune()
    {
      cudaMemcpy(out.V(), saveOut, out.Bytes(), cudaMemcpyHostToDevice);
      delete[] saveOut;
    }

  };


  template <typename Float, typename yFloat, typename ghostFloat, int coarseColor, int coarseSpin>
  inline void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                          const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash,
                          bool clover, bool dagger, DslashType type, MemoryLocation *halo_location) {

    const int colors_per_thread = 1;
    const int nDim = 4;

    if (dagger) {
      if (dslash) {
        if (clover) {

          if (type == DSLASH_FULL) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else if (type == DSLASH_INTERIOR) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,true,DSLASH_INTERIOR> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else { errorQuda("Dslash type %d not instantiated", type); }

        } else { // plain dslash

          if (type == DSLASH_FULL) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else if (type == DSLASH_INTERIOR) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,true,DSLASH_INTERIOR> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else { errorQuda("Dslash type %d not instantiated", type); }

        }
      } else {

        if (type == DSLASH_EXTERIOR) errorQuda("Cannot call halo on pure clover kernel");
        if (clover) {
          DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,true,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
          dslash.apply(0);
        } else {
          errorQuda("Unsupported dslash=false clover=false");
        }

      }
    } else {

      if (dslash) {
        if (clover) {

          if (type == DSLASH_FULL) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else if (type == DSLASH_INTERIOR) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,false,DSLASH_INTERIOR> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else { errorQuda("Dslash type %d not instantiated", type); }

        } else { // plain dslash

          if (type == DSLASH_FULL) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else if (type == DSLASH_INTERIOR) {
            DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,false,DSLASH_INTERIOR> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
            dslash.apply(0);
          } else { errorQuda("Dslash type %d not instantiated", type); }

        }
      } else {
        if (type == DSLASH_EXTERIOR) errorQuda("Cannot call halo on pure clover kernel");
        if (clover) {
          DslashCoarse<Float,yFloat,ghostFloat,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,false,DSLASH_FULL> dslash(out, inA, inB, Y, X, kappa, parity, halo_location);
          dslash.apply(0);
        } else {
          errorQuda("Unsupported dslash=false clover=false");
        }
      }
    }
  }

  // template on the number of coarse colors
  template <typename Float, typename yFloat, typename ghostFloat>
  inline void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                          const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash,
                          bool clover, bool dagger, DslashType type, MemoryLocation *halo_location) {

    if (Y.FieldOrder() != X.FieldOrder())
      errorQuda("Field order mismatch Y = %d, X = %d", Y.FieldOrder(), X.FieldOrder());

    if (inA.FieldOrder() != out.FieldOrder())
      errorQuda("Field order mismatch inA = %d, out = %d", inA.FieldOrder(), out.FieldOrder());

    if (inA.Nspin() != 2)
      errorQuda("Unsupported number of coarse spins %d\n", inA.Nspin());

#if 0
    } else if (inA.Ncolor() == 4) {
      ApplyCoarse<Float,yFloat,ghostFloat,4,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
#endif
    if (inA.Ncolor() == 6) { // free field Wilson
      ApplyCoarse<Float,yFloat,ghostFloat,6,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
#if 0
    } else if (inA.Ncolor() == 8) {
      ApplyCoarse<Float,yFloat,ghostFloat,8,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
    } else if (inA.Ncolor() == 12) {
      ApplyCoarse<Float,yFloat,ghostFloat,12,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
    } else if (inA.Ncolor() == 16) {
      ApplyCoarse<Float,yFloat,ghostFloat,16,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
    } else if (inA.Ncolor() == 20) {
      ApplyCoarse<Float,yFloat,ghostFloat,20,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
#endif
    } else if (inA.Ncolor() == 24) {
      ApplyCoarse<Float,yFloat,ghostFloat,24,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
#if 0
    } else if (inA.Ncolor() == 28) {
      ApplyCoarse<Float,yFloat,ghostFloat,28,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
#endif
    } else if (inA.Ncolor() == 32) {
      ApplyCoarse<Float,yFloat,ghostFloat,32,2>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, type, halo_location);
    } else {
      errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor());
    }
  }

  // this is the Worker pointer that may issue additional work
  // while we're waiting on communication to finish
  namespace dslash {
    extern Worker* aux_worker;
  }

#endif // GPU_MULTIGRID

  enum class DslashCoarsePolicy {
    DSLASH_COARSE_BASIC, // stage both sends and recvs in host memory using memcpys
    DSLASH_COARSE_ZERO_COPY_PACK, // zero copy write pack buffers
    DSLASH_COARSE_ZERO_COPY_READ, // zero copy read halos in dslash kernel
    DSLASH_COARSE_ZERO_COPY, // full zero copy
    DSLASH_COARSE_GDR_SEND, // GDR send
    DSLASH_COARSE_GDR_RECV, // GDR recv
    DSLASH_COARSE_GDR, // full GDR
    DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV, // zero copy write and GDR recv
    DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ, // GDR send and zero copy read
    DSLASH_COARSE_POLICY_DISABLED // this must be the last element
  };
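  // The enumerator values double as the integers accepted by the
  // QUDA_ENABLE_DSLASH_COARSE_POLICY environment variable parsed below:
  // e.g. QUDA_ENABLE_DSLASH_COARSE_POLICY=0,3 restricts the policy tuner to
  // DSLASH_COARSE_BASIC and DSLASH_COARSE_ZERO_COPY.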

  struct DslashCoarseLaunch {

    ColorSpinorField &out;
    const ColorSpinorField &inA;
    const ColorSpinorField &inB;
    const GaugeField &Y;
    const GaugeField &X;
    double kappa;
    int parity;
    bool dslash;
    bool clover;
    bool dagger;
    const int *commDim;
    const QudaPrecision halo_precision;

    inline DslashCoarseLaunch(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                              const GaugeField &Y, const GaugeField &X, double kappa, int parity,
                              bool dslash, bool clover, bool dagger, const int *commDim, QudaPrecision halo_precision)
      : out(out), inA(inA), inB(inB), Y(Y), X(X), kappa(kappa), parity(parity),
        dslash(dslash), clover(clover), dagger(dagger), commDim(commDim),
        halo_precision(halo_precision == QUDA_INVALID_PRECISION ? Y.Precision() : halo_precision) { }

    /**
       @brief Execute the coarse dslash using the given policy
     */
    inline void operator()(DslashCoarsePolicy policy) {
#ifdef GPU_MULTIGRID
      if (inA.V() == out.V()) errorQuda("Aliasing pointers");

      // check all precisions match
      QudaPrecision precision = checkPrecision(out, inA, inB);
      checkPrecision(Y, X);

      // check all locations match
      checkLocation(out, inA, inB, Y, X);

      int comm_sum = 4;
      if (commDim) for (int i=0; i<4; i++) comm_sum -= (1-commDim[i]);
      if (comm_sum != 4 && comm_sum != 0) errorQuda("Unsupported comms %d", comm_sum);
      bool comms = comm_sum;

      MemoryLocation pack_destination[2*QUDA_MAX_DIM]; // where we will pack the ghost buffer to
      MemoryLocation halo_location[2*QUDA_MAX_DIM]; // where we load the halo from
      for (int i=0; i<2*QUDA_MAX_DIM; i++) {
        pack_destination[i] = (policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK ||
                               policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY ||
                               policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV) ? Host : Device;
        halo_location[i] = (policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_READ ||
                            policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY ||
                            policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ) ? Host : Device;
      }
      bool gdr_send = (policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND ||
                       policy == DslashCoarsePolicy::DSLASH_COARSE_GDR ||
                       policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ);
      bool gdr_recv = (policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_RECV ||
                       policy == DslashCoarsePolicy::DSLASH_COARSE_GDR ||
                       policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV);

      // disable peer-to-peer if doing a zero-copy policy (temporary)
      if (policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK ||
          policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_READ ||
          policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY ||
          policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV ||
          policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ)
        comm_enable_peer2peer(false);

      if (dslash && comm_partitioned() && comms) {
        const int nFace = 1;
        inA.exchangeGhost((QudaParity)(inA.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? (1 - parity) : 0), nFace, dagger,
                          pack_destination, halo_location, gdr_send, gdr_recv, halo_precision);
      }

      if (dslash::aux_worker) dslash::aux_worker->apply(0);

      if (precision == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
        if (Y.Precision() != QUDA_DOUBLE_PRECISION)
          errorQuda("Y Precision %d not supported", Y.Precision());
        if (halo_precision != QUDA_DOUBLE_PRECISION)
          errorQuda("Halo precision %d not supported with field precision %d and link precision %d", halo_precision, precision, Y.Precision());
        ApplyCoarse<double,double,double>(out, inA, inB, Y, X, kappa, parity, dslash, clover,
                                          dagger, comms ? DSLASH_FULL : DSLASH_INTERIOR, halo_location);
        //if (dslash && comm_partitioned()) ApplyCoarse<double>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, true, halo_location);
#else
        errorQuda("Double precision multigrid has not been enabled");
#endif
      } else if (precision == QUDA_SINGLE_PRECISION) {
        if (Y.Precision() == QUDA_SINGLE_PRECISION) {
          if (halo_precision == QUDA_SINGLE_PRECISION) {
            ApplyCoarse<float,float,float>(out, inA, inB, Y, X, kappa, parity, dslash, clover,
                                           dagger, comms ? DSLASH_FULL : DSLASH_INTERIOR, halo_location);
          } else {
            errorQuda("Halo precision %d not supported with field precision %d and link precision %d", halo_precision, precision, Y.Precision());
          }
        } else if (Y.Precision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
          if (halo_precision == QUDA_HALF_PRECISION) {
            ApplyCoarse<float,short,short>(out, inA, inB, Y, X, kappa, parity, dslash, clover,
                                           dagger, comms ? DSLASH_FULL : DSLASH_INTERIOR, halo_location);
          } else if (halo_precision == QUDA_QUARTER_PRECISION) {
#if QUDA_PRECISION & 1
            ApplyCoarse<float,short,char>(out, inA, inB, Y, X, kappa, parity, dslash, clover,
                                          dagger, comms ? DSLASH_FULL : DSLASH_INTERIOR, halo_location);
#else
            errorQuda("QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
#endif
          } else {
            errorQuda("Halo precision %d not supported with field precision %d and link precision %d", halo_precision, precision, Y.Precision());
          }
#else
          errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
        } else {
          errorQuda("Unsupported precision %d\n", Y.Precision());
        }
        //if (dslash && comm_partitioned()) ApplyCoarse<float>(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, true, halo_location);
      } else {
        errorQuda("Unsupported precision %d\n", Y.Precision());
      }

      if (dslash && comm_partitioned() && comms) inA.bufferIndex = (1 - inA.bufferIndex);
      comm_enable_peer2peer(true);
#else
      errorQuda("Multigrid has not been built");
#endif
    }

  };

  static bool dslash_init = false;
  static int first_active_policy = static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
  static std::vector<DslashCoarsePolicy> policies(static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);

  // string used as a tunekey to ensure we retune if the dslash policy env changes
  static char policy_string[TuneKey::aux_n];

  void enable_policy(DslashCoarsePolicy p) {
    policies[static_cast<std::size_t>(p)] = p;
  }

  void disable_policy(DslashCoarsePolicy p) {
    policies[static_cast<std::size_t>(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED;
  }

  class DslashCoarsePolicyTune : public Tunable {

    DslashCoarseLaunch &dslash;

    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  public:
    inline DslashCoarsePolicyTune(DslashCoarseLaunch &dslash) : dslash(dslash)
    {
      if (!dslash_init) {

        static char *dslash_policy_env = getenv("QUDA_ENABLE_DSLASH_COARSE_POLICY");

        if (dslash_policy_env) { // set the policies to tune for explicitly
          std::stringstream policy_list(dslash_policy_env);

          int policy_;
          while (policy_list >> policy_) {
            DslashCoarsePolicy dslash_policy = static_cast<DslashCoarsePolicy>(policy_);

            // check this is a valid policy choice
            if ( (dslash_policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND ||
                  dslash_policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_RECV ||
                  dslash_policy == DslashCoarsePolicy::DSLASH_COARSE_GDR ||
                  dslash_policy == DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV ||
                  dslash_policy == DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ) && !comm_gdr_enabled() ) {
              errorQuda("Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", static_cast<int>(dslash_policy));
            }

            enable_policy(dslash_policy);
            first_active_policy = policy_ < first_active_policy ? policy_ : first_active_policy;
            if (policy_list.peek() == ',') policy_list.ignore();
          }
          if (first_active_policy == static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED))
            errorQuda("No valid policy found in QUDA_ENABLE_DSLASH_COARSE_POLICY");
        } else {
          first_active_policy = 0;
          enable_policy(DslashCoarsePolicy::DSLASH_COARSE_BASIC);
          enable_policy(DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK);
          enable_policy(DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_READ);
          enable_policy(DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY);
          if (comm_gdr_enabled()) {
            enable_policy(DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND);
            enable_policy(DslashCoarsePolicy::DSLASH_COARSE_GDR_RECV);
            enable_policy(DslashCoarsePolicy::DSLASH_COARSE_GDR);
            enable_policy(DslashCoarsePolicy::DSLASH_COARSE_ZERO_COPY_PACK_GDR_RECV);
            enable_policy(DslashCoarsePolicy::DSLASH_COARSE_GDR_SEND_ZERO_COPY_READ);
          }
        }

        // construct string specifying which policies have been enabled
        strcat(policy_string, ",pol=");
        for (int i = 0; i < (int)DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED; i++) {
          strcat(policy_string, (int)policies[i] == i ? "1" : "0");
        }

        dslash_init = true;
      }

      strcpy(aux, "policy,");
      if (dslash.dslash) strcat(aux, "dslash");
      strcat(aux, dslash.clover ? "clover," : ",");
      strcat(aux, dslash.inA.AuxString());
      strcat(aux, ",gauge_prec=");

      char prec_str[8];
      i32toa(prec_str, dslash.Y.Precision());
      strcat(aux, prec_str);
      strcat(aux, ",halo_prec=");
      i32toa(prec_str, dslash.halo_precision);
      strcat(aux, prec_str);
      strcat(aux, comm_dim_partitioned_string(dslash.commDim));
      strcat(aux, comm_dim_topology_string());
      strcat(aux, comm_config_string()); // any change in P2P/GDR will be stored as a separate tunecache entry
      strcat(aux, policy_string); // any change in policies enabled will be stored as a separate entry

      int comm_sum = 4;
      if (dslash.commDim)
        for (int i = 0; i < 4; i++) comm_sum -= (1 - dslash.commDim[i]);
      strcat(aux, comm_sum ? ",full" : ",interior");

      // before we do policy tuning we must ensure the kernel
      // constituents have been tuned since we can't do nested tuning
      if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
        disableProfileCount();
        for (auto &i : policies) if (i != DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED) dslash(i);
        enableProfileCount();
        setPolicyTuning(true);
      }
    }

    virtual ~DslashCoarsePolicyTune() { setPolicyTuning(false); }

    inline void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if (tp.aux.x >= (int)policies.size()) errorQuda("Requested policy that is outside of range");
      if (policies[tp.aux.x] == DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED) errorQuda("Requested policy is disabled");
      dslash(policies[tp.aux.x]);
    }

    int tuningIter() const { return 10; }

    bool advanceAux(TuneParam &param) const
    {
      while ((unsigned)param.aux.x < policies.size()-1) {
        param.aux.x++;
        if (policies[param.aux.x] != DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED) return true;
      }
      param.aux.x = 0;
      return false;
    }

    bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }

    void initTuneParam(TuneParam &param) const {
      Tunable::initTuneParam(param);
      param.aux.x = first_active_policy;
      param.aux.y = 0;
      param.aux.z = 0;
      param.aux.w = 0;
    }

    void defaultTuneParam(TuneParam &param) const {
      Tunable::defaultTuneParam(param);
      param.aux.x = first_active_policy;
      param.aux.y = 0;
      param.aux.z = 0;
      param.aux.w = 0;
    }

    TuneKey tuneKey() const {
      return TuneKey(dslash.inA.VolString(), typeid(*this).name(), aux);
    }

    long long flops() const {
      int nDim = 4;
      int Ns = dslash.inA.Nspin();
      int Nc = dslash.inA.Ncolor();
      int nParity = dslash.inA.SiteSubset();
      int volumeCB = dslash.inA.VolumeCB();
      return ((dslash.dslash*2*nDim+dslash.clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*volumeCB;
    }

    long long bytes() const {
      int nParity = dslash.inA.SiteSubset();
      return (dslash.dslash||dslash.clover) * dslash.out.Bytes() +
        dslash.dslash*8*dslash.inA.Bytes() + dslash.clover*dslash.inB.Bytes() +
        nParity*(dslash.dslash*dslash.Y.Bytes()*dslash.Y.VolumeCB()/(2*dslash.Y.Stride())
                 + dslash.clover*dslash.X.Bytes()/2);
      // multiply Y by volume / stride to correct for pad
    }
  };

  // Apply the coarse Dirac matrix to a coarse-grid vector:
  //   out(x) = M*in = X*in - kappa*\sum_mu [ Y_{-\mu}(x)*in(x+\mu) + Y^\dagger_\mu(x-\mu)*in(x-\mu) ]
  // or, for the daggered operator,
  //   out(x) = M^\dagger*in = X^\dagger*in - kappa*\sum_mu [ Y^\dagger_{-\mu}(x)*in(x+\mu) + Y_\mu(x-\mu)*in(x-\mu) ]
  // Both use the kappa normalization of the Wilson operator.
  void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                   const GaugeField &Y, const GaugeField &X, double kappa, int parity,
                   bool dslash, bool clover, bool dagger, const int *commDim, QudaPrecision halo_precision) {

    DslashCoarseLaunch Dslash(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, commDim, halo_precision);

    DslashCoarsePolicyTune policy(Dslash);
    policy.apply(0);

  } // ApplyCoarse
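  // Usage sketch (hypothetical coarse fields y_out, y_in and coarse links Y, X
  // produced by the multigrid setup; all arguments passed explicitly):
  //
  //   ApplyCoarse(y_out, y_in, y_in, Y, X, kappa, parity,
  //               true /*dslash*/, true /*clover*/, false /*dagger*/,
  //               nullptr /*commDim*/, QUDA_INVALID_PRECISION /*halo: default to Y's precision*/);
  //
  // This applies the full operator, y_out = X*y_in - kappa*(hopping terms on
  // y_in), exchanging halos in every partitioned dimension and letting
  // DslashCoarsePolicyTune pick the communication policy.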


} // namespace quda