coarse_op_preconditioned.cu
#include <gauge_field.h>
#include <blas_cublas.h>
#include <blas_quda.h>
#include <tune_quda.h>

#include <jitify_helper.cuh>
#include <kernels/coarse_op_preconditioned.cuh>

namespace quda {

#ifdef GPU_MULTIGRID

  template <typename Float, int n, typename Arg>
  class CalculateYhat : public TunableVectorYZ {

  protected:
    Arg &arg;
    const LatticeField &meta;

    bool compute_max_only;

    long long flops() const { return 2l * arg.coarseVolumeCB * 8 * n * n * (8*n-2); } // 2 from parity, 8 from dir; each of the n*n output elements is a length-n complex dot product costing 8*n-2 flops (6 per complex multiply, 2 per complex add)
    long long bytes() const { return 2l * (arg.Xinv.Bytes() + 8*arg.Y.Bytes() + 8*arg.Yhat.Bytes()) * n; }

    unsigned int minThreads() const { return arg.coarseVolumeCB; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimension

  public:
    CalculateYhat(Arg &arg, const LatticeField &meta) :
      TunableVectorYZ(2 * n, 4 * n),
      arg(arg),
      meta(meta),
      compute_max_only(false)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
#ifdef JITIFY
        create_jitify_program("kernels/coarse_op_preconditioned.cuh");
#endif
        arg.max_d = static_cast<Float*>(pool_device_malloc(sizeof(Float)));
      }
      arg.max_h = static_cast<Float*>(pool_pinned_malloc(sizeof(Float)));
      strcpy(aux, compile_type_str(meta));
      strcat(aux, comm_dim_partitioned_string());
    }

    virtual ~CalculateYhat() {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        pool_device_free(arg.max_d);
      }
      pool_pinned_free(arg.max_h);
    }

    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {

        if (compute_max_only)
          CalculateYhatCPU<Float, n, true, Arg>(arg);
        else
          CalculateYhatCPU<Float, n, false, Arg>(arg);

      } else {
        if (compute_max_only) {
          if (!activeTuning()) {
            cudaMemsetAsync(arg.max_d, 0, sizeof(Float), stream);
          }
        }
#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::CalculateYhatGPU")
                         .instantiate(Type<Float>(), n, compute_max_only, Type<Arg>())
                         .configure(tp.grid, tp.block, tp.shared_bytes, stream)
                         .launch(arg);
#else
        if (compute_max_only)
          CalculateYhatGPU<Float, n, true, Arg><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        else
          CalculateYhatGPU<Float, n, false, Arg><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#endif
        if (compute_max_only) {
          if (!activeTuning()) { // only do copy once tuning is done
            qudaMemcpyAsync(arg.max_h, arg.max_d, sizeof(Float), cudaMemcpyDeviceToHost, stream);
            qudaStreamSynchronize(const_cast<cudaStream_t&>(stream));
          }
        }
      }
    }

    void setComputeMaxOnly(bool compute_max_only_) { compute_max_only = compute_max_only_; }

    // no locality in this kernel so no point in shared-memory tuning
    bool advanceSharedBytes(TuneParam &param) const { return false; }

    bool advanceTuneParam(TuneParam &param) const {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION && meta.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
      else return false;
    }

    TuneKey tuneKey() const {
      char Aux[TuneKey::aux_n];
      strcpy(Aux, aux);
      if (compute_max_only) strcat(Aux, ",compute_max_only");
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        strcat(Aux, meta.MemType() == QUDA_MEMORY_MAPPED ? ",GPU-mapped" : ",GPU-device");
      } else if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        strcat(Aux, ",CPU");
        strcat(Aux, getOmpThreadStr());
      }
      return TuneKey(meta.VolString(), typeid(*this).name(), Aux);
    }
  };
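
  // --------------------------------------------------------------------------
  // For orientation only: the kernels invoked above, CalculateYhatGPU and
  // CalculateYhatCPU, are defined in kernels/coarse_op_preconditioned.cuh.
  // The sketch below is a hypothetical serial reference (not the QUDA kernel,
  // and not its data layout) for the dense complex multiply performed per site
  // and direction, and shows where the (8*n-2) factor in flops() above comes
  // from. It assumes <complex> is included and row-major n x n blocks.
  template <typename Float, int n>
  void yhat_block_sketch(std::complex<Float> *Yhat, const std::complex<Float> *Xinv,
                         const std::complex<Float> *Y)
  {
    for (int i = 0; i < n; i++) {            // row of Xinv
      for (int j = 0; j < n; j++) {          // column of Y
        std::complex<Float> sum = 0;         // length-n complex dot product:
        for (int k = 0; k < n; k++)          // n complex multiplies at 6 flops each,
          sum += Xinv[i*n + k] * Y[k*n + j]; // n-1 complex adds at 2 flops each
        Yhat[i*n + j] = sum;                 // = 8*n - 2 flops per output element
      }
    }
  }
  // --------------------------------------------------------------------------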

  /**
     @brief Calculate the preconditioned coarse links and the coarse clover
     inverse field.
     @param[out] Yhat Preconditioned coarse link field
     @param[out] Xinv Coarse clover inverse field
     @param[in] Y Coarse link field
     @param[in] X Coarse clover field
   */
  template <typename storeFloat, typename Float, int N, QudaGaugeFieldOrder gOrder>
  void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X)
  {
    // invert the clover matrix field
    const int n = X.Ncolor();
    if (X.Location() == QUDA_CUDA_FIELD_LOCATION && X.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      GaugeFieldParam param(X);
      // need to copy into AoS format for CUBLAS
      param.order = QUDA_MILC_GAUGE_ORDER;
      param.setPrecision( X.Precision() < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : X.Precision() );
      cudaGaugeField X_(param);
      cudaGaugeField Xinv_(param);
      X_.copy(X);
      blas::flops += cublas::BatchInvertMatrix((void*)Xinv_.Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X.Location());

      if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale( Xinv_.abs_max() );

      Xinv.copy(Xinv_);

    } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
      const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(&X);
      cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
      blas::flops += cublas::BatchInvertMatrix(((void**)Xinv_h->Gauge_p())[0], ((void**)X_h->Gauge_p())[0], n, X_h->Volume(), X.Precision(), QUDA_CPU_FIELD_LOCATION);
    } else {
      errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
    }
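
    // ------------------------------------------------------------------------
    // For orientation only: cublas::BatchInvertMatrix (blas_cublas.cu) hides
    // the batched inversion used above. A minimal standalone sketch of batched
    // LU inversion with cuBLAS follows, kept in a comment since it cannot live
    // mid-function. It assumes single-precision complex matrices packed
    // contiguously, that <cublas_v2.h>, <cuComplex.h>, <cuda_runtime.h> and
    // <vector> are included at file scope, and it omits all error checking;
    // names are illustrative, not the QUDA implementation.
    //
    //   void batch_invert_sketch(cuComplex *d_Ainv, cuComplex *d_A, int n, int batch)
    //   {
    //     cublasHandle_t handle;
    //     cublasCreate(&handle);
    //
    //     // batched cuBLAS routines take device arrays of per-matrix pointers
    //     std::vector<cuComplex*> hA(batch), hAinv(batch);
    //     for (int i = 0; i < batch; i++) { hA[i] = d_A + i*n*n; hAinv[i] = d_Ainv + i*n*n; }
    //     cuComplex **dA, **dAinv;
    //     cudaMalloc(&dA, batch*sizeof(cuComplex*));
    //     cudaMalloc(&dAinv, batch*sizeof(cuComplex*));
    //     cudaMemcpy(dA, hA.data(), batch*sizeof(cuComplex*), cudaMemcpyHostToDevice);
    //     cudaMemcpy(dAinv, hAinv.data(), batch*sizeof(cuComplex*), cudaMemcpyHostToDevice);
    //
    //     int *d_pivot, *d_info;
    //     cudaMalloc(&d_pivot, batch*n*sizeof(int));
    //     cudaMalloc(&d_info, batch*sizeof(int));
    //
    //     // in-place LU factorization of every n x n matrix in the batch,
    //     // followed by out-of-place inversion from the LU factors
    //     cublasCgetrfBatched(handle, n, dA, n, d_pivot, d_info, batch);
    //     cublasCgetriBatched(handle, n, dA, n, d_pivot, dAinv, n, d_info, batch);
    //
    //     cudaFree(d_pivot); cudaFree(d_info); cudaFree(dA); cudaFree(dAinv);
    //     cublasDestroy(handle);
    //   }
    // ------------------------------------------------------------------------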

    // now exchange the Y halos, for both the forward and backward links, for the multi-process dslash
    const_cast<GaugeField&>(Y).exchangeGhost(QUDA_LINK_BIDIRECTIONAL);

    // compute the preconditioned links
    // Yhat_back(x-\mu) = Y_back(x-\mu) * Xinv^dagger(x) (positive projector)
    // Yhat_fwd(x) = Xinv(x) * Y_fwd(x) (negative projector)
    {
      int xc_size[5];
      for (int i=0; i<4; i++) xc_size[i] = X.X()[i];
      xc_size[4] = 1;

      // use a spin-ignorant accessor to make the multiplication simpler
      typedef typename gauge::FieldOrder<Float,N,1,gOrder,true,storeFloat> gCoarse;
      typedef typename gauge::FieldOrder<Float,N,1,gOrder,true,storeFloat> gPreconditionedCoarse;
      gCoarse yAccessor(const_cast<GaugeField&>(Y));
      gPreconditionedCoarse yHatAccessor(const_cast<GaugeField&>(Yhat));
      gCoarse xInvAccessor(const_cast<GaugeField&>(Xinv));
      if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Xinv = %e\n", Xinv.norm2(0));

      int comm_dim[4];
      for (int i=0; i<4; i++) comm_dim[i] = comm_dim_partitioned(i);
      typedef CalculateYhatArg<Float, gPreconditionedCoarse, gCoarse, N> yHatArg;
      yHatArg arg(yHatAccessor, yAccessor, xInvAccessor, xc_size, comm_dim, 1);

      CalculateYhat<Float, N, yHatArg> yHat(arg, Y);

      // for fixed-point storage, first find the global maximum so we can set the scale
      if (Yhat.Precision() == QUDA_HALF_PRECISION || Yhat.Precision() == QUDA_QUARTER_PRECISION) {
        yHat.setComputeMaxOnly(true);
        yHat.apply(0);

        double max_h_double = *arg.max_h;
        comm_allreduce_max(&max_h_double);
        *arg.max_h = static_cast<Float>(max_h_double);

        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Yhat Max = %e\n", *arg.max_h);

        Yhat.Scale(*arg.max_h);
        arg.Yhat.resetScale(*arg.max_h);
      }
      yHat.setComputeMaxOnly(false);
      yHat.apply(0);

      if (getVerbosity() >= QUDA_VERBOSE)
        for (int d = 0; d < 8; d++)
          printfQuda("Yhat[%d] = %e (%e %e = %e x %e)\n", d, Yhat.norm2(d), Yhat.abs_max(d),
                     Y.abs_max(d) * Xinv.abs_max(0), Y.abs_max(d), Xinv.abs_max(0));
    }

    // fill back in the bulk of Yhat so that the backward links are updated on the previous node:
    // we need to write into the bulk of the previous node, sending only the backward links
    // backwards so as not to overwrite the forward bulk
    Yhat.injectGhost(QUDA_LINK_BACKWARDS);

    // exchange the forward links for the multi-process dslash dagger:
    // we need to write into the ghost zone of the next node, sending only the forward links
    // forwards so as not to overwrite the backward ghost
    Yhat.exchangeGhost(QUDA_LINK_FORWARDS);
  }
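
  // --------------------------------------------------------------------------
  // For orientation only: when Yhat is stored in a fixed-point format (half or
  // quarter precision), the function above makes two passes: a compute_max_only
  // pass to find the local maximum, a global reduction of that maximum, a
  // rescale of the destination field, and then the actual computation. The
  // hypothetical host-side sketch below shows the same two-pass pattern in
  // miniature, with int16_t storage standing in for QUDA's fixed-point format;
  // it assumes <cmath>, <cstdint> and <vector> are included at file scope.
  inline void store_fixed_point_sketch(std::vector<int16_t> &out, const std::vector<float> &in)
  {
    // pass 1: find the maximum absolute value (the analogue of compute_max_only)
    float max = 0.0f;
    for (float v : in) max = std::fmax(max, std::fabs(v));

    // set the scale so that max maps to the largest representable int16_t
    const float scale = (max > 0.0f) ? 32767.0f / max : 1.0f;

    // pass 2: recompute and store the scaled values (the second apply() above)
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) out[i] = static_cast<int16_t>(std::lrintf(in[i] * scale));
  }
  // --------------------------------------------------------------------------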

  template <typename storeFloat, typename Float, int N>
  void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X)
  {
    if (Y.Location() == QUDA_CPU_FIELD_LOCATION) {
      constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;
      if (Y.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", Y.FieldOrder());
      calculateYhat<storeFloat,Float,N,gOrder>(Yhat, Xinv, Y, X);
    } else {
      constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;
      if (Y.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", Y.FieldOrder());
      calculateYhat<storeFloat,Float,N,gOrder>(Yhat, Xinv, Y, X);
    }
  }

  // template on the number of coarse degrees of freedom
  template <typename storeFloat, typename Float>
  void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X) {
    switch (Y.Ncolor()) {
    case  2: calculateYhat<storeFloat,Float, 2>(Yhat, Xinv, Y, X); break;
    case  4: calculateYhat<storeFloat,Float, 4>(Yhat, Xinv, Y, X); break;
    case  8: calculateYhat<storeFloat,Float, 8>(Yhat, Xinv, Y, X); break;
    case 12: calculateYhat<storeFloat,Float,12>(Yhat, Xinv, Y, X); break;
    case 16: calculateYhat<storeFloat,Float,16>(Yhat, Xinv, Y, X); break;
    case 20: calculateYhat<storeFloat,Float,20>(Yhat, Xinv, Y, X); break;
    case 24: calculateYhat<storeFloat,Float,24>(Yhat, Xinv, Y, X); break;
    case 32: calculateYhat<storeFloat,Float,32>(Yhat, Xinv, Y, X); break;
    case 48: calculateYhat<storeFloat,Float,48>(Yhat, Xinv, Y, X); break;
    case 64: calculateYhat<storeFloat,Float,64>(Yhat, Xinv, Y, X); break;
    default: errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor()); break;
    }
  }

#endif

  // Compute the preconditioned coarse links (Yhat) and the coarse clover inverse field (Xinv)
  void calculateYhat(GaugeField &Yhat, GaugeField &Xinv, const GaugeField &Y, const GaugeField &X) {

#ifdef GPU_MULTIGRID
    QudaPrecision precision = checkPrecision(Xinv, Y, X);
    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing Yhat field......\n");

    if (precision == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
      if (Yhat.Precision() != QUDA_DOUBLE_PRECISION) errorQuda("Unsupported precision %d\n", Yhat.Precision());
      calculateYhat<double,double>(Yhat, Xinv, Y, X);
#else
      errorQuda("Double precision multigrid has not been enabled");
#endif
    } else if (precision == QUDA_SINGLE_PRECISION) {
      if (Yhat.Precision() == QUDA_SINGLE_PRECISION) {
        calculateYhat<float, float>(Yhat, Xinv, Y, X);
      } else {
        errorQuda("Unsupported precision %d\n", precision);
      }
    } else if (precision == QUDA_HALF_PRECISION) {
      if (Yhat.Precision() == QUDA_HALF_PRECISION) {
        calculateYhat<short, float>(Yhat, Xinv, Y, X);
      } else {
        errorQuda("Unsupported precision %d\n", precision);
      }
    } else {
      errorQuda("Unsupported precision %d\n", precision);
    }

    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("....done computing Yhat field\n");
#else
    errorQuda("Multigrid has not been built");
#endif
  }
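
  // --------------------------------------------------------------------------
  // Hypothetical call site, for orientation only (not the QUDA multigrid setup
  // code): given the coarse links Y and the coarse clover field X, a caller
  // would allocate Yhat and Xinv with matching geometry and invoke the entry
  // point above. The minimal param handling here is an assumption; a real
  // caller may adjust precision and field order before construction.
  inline void build_preconditioned_links_sketch(const GaugeField &Y, const GaugeField &X)
  {
    GaugeFieldParam yParam(Y);   // clone Y's geometry, order and precision
    cudaGaugeField Yhat(yParam); // destination for the preconditioned links

    GaugeFieldParam xParam(X);   // clone X's geometry, order and precision
    cudaGaugeField Xinv(xParam); // destination for the coarse clover inverse

    calculateYhat(Yhat, Xinv, Y, X); // fills Yhat and Xinv
  }
  // --------------------------------------------------------------------------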

} // namespace quda