#if __cplusplus >= 201402L
#include <integer_sequence.hpp> // std::index_sequence support, used below to unpack the B vectors
#endif

  template <typename sumType, typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor_,
            int coarseSpin, int nVec>
  class BlockOrtho : public Tunable {

    // fixed-point storage of the B fields is only instantiated on the fine grid, where nColor = 3;
    // otherwise the effective color count matches the template parameter
    static constexpr int nColor = isFixed<bFloat>::value ? 3 : nColor_;

    typedef typename mapper<vFloat>::type RegType; // register type used for accumulation
    ColorSpinorField &V;                           // packed null-space vectors being orthogonalized
    const std::vector<ColorSpinorField*> B;        // the individual null-space vectors
    const int *fine_to_coarse;                     // fine-site to coarse-block lookup table
    const int *coarse_to_fine;                     // coarse-block to fine-site lookup table
    const int *geo_bs;                             // geometric block size in each dimension
    const int n_block_ortho;                       // number of block-orthogonalization passes
    int geoBlockSize;                              // aggregate geometric block volume
    int nBlock;                                    // total number of blocks (geometric x chiral)
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    unsigned int minThreads() const { return V.VolumeCB(); }
  public:
    BlockOrtho(ColorSpinorField &V, const std::vector<ColorSpinorField *> B, const int *fine_to_coarse,
               const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho) :
      V(V),
      B(B),
      fine_to_coarse(fine_to_coarse),
      coarse_to_fine(coarse_to_fine),
      geo_bs(geo_bs),
      n_block_ortho(n_block_ortho)
    {
      if (nColor_ != nColor)
        errorQuda("Number of colors %d not supported with this precision %lu\n", nColor_, sizeof(bFloat));

#ifdef JITIFY
      create_jitify_program("kernels/block_orthogonalize.cuh");
#endif

      strcpy(aux, compile_type_str(V));
      strcat(aux, V.AuxString());
      strcat(aux, ",block_size=");

      char geo_str[16];
      geoBlockSize = 1;
      for (int d = 0; d < V.Ndim(); d++) {
        geoBlockSize *= geo_bs[d];
        i32toa(geo_str, geo_bs[d]);
        strcat(aux, geo_str);
        if (d < V.Ndim() - 1) strcat(aux, "x");
      }

      strcat(aux, ",n_block_ortho=");
      char n_ortho_str[8];
      i32toa(n_ortho_str, n_block_ortho);
      strcat(aux, n_ortho_str);

      if (V.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());

      // staggered fields (nSpin == 1) are always treated as having two chiral blocks
      int chiralBlocks = (nSpin == 1) ? 2 : V.Nspin() / spinBlockSize;
      nBlock = (V.Volume() / geoBlockSize) * chiralBlocks;
    }

    virtual ~BlockOrtho() { }
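    /**
       @brief Expand the std::vector of B fields into a parameter pack via the index_sequence, so the
       field accessors can be built into the argument struct, and run the CPU variant of the block
       orthogonalization.
     */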
    template <typename Rotator, typename Vector, std::size_t... S>
    void CPU(const std::vector<ColorSpinorField*> &B, std::index_sequence<S...>) {
      typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
      // the BlockOrthoArg constructor (kernels/block_orthogonalize.cuh) consumes V, the lookup tables,
      // the blocking parameters and the unpacked null-space vectors B[S]...
      Arg arg(V, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho, B[S]...);
      blockOrthoCPU<sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg>(arg);
    }
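    /**
       @brief As above, but for the GPU variant: expand the B vector into a parameter pack, copy the
       host-side B array to device constant memory, and launch the block-orthogonalization kernel with
       the tuned launch configuration.
     */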
    template <typename Rotator, typename Vector, std::size_t... S>
    void GPU(const TuneParam &tp, const cudaStream_t &stream, const std::vector<ColorSpinorField*> &B,
             std::index_sequence<S...>) {
      typedef typename mapper<vFloat>::type RegType;
      typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
      // the BlockOrthoArg constructor (kernels/block_orthogonalize.cuh) consumes V, the lookup tables,
      // the blocking parameters and the unpacked null-space vectors B[S]...
      Arg arg(V, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho, B[S]...);
      arg.swizzle = tp.aux.x;
#ifdef JITIFY
      using namespace jitify::reflection;
      auto instance = program->kernel("quda::blockOrthoGPU")
        .instantiate((int)tp.block.x,Type<sumType>(),Type<RegType>(),nSpin,spinBlockSize,nColor,coarseSpin,nVec,Type<Arg>());
      jitify_error = instance.configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else
      // copy the host-side B-field array to device constant memory before launching the kernel
      cudaMemcpyToSymbolAsync(B_array_d, B_array_h, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, stream);
      LAUNCH_KERNEL_MG_BLOCK_SIZE(blockOrthoGPU,tp,stream,arg,sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg);
#endif
    }
    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      if (V.Location() == QUDA_CPU_FIELD_LOCATION) {
        if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
          // Rotator and Vector are the FieldOrderCB accessor types for V and the B fields in this order
          CPU<Rotator,Vector>(B, std::make_index_sequence<nVec>());
        } else {
          errorQuda("Unsupported field order %d\n", V.FieldOrder());
        }
      } else {
#if __COMPUTE_CAPABILITY__ >= 300
        if (V.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && B[0]->FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
          // Rotator and Vector are the FieldOrderCB accessor types for V and the B fields in this order
          GPU<Rotator,Vector>(tp, stream, B, std::make_index_sequence<nVec>());
        } else {
          errorQuda("Unsupported field order V=%d B=%d\n", V.FieldOrder(), B[0]->FieldOrder());
        }
#else
        errorQuda("GPU block orthogonalization not supported on this GPU architecture");
#endif
      }
    }
    // tune the block-to-SM swizzle factor over the range [1, 2 * #SMs]
    bool advanceAux(TuneParam &param) const
    {
      if (param.aux.x < 2 * deviceProp.multiProcessorCount) {
        param.aux.x++;
        return true;
      } else {
        param.aux.x = 1;
        return false;
      }
    }

    bool advanceTuneParam(TuneParam &param) const {
      return advanceSharedBytes(param) || advanceAux(param);
    }
    TuneKey tuneKey() const { return TuneKey(V.VolString(), typeid(*this).name(), aux); }

    void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }

    /** sets default values for when tuning is disabled */
    void defaultTuneParam(TuneParam &param) const {
      param.block = dim3(geoBlockSize/2, V.SiteSubset(), 1);
      param.grid = dim3((minThreads() + param.block.x - 1) / param.block.x, 1, coarseSpin);
      param.shared_bytes = 0;
      param.aux.x = 1; // swizzle factor
    }
    long long flops() const
    {
      // per pass and per block element: each of the nVec vectors is projected against the vectors
      // before it (inner product plus update) and then normalized
      return n_block_ortho * nBlock * (geoBlockSize / 2) * (spinBlockSize == 0 ? 1 : 2 * spinBlockSize) / 2 * nColor
        * (nVec * ((nVec - 1) * (8l + 8l)) + 6l);
    }

    long long bytes() const
    {
      // the first pass reads the B vectors, subsequent passes re-read V; every pass also reads the
      // already-orthogonalized vectors for the projections and writes V back out
      return nVec * B[0]->Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes()
        + (n_block_ortho - 1) * (V.Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes());
    }

    char *saveOut, *saveOutNorm;

    void preTune() { V.backup(); }   // autotuning overwrites V in place, so back it up first
    void postTune() { V.restore(); } // restore the pre-tuning contents of V

  };
  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor, int nVec>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
  {
    int geo_blocksize = 1;
    for (int d = 0; d < V.Ndim(); d++) geo_blocksize *= geo_bs[d];

    int blocksize = geo_blocksize * V.Ncolor();
    if (spinBlockSize == 0) { blocksize /= 2; } else { blocksize *= spinBlockSize; }
    int chiralBlocks = (spinBlockSize == 0) ? 2 : V.Nspin() / spinBlockSize;
    int numblocks = (V.Volume() / geo_blocksize) * chiralBlocks;
    // number of chiral (coarse spin) components
    constexpr int coarseSpin = (nSpin == 4 || nSpin == 2 || spinBlockSize == 0) ? 2 : 1;

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("Block Orthogonalizing %d blocks of %d length and width %d repeating %d times\n", numblocks,
                 blocksize, nVec, n_block_ortho);

    BlockOrtho<double, vFloat, bFloat, nSpin, spinBlockSize, nColor, coarseSpin, nVec> ortho(
      V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
    ortho.apply(0);
  }
  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
  {
    const int Nvec = B.size();
    if (V.Ncolor()/Nvec == 3) {

      constexpr int nColor = 3;

      if (Nvec == 6) { // e.g. free-field Wilson
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
                                                                            geo_bs, n_block_ortho);
      } else if (Nvec == 24) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 48) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 48>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 6) {

      constexpr int nColor = 6;

      if (Nvec == 6) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
                                                                            geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 24) {

      constexpr int nColor = 24;

      if (Nvec == 24) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 32) {

      constexpr int nColor = 32;

      if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else {
      errorQuda("Unsupported nColor %d\n", V.Ncolor()/Nvec);
    }
  }
  template <typename vFloat, typename bFloat>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
  {
    if (V.Nspin() == 2 && spin_bs == 1) { // coarsening coarse fermions
      BlockOrthogonalize<vFloat, bFloat, 2, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#ifdef GPU_WILSON_DIRAC
    } else if (V.Nspin() == 4 && spin_bs == 2) { // coarsening Wilson-like fermions
      BlockOrthogonalize<vFloat, bFloat, 4, 2>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#endif
#ifdef GPU_STAGGERED_DIRAC
    } else if (V.Nspin() == 1 && spin_bs == 1) { // coarsening staggered fermions
      BlockOrthogonalize<vFloat, bFloat, 1, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#endif
    } else {
      errorQuda("Unsupported nSpin %d and spinBlockSize %d combination.\n", V.Nspin(), spin_bs);
    }
  }
#endif // GPU_MULTIGRID

  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
  {
#ifdef GPU_MULTIGRID
    if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0]->Precision() == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
      BlockOrthogonalize<double>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("Double precision multigrid has not been enabled");
#endif
    } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
      BlockOrthogonalize<float, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
    } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 2
      BlockOrthogonalize<short, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
      BlockOrthogonalize<short, short>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else {
      errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0]->Precision());
    }
#else
    errorQuda("Multigrid has not been built");
#endif // GPU_MULTIGRID
  }
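
  // Illustrative only: a minimal sketch of how the public entry point above is typically driven from
  // multigrid setup code. The field V, the null-space vectors B, and the two lookup tables are assumed
  // to have been created elsewhere (e.g. by the Transfer class); the function name and the 4^4 blocking
  // chosen here are hypothetical, not part of the original file.
  inline void example_block_ortho_usage(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B,
                                        const int *fine_to_coarse, const int *coarse_to_fine)
  {
    const int geo_bs[4] = {4, 4, 4, 4}; // geometric aggregate size in (x, y, z, t)
    const int spin_bs = 2;              // 2 for Wilson-like fermions, 1 for staggered or coarse fields
    const int n_block_ortho = 2;        // repeat the block Gram-Schmidt twice for improved orthogonality
    BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
  }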