#ifdef GPU_STAGGERED_DIRAC

  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };

  // Argument struct shared by the interior and exterior outer-product kernels.
  template <typename Float, typename Output, typename InputA, typename InputB> struct StaggeredOprodArg {
    unsigned int length;       // number of sites this kernel iterates over
    int X[4];                  // local lattice dimensions
    unsigned int parity;       // parity of the sites being updated
    unsigned int dir;          // face direction (exterior kernel only)
    unsigned int displacement; // hop length: 1 (fat-link) or 3 (long-link) term
    OprodKernelType kernelType;
    int nFace;
    bool partitioned[4];       // which dimensions are split across processes
    unsigned int ghostOffset[4];
    InputA inA;
    InputB inB;
    Output outA;
    Output outB;
    Float coeff[2];

    StaggeredOprodArg(int parity, int dir, const unsigned int *ghostOffset, int displacement,
                      const OprodKernelType &kernelType, int nFace, const double coeff[2], InputA &inA, InputB &inB,
                      Output &outA, Output &outB, GaugeField &meta) :
      length(meta.VolumeCB()),
      parity(parity),
      dir(dir),
      displacement(displacement),
      kernelType(kernelType),
      nFace(nFace),
      inA(inA),
      inB(inB),
      outA(outA),
      outB(outB)
    {
      this->coeff[0] = coeff[0];
      this->coeff[1] = coeff[1];
      for (int i = 0; i < 4; ++i) this->X[i] = meta.X()[i];
      for (int i = 0; i < 4; ++i) this->ghostOffset[i] = ghostOffset[i];
      for (int i = 0; i < 4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };
  // Recover the full lattice coordinates c[] (and the full linear index idx) from a
  // checkerboard index.  The template parameter records which dimension has even
  // extent, so the parity offset can be folded in with a minimum of integer divisions.
  template <IndexType idxType>
  __device__ inline void coordsFromIndex(int &idx, int c[4], unsigned int cb_idx, int parity, const int X[4])
  {
    const int XYZ = X[2]*X[1]*X[0];
    const int XY = X[1]*X[0];

    idx = 2*cb_idx;
    int x, y, z, t;

    if (idxType == EVEN_X) { // X even
      int aux1 = idx / X[0];
      x = idx - aux1*X[0];
      int aux2 = aux1 / X[1];
      y = aux1 - aux2*X[1];
      t = aux2 / X[2];
      z = aux2 - t*X[2];
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % X[2];
      idx += (parity + t + z) & 1;
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    } else if (idxType == EVEN_Z) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % X[2];
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    } else { // all dimensions odd
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % X[2];
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    }

    c[0] = x; c[1] = y; c[2] = z; c[3] = t;
  }
  // Recover the full coordinates x[] of a boundary site from its index within the
  // face of dimension dir.  The face has thickness `displacement`, so the coordinate
  // in the dir direction starts at X[dir] - displacement.
  __device__ inline void coordsFromIndex(int x[4], unsigned int cb_idx, const int X[4], int dir, int displacement,
                                         int parity)
  {
    int Xh[2] = {X[0] / 2, X[1] / 2};

    switch (dir) {
    case 0:
      x[2] = cb_idx / Xh[1] % X[2];
      x[3] = cb_idx / (Xh[1] * X[2]) % X[3];
      x[0] = cb_idx / (Xh[1] * X[2] * X[3]);
      x[0] += (X[0] - displacement);
      x[1] = 2 * (cb_idx % Xh[1]) + ((x[0] + x[2] + x[3] + parity) & 1);
      break;
    case 1:
      x[2] = cb_idx / Xh[0] % X[2];
      x[3] = cb_idx / (Xh[0] * X[2]) % X[3];
      x[1] = cb_idx / (Xh[0] * X[2] * X[3]);
      x[1] += (X[1] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 2:
      x[1] = cb_idx / Xh[0] % X[1];
      x[3] = cb_idx / (Xh[0] * X[1]) % X[3];
      x[2] = cb_idx / (Xh[0] * X[1] * X[3]);
      x[2] += (X[2] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 3:
      x[1] = cb_idx / Xh[0] % X[1];
      x[2] = cb_idx / (Xh[0] * X[1]) % X[2];
      x[3] = cb_idx / (Xh[0] * X[1] * X[2]);
      x[3] += (X[3] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    }
  }
  // Index of the site displaced from cb_idx by shift[], or -1 if the shift crosses a
  // partitioned boundary (that contribution is handled by the exterior kernel).
  __device__ inline int neighborIndex(unsigned int cb_idx, const int shift[4], const bool partitioned[4], int parity,
                                      const int X[4])
  {
    int full_idx;
    int x[4];
    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    for (int dim = 0; dim < 4; ++dim) {
      if (partitioned[dim])
        if ((x[dim] + shift[dim]) < 0 || (x[dim] + shift[dim]) >= X[dim]) return -1;
    }

    for (int dim = 0; dim < 4; ++dim) {
      x[dim] = shift[dim] ? (x[dim] + shift[dim] + X[dim]) % X[dim] : x[dim];
    }

    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }
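
  // A minimal host-side sketch (not part of the original file) of the indexing
  // convention used above: the checkerboard index of a site is its full
  // lexicographical index (x fastest, t slowest) divided by two, matching the return
  // statement of neighborIndex().  Handy for round-trip checks in host code.
  inline int hostCheckerboardIndex(const int x[4], const int X[4])
  {
    return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  }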
  // Interior kernel: each thread processes one site of the given parity, accumulating
  // the outer product of the local vector with its one-hop (and, for nFace == 3,
  // three-hop) neighbour in each of the four positive directions.
  template <typename real, typename Output, typename InputA, typename InputB>
  __global__ void interiorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex, 3>;

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex x[3], y[3], z[3];
    matrix result;

    while (idx < arg.length) {
      arg.inA.load(x, idx);

#pragma unroll
      for (int dim = 0; dim < 4; ++dim) {
        int shift[4] = {0, 0, 0, 0};
        shift[dim] = 1;
        const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
        if (first_nbr_idx >= 0) {
          arg.inB.load(y, first_nbr_idx);
          outerProd(y, x, &result); // color outer product of the two vectors
          matrix tempA = arg.outA(dim, idx, arg.parity);
          result = tempA + result*arg.coeff[0];
          arg.outA(dim, idx, arg.parity) = result;

          if (arg.nFace == 3) {
            shift[dim] = 3;
            const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
            if (third_nbr_idx >= 0) {
              arg.inB.load(z, third_nbr_idx);
              outerProd(z, x, &result);
              matrix tempB = arg.outB(dim, idx, arg.parity);
              result = tempB + result*arg.coeff[1];
              arg.outB(dim, idx, arg.parity) = result;
            }
          }
        }
      } // dim

      idx += gridSize;
    }
  }
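
  // The kernels in this file accumulate the color outer product computed by QUDA's
  // outerProd() helper, which is defined elsewhere.  As a hedged illustration of the
  // operation being accumulated, a standalone sketch might look like the following;
  // the name and the conjugation convention (m(i,j) = a(i) * conj(b(j))) are
  // assumptions for illustration, not taken from the original source.
  template <typename complex_t>
  __device__ __host__ inline void outerProdSketch(const complex_t a[3], const complex_t b[3], complex_t m[3][3])
  {
    for (int i = 0; i < 3; i++)
      for (int j = 0; j < 3; j++) m[i][j] = a[i] * conj(b[j]);
  }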
  // Exterior kernel: each thread updates one site on the face of dimension `dim`,
  // whose positive-direction neighbour now lives in the ghost zone received from the
  // adjacent process.
  template <int dim, typename real, typename Output, typename InputA, typename InputB>
  __global__ void exteriorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex, 3>;

    unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex a[3], b[3];
    matrix result;
    int x[4];

    // displacement selects the one-hop (fat-link) or three-hop (long-link) contribution
    Output &out = (arg.displacement == 1) ? arg.outA : arg.outB;
    real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];

    while (cb_idx < arg.length) {
      coordsFromIndex(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);

      matrix inmatrix = out(arg.dir, bulk_cb_idx, arg.parity);
      arg.inA.load(a, bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;
      arg.inB.loadGhost(b, ghost_idx, arg.dir);

      outerProd(b, a, &result); // color outer product of bulk and ghost vectors
      result = inmatrix + result*coeff;
      out(arg.dir, bulk_cb_idx, arg.parity) = result;

      cb_idx += gridSize;
    }
  }
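
  // A small host-side sketch (an assumption, not part of the original file) of how
  // the ghost zone consumed by exteriorOprodKernel is assumed to be sized: each
  // partitioned dimension contributes nFace slices of faceVolumeCB[i] color vectors
  // per parity, and ghostOffset[i] marks where that dimension's buffer starts.
  inline size_t expectedGhostVectors(const int faceVolumeCB[4], const bool partitioned[4], int nFace)
  {
    size_t total = 0;
    for (int i = 0; i < 4; i++)
      if (partitioned[i]) total += size_t(nFace) * faceVolumeCB[i];
    return total;
  }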
  // Tunable wrapper that launches the interior or exterior kernel described by arg.
  template <typename Float, typename Output, typename InputA, typename InputB>
  class StaggeredOprodField : public Tunable {

    StaggeredOprodArg<Float, Output, InputA, InputB> &arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.outA.volumeCB; }
    bool tuneGridDim() const { return false; }

  public:
    StaggeredOprodField(StaggeredOprodArg<Float, Output, InputA, InputB> &arg, const GaugeField &meta) :
      arg(arg), meta(meta)
    {
      writeAuxString("threads=%d,prec=%lu,stride=%d", arg.length, sizeof(Complex)/2, arg.inA.Stride());
    }

    virtual ~StaggeredOprodField() {}

    void apply(const cudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported\n");
        }
      } else { // no CPU fallback is provided
        errorQuda("No CPU support for staggered outer-product calculation\n");
      }
    }

    void preTune() { this->arg.outA.save(); this->arg.outB.save(); }
    void postTune() { this->arg.outA.load(); this->arg.outB.load(); }

    long long flops() const { return 0; }
    long long bytes() const { return 0; }
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  }; // StaggeredOprodField
  // Exchange the ghost zones of the source field so the exterior kernels can read
  // neighbours living on adjacent processes: pack the faces, gather them, start and
  // complete the communications, then scatter the received faces into the ghost buffers.
  void exchangeGhost(int nFace, cudaColorSpinorField &a, int parity, int dag)
  {
    // all halo buffers live on the device here
    MemoryLocation location[2 * QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
    a.pack(nFace, 1 - parity, dag, Nstream - 1, location, Device);

    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) a.gather(nFace, dag, 2*i);
    }

    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) a.commsStart(nFace, 2*i, dag);
    }

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        a.commsWait(nFace, 2*i, dag);
        a.scatter(nFace, dag, 2*i);
      }
    }

    qudaDeviceSynchronize();
    a.bufferIndex = (1 - a.bufferIndex);
  }
  // Launch the interior kernel over the local volume, then one exterior kernel per
  // partitioned dimension (one-hop, plus three-hop when nFace == 3), reusing a single
  // argument struct whose kernelType/dir/displacement/length fields are updated in place.
  template <typename Float, typename Output, typename InputA, typename InputB>
  void computeStaggeredOprodCuda(Output outA, Output outB, GaugeField &outFieldA, GaugeField &outFieldB, InputA &inA,
                                 InputB &inB, cudaColorSpinorField &src, int parity, const int faceVolumeCB[4],
                                 const double coeff[2], int nFace)
  {
    unsigned int ghostOffset[4] = {0, 0, 0, 0};
    for (int dir = 0; dir < 4; ++dir) ghostOffset[dir] = src.GhostOffset(dir, 1) / src.FieldOrder();

    StaggeredOprodArg<Float, Output, InputA, InputB> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, nFace, coeff,
                                                         inA, inB, outA, outB, outFieldA);
    StaggeredOprodField<Float, Output, InputA, InputB> oprod(arg, outFieldA);

    // interior contribution over the full checkerboarded volume
    arg.kernelType = OPROD_INTERIOR_KERNEL;
    arg.length = src.VolumeCB();
    oprod.apply(streams[Nstream - 1]);

    // boundary contributions, one partitioned dimension at a time
    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        arg.kernelType = OPROD_EXTERIOR_KERNEL;
        arg.dir = i;

        // one-hop (fat-link) term
        arg.displacement = 1;
        arg.length = faceVolumeCB[i];
        oprod.apply(streams[Nstream - 1]);

        // three-hop (long-link) term
        if (nFace == 3) {
          arg.displacement = 3;
          arg.length = arg.displacement * faceVolumeCB[i];
          oprod.apply(streams[Nstream - 1]);
        }
      }
    }

    checkCudaError();
  }
#endif // GPU_STAGGERED_DIRAC

  // Host entry point for a single parity: select the source field that supplies the
  // neighbours, exchange its ghost zones, and dispatch on precision.  The construction
  // of the precision-specific spinor and gauge-field accessors (spinorA, spinorB and
  // the ordered views of outA/outB) is not shown here.
  void computeStaggeredOprod(GaugeField &outA, GaugeField &outB, ColorSpinorField &inEven, ColorSpinorField &inOdd,
                             int parity, const double coeff[2], int nFace)
  {
#ifdef GPU_STAGGERED_DIRAC
    ColorSpinorField &inB = (parity & 1) ? inEven : inOdd; // opposite-parity field supplying the neighbours

    if (inB.Precision() == QUDA_DOUBLE_PRECISION) {
      // ... double-precision accessors constructed here ...
      exchangeGhost(nFace, static_cast<cudaColorSpinorField &>(inB), parity, 0);
      computeStaggeredOprodCuda<double>(/* gauge accessors for outA, outB */,
                                        outA, outB, spinorA, spinorB, static_cast<cudaColorSpinorField &>(inB),
                                        parity, inB.GhostFace(), coeff, nFace);
    } else {
      // ... single-precision accessors constructed here ...
      exchangeGhost(nFace, static_cast<cudaColorSpinorField &>(inB), parity, 0);
      computeStaggeredOprodCuda<float>(/* gauge accessors for outA, outB */,
                                       outA, outB, spinorA, spinorB, static_cast<cudaColorSpinorField &>(inB),
                                       parity, inB.GhostFace(), coeff, nFace);
    }
#else // GPU_STAGGERED_DIRAC not defined
    errorQuda("Staggered Outer Product has not been built!");
#endif
  }
  // Two-parity entry point.  For nFace == 1 both parities accumulate into the same
  // field and the odd-parity contribution enters with the opposite sign, hence the
  // negated coefficient on the second call; for nFace == 3 the one-hop and three-hop
  // products go into separate output fields.
  void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
  {
    if (nFace == 1) {
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
      double coeff_[2] = {-coeff[0], 0.0};
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 1, coeff_, nFace);
    } else if (nFace == 3) {
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 0, coeff, nFace);
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 1, coeff, nFace);
    } else {
      errorQuda("Invalid nFace %d", nFace);
    }
  }
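
  // A hedged usage sketch (not from the original file) of how a fermion-force routine
  // might drive the public entry point.  The field handles and the unit coefficients
  // are illustrative assumptions only.
  inline void exampleStaggeredOprodCall(GaugeField &oneLink, GaugeField &threeLink, ColorSpinorField &quark)
  {
    GaugeField *out[2] = {&oneLink, &threeLink};
    double coeff[2] = {1.0, 1.0};                // assumed unit weights, for illustration
    computeStaggeredOprod(out, quark, coeff, 3); // nFace = 3 covers the one- and three-hop terms
  }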