quda-ref/v0.7.0/clover__deriv__quda_8cu_source.html

 #include <cstdio>

 #include <cstdlib>

 #include <cuda.h>

 #include <quda_internal.h>

 #include <tune_quda.h>

 #include <gauge_field.h>

 #include <quda_matrix.h>

 #include <cassert>


 namespace quda {


 #ifdef GPU_CLOVER_DIRAC


   // Parameter bundle handed by value to cloverDerivativeKernel.
   //
   // Cmplx is the complex element type the three device buffers are viewed as
   // (the raw Gauge_p() pointers are reinterpret_cast to Cmplx*).
   template<class Cmplx>
     struct CloverDerivArg
     {
       int X[4];        // lattice extents, copied from force.X()
       int border[4];   // per-dimension offset the kernel adds to every site coordinate
       int mu;          // first direction index (used by the kernel to index 4-element arrays)
       int nu;          // second direction index
       typename RealTypeId<Cmplx>::Type coeff;  // both kernel results are multiplied by this
       int parity;      // parity the kernel treats as "this" parity; 1-parity is the "other"
       int volumeCB;    // force.VolumeCB(); upper bound on the kernel's thread index

       Cmplx* gauge;
       Cmplx* force;
       Cmplx* oprod;

       int forceStride;
       int gaugeStride;
       int oprodStride;

       // Bytes()/(2*sizeof(Cmplx)): element count of half of each buffer. The kernel
       // multiplies these by a parity (0 or 1) to pick a half of the buffer.
       int forceOffset;
       int gaugeOffset;
       int oprodOffset;

       bool conjugate;  // selects the isConjugate=true kernel instantiation in CloverDerivative::apply

       // Captures pointers, strides and offsets of the three fields.
       // The initialiser list follows member declaration order, and `conjugate`
       // is now initialised (it was previously left indeterminate).
       CloverDerivArg(cudaGaugeField& force, cudaGaugeField& gauge, cudaGaugeField& oprod, int mu, int nu, double coeff, int parity, bool conjugate) :
         mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.VolumeCB()),
         gauge(reinterpret_cast<Cmplx*>(gauge.Gauge_p())),
         force(reinterpret_cast<Cmplx*>(force.Gauge_p())),
         oprod(reinterpret_cast<Cmplx*>(oprod.Gauge_p())),
         forceStride(force.Stride()), gaugeStride(gauge.Stride()), oprodStride(oprod.Stride()),
         forceOffset(force.Bytes()/(2*sizeof(Cmplx))),
         gaugeOffset(gauge.Bytes()/(2*sizeof(Cmplx))),
         oprodOffset(oprod.Bytes()/(2*sizeof(Cmplx))),
         conjugate(conjugate)
       {
         for(int dir=0; dir<4; ++dir) X[dir] = force.X()[dir];
         //for(int dir=0; dir<4; ++dir) border[dir] =  commDimPartitioned(dir) ? 2 : 0;
         // A border of 2 is applied in every dimension, partitioned or not.
         for(int dir=0; dir<4; ++dir) border[dir] = 2;
       }
     };


   // Convert a checkerboard (half-lattice) site index into 4-d coordinates.
   //   x        : output coordinates; x[0] varies fastest, x[3] slowest
   //   cb_index : site index within one parity
   //   X        : full lattice extents
   //   parity   : x[0] is chosen so that x[0]+x[1]+x[2]+x[3] has the same low bit as parity
   __device__ void getCoords(int x[4], int cb_index, const int X[4], int parity)
   {
     // Checkerboard sizes of a row, a 2-d plane and a 3-d sub-volume.
     const int rowCB   = X[0]/2;
     const int planeCB = X[1]*X[0]/2;
     const int cubeCB  = X[2]*X[1]*X[0]/2;

     const int c3 = cb_index/cubeCB;
     const int c2 = (cb_index/planeCB) % X[2];
     const int c1 = (cb_index/rowCB) % X[1];

     // Low bit that restores the requested parity along dimension 0.
     const int oddBit = (c3 + c2 + c1 + parity) & 1;

     x[3] = c3;
     x[2] = c2;
     x[1] = c1;
     x[0] = 2*(cb_index % rowCB) + oddBit;
   }


   // Checkerboard index of the site x + dx on a lattice with extents X.
   // Each displaced coordinate is wrapped with (x + dx + X) % X, the full
   // lexicographic index is formed with dimension 0 running fastest, and the
   // result is halved to give the index within one parity.
   __device__ int linkIndex(const int x[4], const int dx[4], const int X[4])
   {
     int fullIndex = 0;
     // Horner evaluation from the slowest (3) to the fastest (0) dimension.
     for (int dim = 3; dim >= 0; --dim) {
       const int wrapped = (x[dim] + dx[dim] + X[dim]) % X[dim];
       fullIndex = fullIndex*X[dim] + wrapped;
     }
     return fullIndex/2;
   }


   // Kernel computing two 3x3 matrix contributions per checkerboard index.
   //
   // Each thread with index < arg.volumeCB handles two sites that share that
   // checkerboard index: x (parity arg.parity) and y (parity 1-arg.parity).
   // For each site it sums products of four gauge links taken around the
   // mu-nu plaquette in the +nu direction ("upper") and in the -nu direction
   // ("lower"), with an Oprod matrix inserted at different positions in the
   // product. The x-based sums go to thisForce, the y-based sums to otherForce.
   // Both are scaled by arg.coeff and handed to appendMatrixToArray.
   //
   // Template parameters:
   //   Cmplx       - complex element type of the device buffers
   //   isConjugate - when true every loaded Oprod is replaced by
   //                 Oprod - conj(Oprod) before it is used
   //
   // Launch layout: the thread index is derived from the x dimension only;
   // threads whose index is >= arg.volumeCB return immediately. No shared
   // memory is declared in this kernel.
   //
   // NOTE(review): arg.mu and arg.nu index 4-element local arrays, so they must
   // lie in [0,3]; the parity bookkeeping below also assumes mu != nu.
   template<typename Cmplx, bool isConjugate>

     __global__ void

     cloverDerivativeKernel(const CloverDerivArg<Cmplx> arg)

     {

       // Flat global thread index, used as the checkerboard site index.
       int index = threadIdx.x + blockIdx.x*blockDim.x;


       // Surplus threads beyond the half-lattice volume do nothing.
       if(index >= arg.volumeCB) return;


       // x: coordinates of the site with parity arg.parity;
       // y: coordinates of the opposite-parity site with the same checkerboard index.
       int x[4];

       int y[4];

       int otherparity = (1-arg.parity);

       getCoords(x, index, arg.X, arg.parity);

       getCoords(y, index, arg.X, otherparity);

       // Local copy of the extents, enlarged below.
       int X[4];

       for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];


       // Shift the coordinates by arg.border and grow each extent by twice the
       // border, so linkIndex addresses a lattice padded by arg.border sites on
       // each side. Presumably gauge and oprod are stored with that padding -
       // TODO confirm against the caller.
       for(int dir=0; dir<4; ++dir){

         x[dir] += arg.border[dir];

         y[dir] += arg.border[dir];

         X[dir] += 2*arg.border[dir];

       }


       // Parity halves of the gauge buffer (gaugeOffset elements apart).
       Cmplx* thisGauge = arg.gauge + arg.parity*arg.gaugeOffset;

       Cmplx* otherGauge = arg.gauge + (otherparity)*arg.gaugeOffset;


       // Only the arg.parity half of oprod is read. With mu != nu and an even
       // border, every Oprod site addressed below (x, x+mu+nu, x+mu-nu, y+mu,
       // y+nu, y-nu) has parity arg.parity.
       Cmplx* thisOprod = arg.oprod + arg.parity*arg.oprodOffset;


       const int& mu = arg.mu;

       const int& nu = arg.nu;


       // Accumulators; each is first assigned (not added to) in its "upper" block.
       Matrix<Cmplx,3> thisForce;

       Matrix<Cmplx,3> otherForce;


       // Upper plaquette based at x (contributes to thisForce)
       // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)

       {

         // Displacement passed to linkIndex; every ++/-- below is undone after use.
         int d[4] = {0, 0, 0, 0};


         // load U(x)_(+mu)

         Matrix<Cmplx,3> U1;

         loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X),

             arg.gaugeStride, &U1);


         // load U(x+mu)_(+nu)

         Matrix<Cmplx,3> U2;

         d[mu]++;

         loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X),

             arg.gaugeStride, &U2);

         d[mu]--;


         // load U(x+nu)_(+mu)

         Matrix<Cmplx,3> U3;

         d[nu]++;

         loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),

             arg.gaugeStride, &U3);

         d[nu]--;


         // load U(x)_(+nu)

         Matrix<Cmplx,3> U4;

         loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),

             arg.gaugeStride, &U4);


         // load Oprod(x)

         Matrix<Cmplx,3> Oprod1;

         loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);


         if(isConjugate) Oprod1 -= conj(Oprod1);

         // Oprod inserted after the closed loop (at x).
         thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1;


         // load Oprod(x+mu+nu)
         Matrix<Cmplx,3> Oprod2;

         d[mu]++; d[nu]++;

         loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod2);

         d[mu]--; d[nu]--;


         if(isConjugate) Oprod2 -= conj(Oprod2);


         // Oprod inserted between U2 and conj(U3) (at x+mu+nu).
         thisForce += U1*U2*Oprod2*conj(U3)*conj(U4);


       }


       // Upper plaquette based at y (contributes to otherForce). Gauge parities
       // are swapped relative to the block above because y has the other parity.
       {

         int d[4] = {0, 0, 0, 0};

         // load U(y)_(+mu)

         Matrix<Cmplx,3> U1;

         loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),

             arg.gaugeStride, &U1);


         // load U(y+mu)_(+nu)

         Matrix<Cmplx,3> U2;

         d[mu]++;

         loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),

             arg.gaugeStride, &U2);

         d[mu]--;


         // load U(y+nu)_(+mu)

         Matrix<Cmplx,3> U3;

         d[nu]++;

         loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),

             arg.gaugeStride, &U3);

         d[nu]--;


         // load U(y)_(+nu)

         Matrix<Cmplx,3> U4;

         loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),

             arg.gaugeStride, &U4);


         // load Oprod(y+nu): the site has the opposite parity to y, i.e. arg.parity

         Matrix<Cmplx,3> Oprod3;

         d[nu]++;

         loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod3);

         d[nu]--;


         if(isConjugate) Oprod3 -= conj(Oprod3);

         // Oprod inserted between conj(U3) and conj(U4) (at y+nu).
         otherForce = U1*U2*conj(U3)*Oprod3*conj(U4);


         // load Oprod(y+mu)

         Matrix<Cmplx, 3> Oprod4;

         d[mu]++;

         loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod4);

         d[mu]--;


         if(isConjugate) Oprod4 -= conj(Oprod4);


         // Oprod inserted between U1 and U2 (at y+mu).
         otherForce += U1*Oprod4*U2*conj(U3)*conj(U4);

       }


       // Lower leaf based at y (subtracted from otherForce)

       // U[nu*](y-nu) U[mu](y-nu) U[nu](y+mu-nu) Oprod(y+mu) U[*mu](y)

       {

         int d[4] = {0, 0, 0, 0};

         // load U(y-nu)(+nu)

         Matrix<Cmplx,3> U1;

         d[nu]--;

         loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),

             arg.gaugeStride, &U1);

         d[nu]++;


         // load U(y-nu)(+mu)

         Matrix<Cmplx, 3> U2;

         d[nu]--;

         loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),

             arg.gaugeStride, &U2);

         d[nu]++;


         // load U(y+mu-nu)(nu)

         Matrix<Cmplx, 3> U3;

         d[mu]++; d[nu]--;

         loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),

             arg.gaugeStride, &U3);

         d[mu]--; d[nu]++;


         // load U(y)_(+mu)

         Matrix<Cmplx,3> U4;

         loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),

             arg.gaugeStride, &U4);


         // load Oprod(y+mu)

         Matrix<Cmplx, 3> Oprod1;

         d[mu]++;

         loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod1);

         d[mu]--;


         if(isConjugate) Oprod1 -= conj(Oprod1);


         // Oprod inserted between U3 and conj(U4) (at y+mu).
         otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4);


         // load Oprod(y-nu)
         Matrix<Cmplx,3> Oprod2;

         d[nu]--;

         loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod2);

         d[nu]++;


         if(isConjugate) Oprod2 -= conj(Oprod2);

         // Oprod inserted between conj(U1) and U2 (at y-nu).
         otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4);

       }


       // Lower leaf based at x (subtracted from thisForce)
       {

         int d[4] = {0, 0, 0, 0};

         // load U(x-nu)(+nu)

         Matrix<Cmplx,3> U1;

         d[nu]--;

         loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X),

             arg.gaugeStride, &U1);

         d[nu]++;


         // load U(x-nu)(+mu)

         Matrix<Cmplx, 3> U2;

         d[nu]--;

         loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),

             arg.gaugeStride, &U2);

         d[nu]++;


         // load U(x+mu-nu)(nu)

         Matrix<Cmplx, 3> U3;

         d[mu]++; d[nu]--;

         loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),

             arg.gaugeStride, &U3);

         d[mu]--; d[nu]++;


         // load U(x)_(+mu)

         Matrix<Cmplx,3> U4;

         loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X),

             arg.gaugeStride, &U4);


         // load Oprod(x+mu-nu)
         Matrix<Cmplx,3> Oprod1;

         d[mu]++; d[nu]--;

         loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);

         d[mu]--; d[nu]++;


         if(isConjugate) Oprod1 -= conj(Oprod1);

         // Oprod inserted between U2 and U3 (at x+mu-nu).
         thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4);


         // load Oprod(x)
         Matrix<Cmplx, 3> Oprod4;

         loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod4);


         if(isConjugate) Oprod4 -= conj(Oprod4);

         // Oprod inserted in front of the loop (at x).
         thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4);

       }


       // Apply the overall coefficient to both results.
       thisForce *= arg.coeff;

       otherForce *= arg.coeff;


       // Write to array: thisForce goes to the arg.parity half of the force
       // buffer and otherForce to the other half, both at the same checkerboard
       // index. Presumably appendMatrixToArray adds to the existing contents
       // rather than overwriting - confirm in quda_matrix.h.

       {

         appendMatrixToArray(thisForce, index, arg.forceStride, arg.force + arg.parity*arg.forceOffset);

         appendMatrixToArray(otherForce, index, arg.forceStride, arg.force + otherparity*arg.forceOffset);

       }

       return;

     } // cloverDerivativeKernel


   // Tunable wrapper around cloverDerivativeKernel.
   //
   // Holds a copy of the kernel arguments plus a reference to a GaugeField
   // (`meta`) that supplies the volume string and geometry for the tune key.
   // The referenced field must outlive this object.
   template<typename Complex>
   class CloverDerivative : public Tunable {

   private:
     CloverDerivArg<Complex> arg;
     const GaugeField &meta;

     // The kernel declares no shared memory.
     unsigned int sharedBytesPerThread() const { return 0; }
     unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

     // One thread per checkerboard site.
     unsigned int minThreads() const { return arg.volumeCB; }
     bool tuneGridDim() const { return false; }

   public:
     // Builds the auxiliary tune string. The format previously had four
     // conversions but only three arguments (undefined behaviour), and fed the
     // parity offset into the "stride" slot; every conversion now has a matching,
     // explicitly typed argument.
     CloverDerivative(const CloverDerivArg<Complex> &arg, const GaugeField &meta)
       : arg(arg), meta(meta) {
       writeAuxString("threads=%d,prec=%lu,stride=%d,geometry=%d",
                      arg.volumeCB,
                      static_cast<unsigned long>(sizeof(Complex)/2),
                      arg.forceStride,
                      static_cast<int>(meta.Geometry()));
     }

     virtual ~CloverDerivative() {}

     // Launches the kernel with the tuned configuration on `stream`.
     // (The stream argument used to be ignored, so the launch always went to
     // the default stream.)
     void apply(const cudaStream_t &stream){
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if(arg.conjugate){
         cloverDerivativeKernel<Complex,true><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       }else{
         cloverDerivativeKernel<Complex,false><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       }
     } // apply

     // No state is saved or restored around tuning runs.
     void preTune(){}
     void postTune(){}

     // Performance counters are not modelled for this kernel.
     long long flops() const {
       return 0;
     }

     long long bytes() const { return 0; }

     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
   };


   // Precision-specific launcher for cloverDerivativeKernel.
   //
   // Float selects the real type; the device buffers are viewed as
   // ComplexTypeId<Float>::Type. `parity` is the device parity (0 or 1) and a
   // non-zero `conjugate` selects the isConjugate=true kernel instantiation.
   // The kernel is launched on the default stream with a fixed block of 128
   // threads and one thread per checkerboard site.
   //
   // FIXME - the Tunable class (CloverDerivative) isn't used here; the launch
   // geometry is hard-coded.
   template<typename Float>
     void cloverDerivative(cudaGaugeField &out,
         cudaGaugeField& gauge,
         cudaGaugeField& oprod,
         int mu, int nu, double coeff, int parity,
         int conjugate)
     {
       typedef typename ComplexTypeId<Float>::Type Complex;
       CloverDerivArg<Complex> arg(out, gauge, oprod, mu, nu, coeff, parity, conjugate != 0);

       // A zero-sized grid is an invalid launch configuration; nothing to do.
       if(arg.volumeCB <= 0) return;

       const dim3 blockDim(128, 1, 1);
       const dim3 gridDim((arg.volumeCB + blockDim.x-1)/blockDim.x, 1, 1);  // ceil-div
       if(conjugate){
         cloverDerivativeKernel<Complex,true><<<gridDim,blockDim,0>>>(arg);
       }else{
         cloverDerivativeKernel<Complex,false><<<gridDim,blockDim,0>>>(arg);
       }

       // Kernel launches do not return a status; pick up launch-configuration
       // errors here instead of letting them surface at an unrelated later call.
       const cudaError_t launchStatus = cudaGetLastError();
       if(launchStatus != cudaSuccess){
         errorQuda("cloverDerivativeKernel launch failed: %s", cudaGetErrorString(launchStatus));
       }
     }


 #endif


   // Public entry point: dispatches on the precision of `out` to the
   // templated launcher.
   //
   //   out       - force field the kernel appends to (scalar geometry)
   //   gauge     - gauge links read by the kernel
   //   oprod     - matrix field read by the kernel (scalar geometry)
   //   mu, nu    - direction indices; must lie in [0,3] because the kernel
   //               uses them to index 4-element arrays
   //   coeff     - overall multiplicative coefficient
   //   parity    - QUDA_EVEN_PARITY maps to device parity 0, anything else to 1
   //   conjugate - non-zero selects the conjugated kernel variant
   //
   // Calls errorQuda on out-of-range directions, mismatched or unsupported
   // precision, or when the library was built without GPU_CLOVER_DIRAC.
   void cloverDerivative(cudaGaugeField &out,
       cudaGaugeField& gauge,
       cudaGaugeField& oprod,
       int mu, int nu, double coeff, QudaParity parity, int conjugate)
   {
 #ifdef GPU_CLOVER_DIRAC
     assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY);
     assert(out.Geometry() == QUDA_SCALAR_GEOMETRY);

     // The kernel indexes a local int[4] with mu and nu.
     if(mu < 0 || mu > 3 || nu < 0 || nu > 3){
       errorQuda("Invalid directions mu=%d nu=%d (must lie in [0,3])", mu, nu);
     }

     // All three buffers are reinterpreted using the precision of `out`.
     if(gauge.Precision() != out.Precision() || oprod.Precision() != out.Precision()){
       errorQuda("Precision mismatch: out=%d gauge=%d oprod=%d",
                 out.Precision(), gauge.Precision(), oprod.Precision());
     }

     int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;

     if(out.Precision() == QUDA_DOUBLE_PRECISION){
       cloverDerivative<double>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
     } else if (out.Precision() == QUDA_SINGLE_PRECISION){
       cloverDerivative<float>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
     } else {
       errorQuda("Precision %d not supported", out.Precision());
     }
     return;
 #else
     errorQuda("Clover has not been built");
 #endif
   }


 } // namespace quda

quda::linkIndex
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
Definition: ks_force_quda.cu:40

y
int y[4]
Definition: staggered_dslash_core.h:356

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

conj
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
Definition: hisq_force_reference2.cpp:231

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
Definition: hisq_paths_force_core.h:82

quda::Complex
std::complex< double > Complex
Definition: eig_variables.h:13

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:816

quda::cloverDerivative
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
Definition: clover_deriv_quda.cu:369

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:176

quda::index
__device__ __host__ int index(int i, int j)
Definition: quda_matrix.h:342

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271

quda::cudaGaugeField
Definition: gauge_field.h:216

coeff
__constant__ double coeff
Definition: dslash_constants.h:180

QudaParity
enum QudaParity_s QudaParity

x
int x[4]
Definition: hisq_paths_force_core.h:99

tune_quda.h

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

dx
int dx[4]
Definition: hisq_paths_force_core.h:98

quda::loadMatrixFromArray
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< T, N > *mat)
Definition: quda_matrix.h:778

Matrix
Definition: hisq_force_reference2.cpp:131

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:51

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:49

quda::loadLinkVariableFromArray
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
Definition: quda_matrix.h:767

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843

QUDA_EVEN_PARITY
Definition: enum_quda.h:239

quda::appendMatrixToArray
__device__ void appendMatrixToArray(const Matrix< double2, 3 > &mat, const int idx, const int stride, double2 *const array)
Definition: quda_matrix.h:810

QUDA_SCALAR_GEOMETRY
Definition: enum_quda.h:385

quda::GaugeField::Geometry
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:177

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115

getTuning
QudaTune getTuning()
Definition: util_quda.cpp:32

parity
const QudaParity parity
Definition: dslash_test.cpp:29

gauge_field.h

gauge
void * gauge[4]
Definition: su3_test.cpp:15

quda::getCoords
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)
Definition: ks_force_quda.cu:48

quda_internal.h