quda-ref/v0.7.0/clover__quda_8cu_source.html

 #include <quda_internal.h>

 #include <quda_matrix.h>

 #include <tune_quda.h>

 #include <clover_field.h>

 #include <gauge_field.h>

 #include <gauge_field_order.h>


 namespace CloverOrder {

   using namespace quda;

 #include <clover_field_order.h>

 } // CloverOrder


 namespace quda {


 #ifdef GPU_CLOVER_DIRAC


   // Argument bundle handed (by value) to the field-strength and clover kernels.
   //   Float  - real type the field-strength buffer is interpreted with
   //   Clover - accessor type used to store the clover term
   //   Gauge  - accessor type used to load gauge links
   template<typename Float, typename Clover, typename Gauge>
     struct CloverArg {
       int threads;          // number of active threads required (taken from Fmunu.Volume())
       int X[4];             // grid dimensions (copied from Fmunu.X())
 #ifdef MULTI_GPU
       int border[4];        // per-direction border width, fixed to 2 by the constructor
 #endif
       double cloverCoeff;   // clover coefficient, always kept in double

       int FmunuStride;      // stride used on Fmunu field
       int FmunuOffset;      // parity offset

       typename ComplexTypeId<Float>::Type* Fmunu; // field-strength buffer viewed as complex numbers
       Gauge  gauge;         // gauge-link accessor
       Clover clover;        // clover-term accessor

       // Captures the accessors and reads the geometry (volume, stride, byte size,
       // dimensions) from the field-strength GaugeField.
       // Note that the parameter Fmunu hides the member of the same name inside
       // the constructor body, so Fmunu.X() below refers to the GaugeField.
       CloverArg(Clover &clover, Gauge &gauge, GaugeField& Fmunu, double cloverCoeff)
         : threads(Fmunu.Volume()),
         cloverCoeff(cloverCoeff),
         FmunuStride(Fmunu.Stride()),
         FmunuOffset(Fmunu.Bytes()/(4*sizeof(Float))),
         Fmunu(reinterpret_cast<typename ComplexTypeId<Float>::Type*>(Fmunu.Gauge_p())),
         gauge(gauge),
         clover(clover)
       {
         for (int d=0; d<4; d++) {
           X[d] = Fmunu.X()[d];
 #ifdef MULTI_GPU
           border[d] = 2;
 #endif
         }
       }
     };


   // Returns the checkerboard (half-lattice) index of the site x + dx on a
   // lattice of extent X, wrapping each coordinate periodically.
   // The full lexicographic index (x[0] running fastest) is halved on return.
   __device__ __host__ inline int linkIndex(int x[], int dx[], const int X[4]) {
     int lexicographic = 0;
     // Horner evaluation from the slowest (3) to the fastest (0) dimension
     for (int d=3; d>=0; d--) {
       const int wrapped = (x[d] + dx[d] + X[d]) % X[d];
       lexicographic = lexicographic*X[d] + wrapped;
     }
     return lexicographic >> 1;
   }


   // Converts a checkerboard index and parity into full 4-d coordinates.
   //   x        - output coordinates
   //   cb_index - index within one parity of the lattice
   //   X        - lattice extents
   //   parity   - 0 or 1; selects which site of each pair along x[0] is addressed
   __device__ __host__ inline void getCoords(int x[4], int cb_index, const int X[4], int parity)
   {
     // number of checkerboard sites in a row, a plane and a 3-d slice
     const int rowSites   = X[0]/2;
     const int planeSites = X[1]*X[0]/2;
     const int sliceSites = X[2]*X[1]*X[0]/2;

     x[3] = cb_index/sliceSites;
     x[2] = (cb_index/planeSites) % X[2];
     x[1] = (cb_index/rowSites) % X[1];

     // the low bit of x[0] is fixed by the parity of the remaining coordinates
     const int oddBit = (x[3]+x[2]+x[1]+parity)&1;
     x[0] = 2*(cb_index%rowSites) + oddBit;
   }


   // Computes the six independent field-strength components for one lattice site.
   //
   // For every pair nu < mu the routine multiplies the links around the four
   // plaquettes in the mu-nu plane that touch the site (the path of each one is
   // spelled out in the comment opening its block), sums the four products,
   // subtracts the conj() of the sum, scales by 1/8 and writes the result to
   // arg.Fmunu at component (mu*(mu-1))/2 + nu.
   //
   //   arg - kernel arguments (gauge accessor, Fmunu buffer, geometry)
   //   idx - site index in [0, arg.threads); values in the upper half of the
   //         range address parity 1, the lower half parity 0
   //
   // NOTE(review): conj() is written U[dagger] in the path comments below, so it
   // is presumably the Hermitian conjugate -- confirm against quda_matrix.h.
   template <typename Float, typename Clover, typename GaugeOrder>

     __host__ __device__ void computeFmunuCore(CloverArg<Float,Clover,GaugeOrder>& arg, int idx) {


       // compute spacetime dimensions and parity

       int parity = 0;

       if(idx >= arg.threads/2){

         parity = 1;

         idx -= arg.threads/2;

       }


       // local copy of the lattice extents (enlarged below in multi-GPU builds)
       int X[4];

       for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];


       int x[4];

       getCoords(x, idx, X, parity);

 #ifdef MULTI_GPU

       // shift the site by the border width and enlarge the extents by two borders,
       // so that linkIndex() below addresses the bordered lattice
       // (presumably the gauge field carries a matching halo -- verify against caller)
       for(int dir=0; dir<4; ++dir){

            x[dir] += arg.border[dir];

            X[dir] += 2*arg.border[dir];

       }

 #endif


       typedef typename ComplexTypeId<Float>::Type Cmplx;


       // loop over the six (mu,nu) pairs with nu < mu
       for (int mu=0; mu<4; mu++) {

         for (int nu=0; nu<mu; nu++) {

           Matrix<Cmplx,3> F;

           setZero(&F);

           { // U(x,mu) U(x+mu,nu) U[dagger](x+nu,mu) U[dagger](x,nu)


             // load U(x)_(+mu)

             Matrix<Cmplx,3> U1;

             int dx[4] = {0, 0, 0, 0};

             arg.gauge.load((Float*)(U1.data),linkIndex(x,dx,X), mu, parity);

             // load U(x+mu)_(+nu); a single hop flips the parity

             Matrix<Cmplx,3> U2;

             dx[mu]++;

             arg.gauge.load((Float*)(U2.data),linkIndex(x,dx,X), nu, 1-parity);

             dx[mu]--;


             Matrix<Cmplx,3> Ftmp = U1 * U2;


             // load U(x+nu)_(+mu)

             Matrix<Cmplx,3> U3;

             dx[nu]++;

             arg.gauge.load((Float*)(U3.data),linkIndex(x,dx,X), mu, 1-parity);

             dx[nu]--;


             Ftmp = Ftmp * conj(U3) ;


             // load U(x)_(+nu)

             Matrix<Cmplx,3> U4;

             arg.gauge.load((Float*)(U4.data),linkIndex(x,dx,X), nu, parity);


             // complete the plaquette

             F = Ftmp * conj(U4);

           }


           { // U(x,nu) U[dagger](x+nu-mu,mu) U[dagger](x-mu,nu) U(x-mu, mu)


             // load U(x)_(+nu)

             Matrix<Cmplx,3> U1;

             int dx[4] = {0, 0, 0, 0};

             arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), nu, parity);


             // load U(x+nu)_(-mu) = U(x+nu-mu)_(+mu); two hops keep the parity

             Matrix<Cmplx,3> U2;

             dx[nu]++;

             dx[mu]--;

             arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), mu, parity);

             dx[mu]++;

             dx[nu]--;


             Matrix<Cmplx,3> Ftmp =  U1 * conj(U2);


             // load U(x-mu)_nu

             Matrix<Cmplx,3> U3;

             dx[mu]--;

             arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), nu, 1-parity);

             dx[mu]++;


             Ftmp =  Ftmp * conj(U3);


             // load U(x)_(-mu) = U(x-mu)_(+mu)

             Matrix<Cmplx,3> U4;

             dx[mu]--;

             arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), mu, 1-parity);

             dx[mu]++;


             // complete the plaquette

             Ftmp = Ftmp * U4;


             // sum this contribution to Fmunu

             F += Ftmp;

           }


           { // U[dagger](x-nu,nu) U(x-nu,mu) U(x+mu-nu,nu) U[dagger](x,mu)


             // load U(x)_(-nu) = U(x-nu)_(+nu)

             Matrix<Cmplx,3> U1;

             int dx[4] = {0, 0, 0, 0};

             dx[nu]--;

             arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), nu, 1-parity);

             dx[nu]++;


             // load U(x-nu)_(+mu)

             Matrix<Cmplx,3> U2;

             dx[nu]--;

             arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), mu, 1-parity);

             dx[nu]++;


             Matrix<Cmplx,3> Ftmp = conj(U1) * U2;


             // load U(x+mu-nu)_(+nu)

             Matrix<Cmplx,3> U3;

             dx[mu]++;

             dx[nu]--;

             arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), nu, parity);

             dx[nu]++;

             dx[mu]--;


             Ftmp = Ftmp * U3;


             // load U(x)_(+mu)

             Matrix<Cmplx,3> U4;

             arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), mu, parity);


             Ftmp = Ftmp * conj(U4);


             // sum this contribution to Fmunu

             F += Ftmp;

           }


           { // U[dagger](x-mu,mu) U[dagger](x-mu-nu,nu) U(x-mu-nu,mu) U(x-nu,nu)


             // load U(x)_(-mu) = U(x-mu)_(+mu)

             Matrix<Cmplx,3> U1;

             int dx[4] = {0, 0, 0, 0};

             dx[mu]--;

             arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), mu, 1-parity);

             dx[mu]++;


             // load U(x-mu)_(-nu) = U(x-mu-nu)_(+nu)

             Matrix<Cmplx,3> U2;

             dx[mu]--;

             dx[nu]--;

             arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), nu, parity);

             dx[nu]++;

             dx[mu]++;


             Matrix<Cmplx,3> Ftmp = conj(U1) * conj(U2);


             // load U(x-mu-nu)_(+mu)

             Matrix<Cmplx,3> U3;

             dx[mu]--;

             dx[nu]--;

             arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), mu, parity);

             dx[nu]++;

             dx[mu]++;


             Ftmp = Ftmp * U3;


             // load U(x)_(-nu) = U(x-nu)_(+nu)

             Matrix<Cmplx,3> U4;

             dx[nu]--;

             arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), nu, 1-parity);

             dx[nu]++;


             // complete the plaquette

             Ftmp = Ftmp * U4;


             // sum this contribution to Fmunu

             F += Ftmp;


           }

           // 3 matrix additions, 12 matrix-matrix multiplications, 8 matrix conjugations

           // Each matrix conjugation involves 9 unary minus operations but these are not included in the operation count

           // Each matrix addition involves 18 real additions

           // Each matrix-matrix multiplication involves 9*3 complex multiplications and 9*2 complex additions

           // = 9*3*6 + 9*2*2 = 198 floating-point ops

           // => Total number of floating point ops per site above is

           // 3*18 + 12*198 =  54 + 2376 = 2430


           // subtract the conjugate of the four-plaquette sum and scale by 1/8
           {

             F -= conj(F); // 18 real subtractions + one matrix conjugation

             F *= 1.0/8.0; // 18 real multiplications

             // 36 floating point operations here

           }


           // select the half of the output buffer that belongs to this parity
           Cmplx* thisFmunu = arg.Fmunu + parity*arg.FmunuOffset;

           int munu_idx = (mu*(mu-1))/2 + nu; // lower-triangular indexing


           writeLinkVariableToArray(F, munu_idx, idx, arg.FmunuStride, thisFmunu);

         } // nu < mu

       } // mu

       // Components are stored in the order
       // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]

       return;

     }


   // GPU entry point for the field-strength computation: one thread per site.
   // Expects a 1-d launch covering at least arg.threads threads; surplus
   // threads in the last block do nothing.
   template<typename Float, typename Clover, typename Gauge>
     __global__ void computeFmunuKernel(CloverArg<Float,Clover,Gauge> arg){
       const int site = threadIdx.x + blockIdx.x*blockDim.x;
       if (site < arg.threads) {
         computeFmunuCore<Float,Clover,Gauge>(arg, site);
       }
     }


   // Host-side field-strength computation.
   // Reports through errorQuda() that the CPU path is not supported before the
   // site loop is reached.
   template<typename Float, typename Clover, typename Gauge>
     void computeFmunuCPU(CloverArg<Float,Clover,Gauge>& arg){
       errorQuda("computeFmunuCPU not yet supported\n");

       int site = 0;
       while (site < arg.threads) {
         computeFmunuCore(arg, site);
         site++;
       }
     }


   // Tunable launcher for the field-strength computation.
   // Holds a copy of the kernel arguments and dispatches either to the
   // (autotuned) CUDA kernel or to the host routine, depending on location.
   template<typename Float, typename Clover, typename Gauge>
     class FmunuCompute : Tunable {
       CloverArg<Float,Clover,Gauge> arg;  // kernel arguments (copied)
       const GaugeField &meta;             // only used to build the tuning key
       const QudaFieldLocation location;   // where the computation runs

       private:
       // the kernel uses no shared memory
       unsigned int sharedBytesPerThread() const { return 0; }
       unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

       bool tuneSharedBytes() const { return false; } // Don't tune shared memory
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.threads; }

       public:
       FmunuCompute(CloverArg<Float,Clover,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
         : arg(arg), meta(meta), location(location) {
         writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(Float));
       }

       virtual ~FmunuCompute() {}

       // Runs the computation. On the GPU the kernel is enqueued on the given
       // stream (previously the argument was ignored and the default stream
       // was always used; callers passing 0 see no change).
       void apply(const cudaStream_t &stream){
         if(location == QUDA_CUDA_FIELD_LOCATION){
 #if (__COMPUTE_CAPABILITY__ >= 200)
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
           computeFmunuKernel<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
 #else
           errorQuda("computeFmunuKernel not supported on pre-Fermi architecture");
 #endif
         }else{
           computeFmunuCPU(arg);
         }
       }

       TuneKey tuneKey() const {
         return TuneKey(meta.VolString(), typeid(*this).name(), aux);
       }

       // Human-readable launch parameters; fields are separated by ", " to
       // match CloverCompute::paramString.
       std::string paramString(const TuneParam &param) const {
         std::stringstream ps;
         ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
         ps << "shared=" << param.shared_bytes;
         return ps.str();
       }

       void preTune(){}
       void postTune(){}

       // The thread count is widened to long long before multiplying: the
       // products exceed the int range for lattice volumes of a few million sites.
       long long flops() const {
         return (2430 + 36)*6*static_cast<long long>(arg.threads);
       }

       //  Ignores link reconstruction
       long long bytes() const {
         return (4*4*18 + 18)*6*static_cast<long long>(arg.threads)*static_cast<long long>(sizeof(Float));
       }

     }; // FmunuCompute


   // Put into clover order

   // Upper-left block (chirality index 0)

   //     /                                                                                \

   //     |  1 + c*I*(F[0,1] - F[2,3]) ,     c*I*(F[1,2] - F[0,3]) + c*(F[0,2] + F[1,3])   |

   //     |                                                                                |

   //     |  c*I*(F[1,2] - F[0,3]) - c*(F[0,2] + F[1,3]),   1 - c*I*(F[0,1] - F[2,3])      |

   //     |                                                                                |

   //     \                                                                                /


   //     /

   //     | 1 - c*I*(F[0] - F[5]),   -c*I*(F[2] - F[3]) - c*(F[1] + F[4])

   //     |

   //     |  -c*I*(F[2] -F[3]) + c*(F[1] + F[4]),   1 + c*I*(F[0] - F[5])

   //     |

   //     \

   //

   // Lower-right block (chirality index 1)

   //

   //     /                                                                  \

   //     |  1 - c*I*(F[0] + F[5]),  -c*I*(F[2] + F[3]) - c*(F[1] - F[4])    |

   //     |                                                                  |

   //     |  -c*I*(F[2]+F[3]) + c*(F[1]-F[4]),     1 + c*I*(F[0] + F[5])     |

   //     \                                                                  /

   //


   // Core routine for constructing clover term from field strength

   // Builds the packed clover term for one site from the six stored
   // field-strength components and hands it to arg.clover.save().
   //   arg - kernel arguments (Fmunu buffer, clover accessor, coefficient)
   //   idx - site index in [0, arg.threads); the upper half of the range is parity 1
   // The 72 output reals hold two chiral blocks of 36 values each:
   // 6 diagonal entries followed by 15 complex off-diagonal entries.
   template<typename Float, typename Clover, typename Gauge>
     __device__ __host__
     void cloverComputeCore(CloverArg<Float,Clover,Gauge>& arg, int idx){

       typedef typename ComplexTypeId<Float>::Type Cmplx;

       // split the flat index into parity and checkerboard index
       const int halfThreads = arg.threads/2;
       const int parity = (idx >= halfThreads) ? 1 : 0;
       idx -= parity*halfThreads;

       // Load the field-strength tensor from global memory
       Cmplx* const fmunuBase = arg.Fmunu + parity*arg.FmunuOffset;
       Matrix<Cmplx,3> F[6];
       for(int comp=0; comp<6; ++comp){
         loadLinkVariableFromArray(fmunuBase, comp, idx, arg.FmunuStride, &F[comp]);
       }

       Cmplx imagUnit;  imagUnit.x = 0;  imagUnit.y = 1.0;
       Cmplx imagCoeff; imagCoeff.x = 0; imagCoeff.y = arg.cloverCoeff;

       // one pair of 3x3 colour matrices per chirality
       Matrix<Cmplx,3> diagBlock[2];
       Matrix<Cmplx,3> offBlock[2];
       diagBlock[0] = imagCoeff*(F[0]-F[5]); // (18 + 6*9=) 72 floating-point ops
       diagBlock[1] = imagCoeff*(F[0]+F[5]); // 72 floating-point ops
       offBlock[0]  = arg.cloverCoeff*(F[1]+F[4] - imagUnit*(F[2]-F[3])); // 126 floating-point ops
       offBlock[1]  = arg.cloverCoeff*(F[1]-F[4] - imagUnit*(F[2]+F[3])); // 126 floating-point ops

       // maps the packed position to the row-ordered triangle entry
       const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};

       Float diagonal[6];
       Cmplx lower[15];
       Float packed[72];

       // This uses lots of unnecessary memory
       for(int ch=0; ch<2; ++ch){
         // ch = 0(1) => positive(negative) chiral block

         // real diagonal elements
         for(int c=0; c<3; ++c){
           diagonal[c]   = 1.0 - diagBlock[ch](c,c).x;
           diagonal[c+3] = 1.0 + diagBlock[ch](c,c).x;
         }

         // strictly lower triangle, row by row
         int t = 0;

         // rows inside the first 3x3 block: negated entries of diagBlock
         for(int r=1; r<3; ++r){
           for(int c=0; c<r; ++c) lower[t++] = - diagBlock[ch](r,c);
         }

         // remaining rows: a full row of offBlock followed by the
         // lower-triangular part of diagBlock
         for(int r=0; r<3; ++r){
           for(int c=0; c<3; ++c) lower[t++] = offBlock[ch](r,c);
           for(int c=0; c<r; ++c) lower[t++] = diagBlock[ch](r,c);
         }

         // pack with the overall factor of one half
         const int base = ch*36;
         for(int i=0; i<6; ++i){
           packed[base + i] = 0.5*diagonal[i];
         }
         for(int i=0; i<15; ++i){
           packed[base+6+2*i]     = 0.5*lower[idtab[i]].x;
           packed[base+6+2*i + 1] = 0.5*lower[idtab[i]].y;
         }
       } // ch
       // 84 floating-point ops

       arg.clover.save(packed, idx, parity);
     }


   // GPU entry point for the clover construction: one thread per site.
   // Expects a 1-d launch covering at least arg.threads threads; surplus
   // threads in the last block do nothing.
   template<typename Float, typename Clover, typename Gauge>
     __global__
     void cloverComputeKernel(CloverArg<Float,Clover,Gauge> arg){
       const int site = threadIdx.x + blockIdx.x*blockDim.x;
       if (site < arg.threads) {
         cloverComputeCore(arg, site);
       }
     }


   // Host-side clover construction: visits every site in index order.
   template<typename Float, typename Clover, typename Gauge>
     void cloverComputeCPU(CloverArg<Float,Clover,Gauge> arg){
       int site = 0;
       while (site < arg.threads) {
         cloverComputeCore(arg, site);
         ++site;
       }
     }


   // Tunable launcher that turns the stored field strength into the clover term.
   // Holds a copy of the kernel arguments and dispatches either to the
   // (autotuned) CUDA kernel or to the host routine, depending on location.
   template<typename Float, typename Clover, typename Gauge>
     class CloverCompute : Tunable {
       CloverArg<Float, Clover, Gauge> arg;  // kernel arguments (copied)
       const GaugeField &meta;               // only used to build the tuning key
       const QudaFieldLocation location;     // where the computation runs

       private:
       // the kernel uses no shared memory
       unsigned int sharedBytesPerThread() const { return 0; }
       unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

       bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.threads; }

       public:
       CloverCompute(CloverArg<Float,Clover,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
         : arg(arg), meta(meta), location(location) {
         writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(Float));
       }

       virtual ~CloverCompute() {}

       // Runs the computation. On the GPU the kernel is enqueued on the given
       // stream (previously the argument was ignored and the default stream
       // was always used; callers passing 0 see no change).
       void apply(const cudaStream_t &stream) {
         if(location == QUDA_CUDA_FIELD_LOCATION){
 #if (__COMPUTE_CAPABILITY__ >= 200)
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
           cloverComputeKernel<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
 #else
           errorQuda("cloverComputeKernel not supported on pre-Fermi architecture");
 #endif
         }else{ // run the CPU code
           cloverComputeCPU(arg);
         }
       }

       TuneKey tuneKey() const {
         return TuneKey(meta.VolString(), typeid(*this).name(), aux);
       }

       std::string paramString(const TuneParam &param) const { // Don't print the grid dim.
         std::stringstream ps;
         ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
         ps << "shared=" << param.shared_bytes;
         return ps.str();
       }

       void preTune(){}
       void postTune(){}

       // The thread count is widened to long long before multiplying so the
       // products cannot overflow int on large lattices.
       long long flops() const {
         return 480*static_cast<long long>(arg.threads);
       }

       long long bytes() const {
         return static_cast<long long>(arg.threads)*(6*18 + 72)*static_cast<long long>(sizeof(Float));
       }
     };


   // Runs both stages of the clover construction for fixed accessor types:
   // first the field strength is written into Fmunu, then the clover term is
   // assembled from it. Both stages are issued with stream argument 0 and the
   // device is synchronized before returning.
   template<typename Float,typename Clover,typename Gauge>
     void computeClover(Clover clover, Gauge gauge, GaugeField& Fmunu, Float cloverCoeff, QudaFieldLocation location){
       // one argument bundle is shared by both stages
       CloverArg<Float,Clover,Gauge> arg(clover, gauge, Fmunu, cloverCoeff);

       // stage 1: gauge links -> field strength
       FmunuCompute<Float,Clover,Gauge> fieldStrengthStage(arg, Fmunu, location);
       fieldStrengthStage.apply(0);

       // stage 2: field strength -> clover term
       CloverCompute<Float,Clover,Gauge> cloverStage(arg, Fmunu, location);
       cloverStage.apply(0);

       cudaDeviceSynchronize();
     }


   // Precision-specific driver: allocates a temporary tensor-geometry field for
   // the field strength at the requested location, selects the accessor types
   // that match the clover order, gauge order and gauge reconstruction, and
   // runs the computation.
   //   clover      - destination clover field
   //   gauge       - source gauge field
   //   cloverCoeff - clover coefficient
   //   location    - QUDA_CPU_FIELD_LOCATION or QUDA_CUDA_FIELD_LOCATION
   // Every unsupported combination is reported through errorQuda(); previously
   // an unrecognised clover or gauge order fell through silently and left the
   // clover field uncomputed.
   template<typename Float>
     void computeClover(CloverField &clover, const GaugeField& gauge, Float cloverCoeff, QudaFieldLocation location){
       int pad = 0;
       GaugeFieldParam tensorParam(clover.X(), clover.Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY);
       tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;

       // temporary storage for the field strength
       GaugeField* Fmunu = NULL;
       if(location == QUDA_CPU_FIELD_LOCATION){
         Fmunu = new cpuGaugeField(tensorParam);
       } else if (location == QUDA_CUDA_FIELD_LOCATION){
         Fmunu = new cudaGaugeField(tensorParam);
       } else {
         errorQuda("Invalid location\n");
       }

       // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
       // Need to fix this!!

       if(clover.Order() == QUDA_FLOAT2_CLOVER_ORDER){
         if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float, 18, 2, 18>(gauge), *Fmunu, cloverCoeff, location);
           }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float, 18, 2, 12>(gauge),  *Fmunu, cloverCoeff, location);
           }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_8){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float, 18, 2, 8>(gauge),  *Fmunu, cloverCoeff, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float,18,4,12>(gauge),  *Fmunu, cloverCoeff, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else{
           errorQuda("Gauge order %d not supported",gauge.Order());
         } // gauge order
       }else if(clover.Order() == QUDA_FLOAT4_CLOVER_ORDER){
         if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,2,18>(gauge),  *Fmunu, cloverCoeff, location);
           }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,2,12>(gauge),  *Fmunu, cloverCoeff, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,4,12>(gauge), *Fmunu, cloverCoeff, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else{
           errorQuda("Gauge order %d not supported",gauge.Order());
         } // gauge order
       }else{
         errorQuda("Clover order %d not supported",clover.Order());
       } // clover order

       // deleting a null pointer is a no-op, so no guard is needed
       delete Fmunu;
     }


 #endif


   // Public entry point: computes the clover term of `clover` from `gauge`.
   // Dispatches on the precision of the clover field; only single and double
   // precision are handled. Without GPU_CLOVER_DIRAC the routine only reports
   // that clover support was not built.
   void computeClover(CloverField &clover, const GaugeField& gauge, double cloverCoeff, QudaFieldLocation location){

 #ifdef GPU_CLOVER_DIRAC
     // half precision gets its own, more specific message
     if(clover.Precision() == QUDA_HALF_PRECISION){
       errorQuda("Half precision not supported\n");
     }

     switch (clover.Precision()) {
       case QUDA_SINGLE_PRECISION:
         computeClover<float>(clover, gauge, cloverCoeff, location);
         break;
       case QUDA_DOUBLE_PRECISION:
         computeClover<double>(clover, gauge, cloverCoeff, location);
         break;
       default:
         errorQuda("Precision %d not supported", clover.Precision());
         break;
     }
     return;
 #else
     errorQuda("Clover has not been built");
 #endif

   }


 } // namespace quda


QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:55

quda::TuneParam
Definition: tune_quda.h:16

quda::linkIndex
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
Definition: ks_force_quda.cu:40

quda::setZero
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:640

y
int y[4]
Definition: staggered_dslash_core.h:356

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

conj
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
Definition: hisq_force_reference2.cpp:231

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:162

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:271

QUDA_HALF_PRECISION
Definition: enum_quda.h:48

quda::CloverField
Definition: clover_field.h:32

mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
Definition: hisq_paths_force_core.h:82

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:30

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:816

threads
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099

testing::internal::string
::std::string string
Definition: gtest.h:1979

QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:277

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:21

clover_field_order.h

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:20

quda::LatticeFieldParam::siteSubset
QudaSiteSubset siteSubset
Definition: lattice_field.h:42

quda::ComplexTypeId
Definition: quda_matrix.h:19

param
QudaGaugeParam param
Definition: pack_test.cpp:17

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:176

quda::GaugeFieldParam
Definition: gauge_field.h:10

QUDA_FLOAT2_CLOVER_ORDER
Definition: enum_quda.h:206

quda::Tunable
Definition: tune_quda.h:40

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:56

location
const QudaFieldLocation location
Definition: pack_test.cpp:46

testing::internal::Float
FloatingPoint< float > Float
Definition: gtest.h:7350

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:19

quda::CloverField::Order
QudaCloverFieldOrder Order() const
Definition: clover_field.h:66

quda::cpuGaugeField
Definition: gauge_field.h:278

quda::Matrix::data
T data[N *N]
Definition: quda_matrix.h:351

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168

quda::FloatNOrder
Definition: clover_field_order.h:56

quda::cudaGaugeField
Definition: gauge_field.h:216

coeff
__constant__ double coeff
Definition: dslash_constants.h:180

gauge_field_order.h

QUDA_FLOAT4_CLOVER_ORDER
Definition: enum_quda.h:207

clover_field.h

x
int x[4]
Definition: hisq_paths_force_core.h:99

tune_quda.h

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:57

dx
int dx[4]
Definition: hisq_paths_force_core.h:98

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:342

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:49

QUDA_FLOAT4_GAUGE_ORDER
Definition: enum_quda.h:31

quda::loadLinkVariableFromArray
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
Definition: quda_matrix.h:767

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843

quda::writeLinkVariableToArray
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
Definition: quda_matrix.h:830

getTuning
QudaTune getTuning()
Definition: util_quda.cpp:32

getCoords
getCoords(x, sid, kparam.D, oddBit)

quda::computeClover
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:602

QUDA_TENSOR_GEOMETRY
Definition: enum_quda.h:387

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:270

parity
const QudaParity parity
Definition: dslash_test.cpp:29

gauge_field.h

gauge
void * gauge[4]
Definition: su3_test.cpp:15

quda::Matrix
Definition: quda_matrix.h:348

quda::TuneKey
Definition: tune_key.h:8

quda_internal.h

quda::GaugeField
Definition: gauge_field.h:118