quda-ref/v0.7.0/dslash__util_8h_source.html

 #ifndef _DSLASH_UTIL_H

 #define _DSLASH_UTIL_H


 #include <test_util.h>


 template <typename Float>

 static inline void sum(Float *dst, Float *a, Float *b, int cnt) {

   for (int i = 0; i < cnt; i++)

     dst[i] = a[i] + b[i];

 }


 template <typename Float>

 static inline void sub(Float *dst, Float *a, Float *b, int cnt) {

   for (int i = 0; i < cnt; i++)

     dst[i] = a[i] - b[i];

 }


 template <typename Float>

 static inline void ax(Float *dst, Float a, Float *x, int cnt) {

   for (int i = 0; i < cnt; i++)

     dst[i] = a * x[i];

 }


 // performs the operation y[i] = x[i] + a*y[i]

 template <typename Float>

 static inline void xpay(Float *x, Float a, Float *y, int len) {

   for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];

 }


 // performs the operation y[i] = a*x[i] + y[i]

 template <typename Float>

 static inline void axpy(Float a, Float *x, Float *y, int len) {

   for (int i=0; i<len; i++) y[i] = a*x[i] + y[i];

 }


 // performs the operation y[i] = a*x[i] + b*y[i]

 template <typename Float>

 static inline void axpby(Float a, Float *x, Float b, Float *y, int len) {

   for (int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];

 }


 // performs the operation y[i] = a*x[i] - y[i]

 template <typename Float>

 static inline void axmy(Float *x, Float a, Float *y, int len) {

   for (int i=0; i<len; i++) y[i] = a*x[i] - y[i];

 }


 template <typename Float>

 static double norm2(Float *v, int len) {

   double sum=0.0;

   for (int i=0; i<len; i++) sum += v[i]*v[i];

   return sum;

 }


 template <typename Float>

 static inline void negx(Float *x, int len) {

   for (int i=0; i<len; i++) x[i] = -x[i];

 }


 template <typename sFloat, typename gFloat>

 static inline void dot(sFloat* res, gFloat* a, sFloat* b) {

   res[0] = res[1] = 0;

   for (int m = 0; m < 3; m++) {

     sFloat a_re = a[2*m+0];

     sFloat a_im = a[2*m+1];

     sFloat b_re = b[2*m+0];

     sFloat b_im = b[2*m+1];

     res[0] += a_re * b_re - a_im * b_im;

     res[1] += a_re * b_im + a_im * b_re;

   }

 }


 template <typename Float>

 static inline void su3Transpose(Float *res, Float *mat) {

   for (int m = 0; m < 3; m++) {

     for (int n = 0; n < 3; n++) {

       res[m*(3*2) + n*(2) + 0] = + mat[n*(3*2) + m*(2) + 0];

       res[m*(3*2) + n*(2) + 1] = - mat[n*(3*2) + m*(2) + 1];

     }

   }

 }


 template <typename sFloat, typename gFloat>

 static inline void su3Mul(sFloat *res, gFloat *mat, sFloat *vec) {

   for (int n = 0; n < 3; n++) dot(&res[n*(2)], &mat[n*(3*2)], vec);

 }


 template <typename sFloat, typename gFloat>

 static inline void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec) {

   gFloat matT[3*3*2];

   su3Transpose(matT, mat);

   su3Mul(res, matT, vec);

 }


 // i represents a "half index" into an even or odd "half lattice".

 // when oddBit={0,1} the half lattice is {even,odd}.

 //

 // the displacements, such as dx, refer to the full lattice coordinates.

 //

 // neighborIndex() takes a "half index", displaces it, and returns the

 // new "half index", which can be an index into either the even or odd lattices.

 // displacements of magnitude one always interchange odd and even lattices.

 //


 template <typename Float>

 static inline Float *gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance) {

   Float **gaugeField;

   int j;

   int d = nbr_distance;

   if (dir % 2 == 0) {

     j = i;

     gaugeField = (oddBit ? gaugeOdd : gaugeEven);

   }

   else {

     switch (dir) {

     case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -d); break;

     case 3: j = neighborIndex(i, oddBit, 0, 0, -d, 0); break;

     case 5: j = neighborIndex(i, oddBit, 0, -d, 0, 0); break;

     case 7: j = neighborIndex(i, oddBit, -d, 0, 0, 0); break;

     default: j = -1; break;

     }

     gaugeField = (oddBit ? gaugeEven : gaugeOdd);

   }


   return &gaugeField[dir/2][j*(3*3*2)];

 }


 template <typename Float>

 static inline Float *spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)

 {

   int j;

   int nb = neighbor_distance;

   switch (dir) {

   case 0: j = neighborIndex(i, oddBit, 0, 0, 0, +nb); break;

   case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -nb); break;

   case 2: j = neighborIndex(i, oddBit, 0, 0, +nb, 0); break;

   case 3: j = neighborIndex(i, oddBit, 0, 0, -nb, 0); break;

   case 4: j = neighborIndex(i, oddBit, 0, +nb, 0, 0); break;

   case 5: j = neighborIndex(i, oddBit, 0, -nb, 0, 0); break;

   case 6: j = neighborIndex(i, oddBit, +nb, 0, 0, 0); break;

   case 7: j = neighborIndex(i, oddBit, -nb, 0, 0, 0); break;

   default: j = -1; break;

   }


   return &spinorField[j*(mySpinorSiteSize)];

 }


 #ifdef MULTI_GPU


 static inline int

 x4_mg(int i, int oddBit)

 {

   int Y = fullLatticeIndex(i, oddBit);

   int x4 = Y/(Z[2]*Z[1]*Z[0]);

   return x4;

 }


 template <typename Float>

 static inline Float *gaugeLink_mg4dir(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd,

                         Float** ghostGaugeEven, Float** ghostGaugeOdd, int n_ghost_faces, int nbr_distance) {

   Float **gaugeField;

   int j;

   int d = nbr_distance;

   if (dir % 2 == 0) {

     j = i;

     gaugeField = (oddBit ? gaugeOdd : gaugeEven);

   }

   else {


     int Y = fullLatticeIndex(i, oddBit);

     int x4 = Y/(Z[2]*Z[1]*Z[0]);

     int x3 = (Y/(Z[1]*Z[0])) % Z[2];

     int x2 = (Y/Z[0]) % Z[1];

     int x1 = Y % Z[0];

     int X1= Z[0];

     int X2= Z[1];

     int X3= Z[2];

     int X4= Z[3];

     Float* ghostGaugeField;


     switch (dir) {

     case 1:

       { //-X direction

         int new_x1 = (x1 - d + X1 )% X1;

         if (x1 -d < 0){

           ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);

           int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;

           return &ghostGaugeField[offset*(3*3*2)];

         }

         j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;

         break;

       }

     case 3:

       { //-Y direction

         int new_x2 = (x2 - d + X2 )% X2;

         if (x2 -d < 0){

           ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);

           int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;

           return &ghostGaugeField[offset*(3*3*2)];

         }

         j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;

         break;


       }

     case 5:

       { //-Z direction

         int new_x3 = (x3 - d + X3 )% X3;

         if (x3 -d < 0){

           ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);

           int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;

           return &ghostGaugeField[offset*(3*3*2)];

         }

         j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;

         break;

       }

     case 7:

       { //-T direction

         int new_x4 = (x4 - d + X4)% X4;

         if (x4 -d < 0){

           ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);

           int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;

           return &ghostGaugeField[offset*(3*3*2)];

         }

         j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;

         break;

       }//7


     default: j = -1; printf("ERROR: wrong dir \n"); exit(1);

     }

     gaugeField = (oddBit ? gaugeEven : gaugeOdd);


   }


   return &gaugeField[dir/2][j*(3*3*2)];

 }


 template <typename Float>

 static inline Float *spinorNeighbor_mg4dir(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor,

                                            Float** back_nbr_spinor, int neighbor_distance, int nFace)

 {

   int j;

   int nb = neighbor_distance;

   int Y = fullLatticeIndex(i, oddBit);

   int x4 = Y/(Z[2]*Z[1]*Z[0]);

   int x3 = (Y/(Z[1]*Z[0])) % Z[2];

   int x2 = (Y/Z[0]) % Z[1];

   int x1 = Y % Z[0];

   int X1= Z[0];

   int X2= Z[1];

   int X3= Z[2];

   int X4= Z[3];


   switch (dir) {

   case 0://+X

     {

       int new_x1 = (x1 + nb)% X1;

       if(x1+nb >=X1){

         int offset = ( x1 + nb -X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;

         return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;

       break;


     }

   case 1://-X

     {

       int new_x1 = (x1 - nb + X1)% X1;

       if(x1 - nb < 0){

         int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;

         return back_nbr_spinor[0] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;

       break;

     }

   case 2://+Y

     {

       int new_x2 = (x2 + nb)% X2;

       if(x2+nb >=X2){

         int offset = ( x2 + nb -X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;

         return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;

       break;

     }

   case 3:// -Y

     {

       int new_x2 = (x2 - nb + X2)% X2;

       if(x2 - nb < 0){

         int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;

         return back_nbr_spinor[1] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;

       break;

     }

   case 4://+Z

     {

       int new_x3 = (x3 + nb)% X3;

       if(x3+nb >=X3){

         int offset = ( x3 + nb -X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;

         return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;

       break;

     }

   case 5://-Z

     {

       int new_x3 = (x3 - nb + X3)% X3;

       if(x3 - nb < 0){

         int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;

         return back_nbr_spinor[2] + offset*mySpinorSiteSize;

       }

       j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;

       break;

     }

   case 6://+T

     {

       j = neighborIndex_mg(i, oddBit, +nb, 0, 0, 0);

       int x4 = x4_mg(i, oddBit);

       if ( (x4 + nb) >= Z[3]){

         int offset = (x4+nb - Z[3])*Vsh_t;

         return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];

       }

       break;

     }

   case 7://-T

     {

       j = neighborIndex_mg(i, oddBit, -nb, 0, 0, 0);

       int x4 = x4_mg(i, oddBit);

       if ( (x4 - nb) < 0){

         int offset = ( x4 - nb +nFace)*Vsh_t;

         return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];

       }

       break;

     }

   default: j = -1; printf("ERROR: wrong dir\n"); exit(1);

   }


   return &spinorField[j*(mySpinorSiteSize)];

 }


 #endif // MULTI_GPU


 #endif // _DSLASH_UTIL_H

quda::neighborIndex
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
Definition: shift_quark_field.cu:41

X2
__constant__ int X2
Definition: dslash_constants.h:107

y
int y[4]
Definition: staggered_dslash_core.h:356

quda::axpby
void axpby(const Float &a, const Float *x, const Float &b, Float *y, const int N)
Definition: blas_cpu.cpp:8

test_util.h

X1
__constant__ int X1
Definition: dslash_constants.h:106

mat
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: staggered_dslash_reference.cpp:136

x2
int x2
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:393

Vsh_t
#define Vsh_t
Definition: llfat_core.h:4

x4
int x4
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:393

ax
void ax(double a, void *x, int len, QudaPrecision precision)
Definition: blas_reference.cpp:22

fullLatticeIndex
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: test_util.cpp:400

x1
int x1
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:393

testing::internal::Float
FloatingPoint< float > Float
Definition: gtest.h:7350

mySpinorSiteSize
#define mySpinorSiteSize
Definition: staggered_invert_test.cpp:28

norm2
double norm2(Float *v, int len)
Definition: blas_reference.cpp:41

x
int x[4]
Definition: hisq_paths_force_core.h:99

neighborIndex_mg
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:485

axpy
void axpy(double a, void *x, void *y, int len, QudaPrecision precision)
Definition: blas_reference.cpp:11

Y
int Y[4]
Definition: tm_ndeg_fused_exterior_dslash_core.h:197

Z
int Z[4]
Definition: test_util.cpp:28

X3
__constant__ int X3
Definition: dslash_constants.h:108

x3
int x3
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:393

oddBit
int oddBit
Definition: hisq_paths_force_core.h:263

X4
__constant__ int X4
Definition: dslash_constants.h:109