v0.9.0/doc/dslash__util_8h_source.html

 #ifndef _DSLASH_UTIL_H
 #define _DSLASH_UTIL_H

 #include <test_util.h>
 #include <comm_quda.h>

 // Element-wise addition: dst[k] = a[k] + b[k] for k = 0 .. cnt-1.
 // Elements are processed in increasing index order; nothing is written
 // when cnt <= 0.
 template <typename Float>
 static inline void sum(Float *dst, Float *a, Float *b, int cnt) {
   int k = 0;
   while (k < cnt) {
     dst[k] = a[k] + b[k];
     ++k;
   }
 }

 // Element-wise subtraction: dst[k] = a[k] - b[k] for k = 0 .. cnt-1.
 // Nothing is written when cnt <= 0.
 template <typename Float>
 static inline void sub(Float *dst, Float *a, Float *b, int cnt) {
   for (int k = 0; k < cnt; ++k) {
     const Float minuend = a[k];
     const Float subtrahend = b[k];
     dst[k] = minuend - subtrahend;
   }
 }

 // Scales a vector: dst[k] = a * x[k] for k = 0 .. cnt-1.
 // Nothing is written when cnt <= 0.
 template <typename Float>
 static inline void ax(Float *dst, Float a, Float *x, int cnt) {
   int k = 0;
   while (k < cnt) {
     dst[k] = a * x[k];
     k++;
   }
 }

 // In-place update y[k] = a*x[k] + y[k] for k = 0 .. len-1.
 // x is only read; y is overwritten with the result.
 template <typename Float>
 static inline void axpy(Float a, Float *x, Float *y, int len) {
   for (int k = 0; k < len; ++k) {
     const Float scaled = a*x[k];
     y[k] = scaled + y[k];
   }
 }

 // In-place update y[k] = a*x[k] + b*y[k] for k = 0 .. len-1.
 // x is only read; y is overwritten with the result.
 template <typename Float>
 static inline void axpby(Float a, Float *x, Float b, Float *y, int len) {
   int k = 0;
   while (k < len) {
     y[k] = a*x[k] + b*y[k];
     ++k;
   }
 }

 // In-place update y[k] = a*x[k] - y[k] for k = 0 .. len-1.
 // Note the argument order: the vector x comes before the scalar a.
 template <typename Float>
 static inline void axmy(Float *x, Float a, Float *y, int len) {
   for (int k = 0; k < len; ++k) {
     const Float scaled = a*x[k];
     y[k] = scaled - y[k];
   }
 }

 // Returns the sum of squares of the first len elements of v, accumulated
 // in a double.  Each product v[k]*v[k] is formed in the element type
 // before it is added to the accumulator.  Returns 0.0 when len <= 0.
 template <typename Float>
 static double norm2(Float *v, int len) {
   double total = 0.0;
   int k = 0;
   while (k < len) {
     total += v[k]*v[k];
     ++k;
   }
   return total;
 }

 // Negates the first len elements of x in place: x[k] = -x[k].
 template <typename Float>
 static inline void negx(Float *x, int len) {
   int k = 0;
   while (k < len) {
     x[k] = -x[k];
     ++k;
   }
 }

 // Complex dot product of two 3-component vectors stored as interleaved
 // (re, im) pairs: res = sum over m of a[m] * b[m], with no complex conjugation.
 // res[0] receives the real part and res[1] the imaginary part.
 // Elements of a are converted to sFloat before any arithmetic is done.
 template <typename sFloat, typename gFloat>
 static inline void dot(sFloat* res, gFloat* a, sFloat* b) {
   res[1] = 0;
   res[0] = 0;
   for (int color = 0; color < 3; ++color) {
     const int re = 2*color;
     const int im = re + 1;
     sFloat ar = a[re];
     sFloat ai = a[im];
     sFloat br = b[re];
     sFloat bi = b[im];
     res[0] += ar * br - ai * bi;
     res[1] += ar * bi + ai * br;
   }
 }

 // Writes the conjugate transpose of a 3x3 complex matrix into res.
 // Both matrices are row-major with interleaved (re, im) pairs, 18 values
 // each: res(m,n) = conj(mat(n,m)), i.e. the real part is copied and the
 // imaginary part is negated.
 template <typename Float>
 static inline void su3Transpose(Float *res, Float *mat) {
   const int reim = 2;          // values per complex entry
   const int rowLen = 3*reim;   // values per matrix row
   for (int r = 0; r < 3; ++r) {
     for (int c = 0; c < 3; ++c) {
       Float *out = res + r*rowLen + c*reim;
       Float *in = mat + c*rowLen + r*reim;
       out[0] = in[0];
       out[1] = -in[1];
     }
   }
 }


 // Matrix-vector product res = mat * vec for a 3x3 complex matrix (row-major,
 // 18 values) and a 3-component complex vector (6 values).  Each output
 // component is the dot() of one matrix row with vec.
 template <typename sFloat, typename gFloat>
 static inline void su3Mul(sFloat *res, gFloat *mat, sFloat *vec) {
   int row = 0;
   while (row < 3) {
     dot(res + row*2, mat + row*(3*2), vec);
     ++row;
   }
 }

 // Computes res = su3Transpose(mat) * vec: mat is first passed through
 // su3Transpose() into a local 18-value buffer, and that buffer is then
 // applied to vec with su3Mul().  mat itself is not modified.
 template <typename sFloat, typename gFloat>
 static inline void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec) {
   gFloat flipped[3*3*2];
   su3Transpose(flipped, mat);
   su3Mul(res, flipped, vec);
 }


 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
 //
 // the displacements, such as dx, refer to the full lattice coordinates.
 //
 // neighborIndex() takes a "half index", displaces it, and returns the
 // new "half index", which can be an index into either the even or odd lattices.
 // displacements of magnitude one always interchange odd and even lattices.
 //


 // Returns a pointer to the first of the 18 values (3x3 complex) of a gauge
 // link, taken from array dir/2 of one of the two parity fields.
 //
 //  - even dir: the link stored at half-lattice site i of the field selected
 //    by oddBit (gaugeOdd when oddBit is non-zero, otherwise gaugeEven).
 //  - dir 1, 3, 5, 7: the link stored at the site neighborIndex() returns for
 //    a displacement of -nbr_distance in its last, third, second or first
 //    displacement argument respectively, taken from the opposite-parity field.
 //
 // NOTE(review): any other odd dir leaves the site index at -1, so the
 // returned pointer is not valid to dereference; callers must pass dir in 0..7.
 template <typename Float>
 static inline Float *gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance) {
   const int linkSize = 3*3*2;

   if (dir % 2 == 0) {
     Float **sameParity = oddBit ? gaugeOdd : gaugeEven;
     return &sameParity[dir/2][i*linkSize];
   }

   int site = -1;
   if (dir == 1 || dir == 3 || dir == 5 || dir == 7) {
     // Displacements in neighborIndex() argument order; only one is non-zero.
     int shift[4] = {0, 0, 0, 0};
     shift[3 - dir/2] = -nbr_distance;
     site = neighborIndex(i, oddBit, shift[0], shift[1], shift[2], shift[3]);
   }

   Float **otherParity = oddBit ? gaugeEven : gaugeOdd;
   return &otherParity[dir/2][site*linkSize];
 }

 // Returns a pointer to the spinor stored at the neighbor of half-lattice
 // site i in direction dir, at distance neighbor_distance.
 //
 // dir 2k is a displacement of +neighbor_distance and dir 2k+1 of
 // -neighbor_distance; k = 0, 1, 2, 3 selects the last, third, second and
 // first displacement argument of neighborIndex() respectively.
 // The result points mySpinorSiteSize values per site into spinorField.
 //
 // NOTE(review): for dir outside 0..7 the site index stays at -1 and the
 // returned pointer is not valid to dereference.
 template <typename Float>
 static inline Float *spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)
 {
   int site = -1;
   if (dir >= 0 && dir <= 7) {
     // Displacements in neighborIndex() argument order; only one is non-zero.
     int shift[4] = {0, 0, 0, 0};
     shift[3 - dir/2] = (dir % 2 == 0) ? +neighbor_distance : -neighbor_distance;
     site = neighborIndex(i, oddBit, shift[0], shift[1], shift[2], shift[3]);
   }

   return &spinorField[site*(mySpinorSiteSize)];
 }


 // Five-dimensional neighbor lookup.
 //
 // i is a "half index" into the even or odd half lattice (oddBit = 0 / 1).
 // It is expanded to a full lattice index with fullLatticeIndex_5d() when
 // type == QUDA_5D_PC and with fullLatticeIndex_5d_4dpc() otherwise, split
 // into coordinates (xs, x4, x3, x2, x1) using the extents Z[3..0] and Ls,
 // displaced by (dxs, dx4, dx3, dx2, dx1) with periodic wrap-around, and
 // converted back to a half index.
 //
 // The displacements refer to full lattice coordinates.  Each extent is added
 // once before taking the remainder so that a displacement no smaller than
 // minus one extent cannot produce a negative remainder.
 //
 // NOTE(review): the original author flagged that this may not handle 5-d
 // preconditioning correctly; that caveat has not been resolved here.
 template<QudaDWFPCType type>
 int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1) {
   int full;
   if (type == QUDA_5D_PC) {
     full = fullLatticeIndex_5d(i, oddBit);
   } else {
     full = fullLatticeIndex_5d_4dpc(i, oddBit);
   }

   // Strides of the linearized full index.
   const int stride2 = Z[0];
   const int stride3 = Z[1]*Z[0];
   const int stride4 = Z[2]*Z[1]*Z[0];
   const int strideS = Z[3]*Z[2]*Z[1]*Z[0];

   // Decompose, displace and wrap each coordinate.
   const int xs = (full/strideS + dxs + Ls) % Ls;
   const int x4 = ((full/stride4) % Z[3] + dx4 + Z[3]) % Z[3];
   const int x3 = ((full/stride3) % Z[2] + dx3 + Z[2]) % Z[2];
   const int x2 = ((full/stride2) % Z[1] + dx2 + Z[1]) % Z[1];
   const int x1 = (full % Z[0] + dx1 + Z[0]) % Z[0];

   // Re-linearize; halving (integer division) yields the half index.
   const int displaced = xs*strideS + x4*stride4 + x3*stride3 + x2*stride2 + x1;
   return displaced / 2;
 }


 // Returns a pointer to the spinor stored at the 5-d neighbor of half-lattice
 // site i in direction dir, at distance neighbor_distance, with siteSize
 // values per site.
 //
 // dir 2k is a displacement of +neighbor_distance and dir 2k+1 of
 // -neighbor_distance; k = 0 .. 4 selects dx1, dx2, dx3, dx4 and dxs of
 // neighborIndex_5d<type>() respectively.
 //
 // NOTE(review): for dir outside 0..9 the site index stays at -1 and the
 // returned pointer is not valid to dereference.
 template <QudaDWFPCType type, typename Float>
   Float *spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance=1, int siteSize=24) {
   int site = -1;
   if (dir >= 0 && dir <= 9) {
     // Displacements ordered (dxs, dx4, dx3, dx2, dx1); only one is non-zero.
     int shift[5] = {0, 0, 0, 0, 0};
     shift[4 - dir/2] = (dir % 2 == 0) ? +neighbor_distance : -neighbor_distance;
     site = neighborIndex_5d<type>(i, oddBit, shift[0], shift[1], shift[2], shift[3], shift[4]);
   }
   return &spinorField[site*siteSize];
 }


 #ifdef MULTI_GPU

 // Returns the slowest-running coordinate of half-lattice site i: its full
 // lattice index divided by the product Z[2]*Z[1]*Z[0].
 static inline int
 x4_mg(int i, int oddBit)
 {
   const int sliceVolume = Z[2]*Z[1]*Z[0];
   return fullLatticeIndex(i, oddBit) / sliceVolume;
 }

 // Multi-GPU variant of gaugeLink(): returns a pointer to the first of the
 // 18 values of the gauge link used when hopping from half-lattice site i in
 // direction dir.
 //
 //  - even dir: the link at site i of the field selected by oddBit.
 //  - dir 1, 3, 5, 7 (-X, -Y, -Z, -T): the link at the site displaced by
 //    -nbr_distance.  When the displaced coordinate is negative and
 //    comm_dim_partitioned() reports that dimension as partitioned, the link
 //    is taken from the opposite-parity ghost array for that dimension,
 //    indexed with n_ghost_faces; otherwise the coordinate wraps periodically
 //    and the link comes from the opposite-parity local field.
 //  - any other odd dir: prints an error and calls exit(1).
 template <typename Float>
 static inline Float *gaugeLink_mg4dir(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd,
       Float** ghostGaugeEven, Float** ghostGaugeOdd, int n_ghost_faces, int nbr_distance) {
   const int linkSize = 3*3*2;

   if (dir % 2 == 0) {
     Float **sameParity = (oddBit ? gaugeOdd : gaugeEven);
     return &sameParity[dir/2][i*linkSize];
   }

   const int hop = nbr_distance;
   const int full = fullLatticeIndex(i, oddBit);

   // Local extents and the coordinates of site i.
   const int X1 = Z[0];
   const int X2 = Z[1];
   const int X3 = Z[2];
   const int X4 = Z[3];
   const int x1 = full % X1;
   const int x2 = (full/X1) % X2;
   const int x3 = (full/(X2*X1)) % X3;
   const int x4 = full/(X3*X2*X1);

   // Backward hops read from the opposite parity, both locally and in the ghost zone.
   Float **ghost = (oddBit ? ghostGaugeEven : ghostGaugeOdd);
   int site;

   switch (dir) {
   case 1: // -X direction
     if (x1 - hop < 0 && comm_dim_partitioned(0)) {
       const int offset = (n_ghost_faces + x1 - hop)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2 + x2)/2;
       return &ghost[0][offset*linkSize];
     }
     site = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + (x1 - hop + X1) % X1) / 2;
     break;

   case 3: // -Y direction
     if (x2 - hop < 0 && comm_dim_partitioned(1)) {
       const int offset = (n_ghost_faces + x2 - hop)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1 + x1)/2;
       return &ghost[1][offset*linkSize];
     }
     site = (x4*X3*X2*X1 + x3*X2*X1 + ((x2 - hop + X2) % X2)*X1 + x1) / 2;
     break;

   case 5: // -Z direction
     if (x3 - hop < 0 && comm_dim_partitioned(2)) {
       const int offset = (n_ghost_faces + x3 - hop)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1 + x1)/2;
       return &ghost[2][offset*linkSize];
     }
     site = (x4*X3*X2*X1 + ((x3 - hop + X3) % X3)*X2*X1 + x2*X1 + x1) / 2;
     break;

   case 7: // -T direction
     if (x4 - hop < 0 && comm_dim_partitioned(3)) {
       const int offset = (n_ghost_faces + x4 - hop)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1 + x1)/2;
       return &ghost[3][offset*linkSize];
     }
     site = (((x4 - hop + X4) % X4)*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
     break;

   default:
     site = -1;
     printf("ERROR: wrong dir \n");
     exit(1);
   }

   Float **otherParity = (oddBit ? gaugeEven : gaugeOdd);
   return &otherParity[dir/2][site*linkSize];
 }

 // Multi-GPU variant of spinorNeighbor(): returns a pointer to the spinor at
 // the neighbor of half-lattice site i in direction dir, at distance
 // neighbor_distance.
 //
 // dir: 0..7 = +X, -X, +Y, -Y, +Z, -Z, +T, -T.  Any other value prints an
 //      error and calls exit(1).
 //
 // When the displaced coordinate leaves the local extent [0, Z[dim]) and
 // comm_dim_partitioned(dim) is non-zero, the pointer refers to
 // fwd_nbr_spinor[dim] (forward hops) or back_nbr_spinor[dim] (backward
 // hops).  Otherwise the coordinate wraps periodically and the pointer refers
 // to spinorField.  All offsets are in units of mySpinorSiteSize values.
 //
 // nFace enters only the backward ghost offsets; presumably it is the depth
 // of the ghost zone in sites -- TODO confirm against the callers.
 template <typename Float>
 static inline Float *spinorNeighbor_mg4dir(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor,
              Float** back_nbr_spinor, int neighbor_distance, int nFace)
 {
   int j;
   int nb = neighbor_distance;
   // Full lattice index of site i and its coordinates (x1 runs fastest).
   int Y = fullLatticeIndex(i, oddBit);
   int x4 = Y/(Z[2]*Z[1]*Z[0]);
   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
   int x2 = (Y/Z[0]) % Z[1];
   int x1 = Y % Z[0];
   // Local lattice extents.
   int X1= Z[0];
   int X2= Z[1];
   int X3= Z[2];
   int X4= Z[3];

   switch (dir) {
   case 0://+X
     {
       int new_x1 = (x1 + nb)% X1;
       if(x1+nb >=X1 && comm_dim_partitioned(0) ){
         // Ghost index: layer beyond the boundary, then the remaining
         // three coordinates linearized and halved.
         int offset = ( x1 + nb -X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
         return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
       break;
     }
   case 1://-X
     {
       int new_x1 = (x1 - nb + X1)% X1;
       if(x1 - nb < 0 && comm_dim_partitioned(0)){
         // x1 - nb is negative here; adding nFace gives the layer index used.
         int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
         return back_nbr_spinor[0] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
       break;
     }
   case 2://+Y
     {
       int new_x2 = (x2 + nb)% X2;
       if(x2+nb >=X2 && comm_dim_partitioned(1)){
         int offset = ( x2 + nb -X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
         return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
       break;
     }
   case 3:// -Y
     {
       int new_x2 = (x2 - nb + X2)% X2;
       if(x2 - nb < 0 && comm_dim_partitioned(1)){
         int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
         return back_nbr_spinor[1] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
       break;
     }
   case 4://+Z
     {
       int new_x3 = (x3 + nb)% X3;
       if(x3+nb >=X3 && comm_dim_partitioned(2)){
         int offset = ( x3 + nb -X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
         return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
       break;
     }
   case 5://-Z
     {
       int new_x3 = (x3 - nb + X3)% X3;
       if(x3 - nb < 0 && comm_dim_partitioned(2)){
         int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
         return back_nbr_spinor[2] + offset*mySpinorSiteSize;
       }
       j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
       break;
     }
   case 6://+T
     {
       // Unlike the spatial cases, the T direction delegates the index
       // computation to neighborIndex_mg(); j is added to the layer offset
       // when the ghost buffer is used.
       j = neighborIndex_mg(i, oddBit, +nb, 0, 0, 0);
       // This x4 shadows the outer x4.
       int x4 = x4_mg(i, oddBit);
       if ( (x4 + nb) >= Z[3]  && comm_dim_partitioned(3)){
         // NOTE(review): Vsh_t is presumably the half volume of one time
         // slice -- confirm in test_util.cpp.
         int offset = (x4+nb - Z[3])*Vsh_t;
         return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
       }
       break;
     }
   case 7://-T
     {
       j = neighborIndex_mg(i, oddBit, -nb, 0, 0, 0);
       // This x4 shadows the outer x4.
       int x4 = x4_mg(i, oddBit);
       if ( (x4 - nb) < 0 && comm_dim_partitioned(3)){
         int offset = ( x4 - nb +nFace)*Vsh_t;
         return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
       }
       break;
     }
   default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
   }

   // Neighbor is inside the local lattice (or the dimension is not partitioned).
   return &spinorField[j*(mySpinorSiteSize)];
 }

 // Multi-GPU five-dimensional neighbor lookup.
 //
 // i is a half index into the even/odd half lattice selected by oddBit.  It
 // is expanded with fullLatticeIndex_5d() when type == QUDA_5D_PC and with
 // fullLatticeIndex_5d_4dpc() otherwise, split into (xs, x4, x3, x2, x1),
 // displaced by (dxs, dx4, dx3, dx2, dx1) with periodic wrap-around, and
 // converted back to a half index.
 //
 // Return value:
 //  - if the displaced x4 stays inside [0, Z[3]) or dimension 3 is not
 //    partitioned: the half index of the displaced site in the local 5-d
 //    lattice;
 //  - otherwise (the hop crosses a partitioned T boundary): a half index
 //    built from (xs, x3, x2, x1) only, i.e. without the x4 term.
 //
 // The range test must be applied to the unwrapped coordinate ghost_x4.  The
 // previous condition "(ghost_x4 >= 0 && ghost_x4) < Z[3]" compared a 0/1
 // boolean against Z[3], so the second branch was effectively unreachable.
 template<QudaDWFPCType type>
 int neighborIndex_5d_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
 {
   int ret;

   int Y = (type == QUDA_5D_PC) ? fullLatticeIndex_5d(i, oddBit) : fullLatticeIndex_5d_4dpc(i, oddBit);

   // Coordinates of site i (x1 runs fastest, xs slowest).
   int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
   int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
   int x2 = (Y/Z[0]) % Z[1];
   int x1 = Y % Z[0];

   // Displaced T coordinate before wrapping; tells us whether the hop
   // leaves the local lattice.
   int ghost_x4 = x4+ dx4;

   // Displace and wrap; the extent is added so the remainder stays non-negative.
   xs = (xs+dxs+Ls) % Ls;
   x4 = (x4+dx4+Z[3]) % Z[3];
   x3 = (x3+dx3+Z[2]) % Z[2];
   x2 = (x2+dx2+Z[1]) % Z[1];
   x1 = (x1+dx1+Z[0]) % Z[0];

   if ( (ghost_x4 >= 0 && ghost_x4 < Z[3]) || !comm_dim_partitioned(3)){
     ret = (xs*Z[3]*Z[2]*Z[1]*Z[0] + x4*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
   }else{
     ret = (xs*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
   }

   return ret;
 }

 // Returns the x4 coordinate of half-lattice site i in the 5-d lattice:
 // the full index (from fullLatticeIndex_5d() when type == QUDA_5D_PC,
 // otherwise from fullLatticeIndex_5d_4dpc()) divided by Z[2]*Z[1]*Z[0],
 // modulo Z[3].
 template <QudaDWFPCType type>
 int x4_5d_mgpu(int i, int oddBit)
 {
   int full;
   if (type == QUDA_5D_PC) {
     full = fullLatticeIndex_5d(i, oddBit);
   } else {
     full = fullLatticeIndex_5d_4dpc(i, oddBit);
   }
   const int sliceVolume = Z[2]*Z[1]*Z[0];
   return (full/sliceVolume) % Z[3];
 }


 // Multi-GPU five-dimensional variant of spinorNeighbor(): returns a pointer
 // to the spinor at the neighbor of half-lattice site i in direction dir, at
 // distance neighbor_distance, with spinorSize values per site.
 //
 // dir: 0..7 = +X, -X, +Y, -Y, +Z, -Z, +T, -T.  Any other value prints an
 //      error and calls exit(1).  The fifth dimension is never displaced here.
 //
 // When the displaced coordinate leaves the local extent [0, Z[dim]) and
 // comm_dim_partitioned(dim) is non-zero, the pointer refers to
 // fwd_nbr_spinor[dim] (forward hops) or back_nbr_spinor[dim] (backward
 // hops); the ghost offset linearizes (layer, xs, remaining coordinates) and
 // halves the result.  Otherwise the coordinate wraps periodically and the
 // pointer refers to spinorField.
 //
 // nFace enters only the backward ghost offsets; presumably it is the depth
 // of the ghost zone in sites -- TODO confirm against the callers.
 template <QudaDWFPCType type, typename Float>
 Float *spinorNeighbor_5d_mgpu(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor, Float** back_nbr_spinor, int neighbor_distance, int nFace, int spinorSize = 24)
 {
   int j;
   int nb = neighbor_distance;
   // Full lattice index; which expansion is used depends on the
   // preconditioning type given as template argument.
   int Y = (type == QUDA_5D_PC) ? fullLatticeIndex_5d(i, oddBit) : fullLatticeIndex_5d_4dpc(i, oddBit);

   // Coordinates of site i (x1 runs fastest, xs slowest).
   int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
   int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
   int x2 = (Y/Z[0]) % Z[1];
   int x1 = Y % Z[0];

   // Local lattice extents.
   int X1= Z[0];
   int X2= Z[1];
   int X3= Z[2];
   int X4= Z[3];
   switch (dir) {
   case 0://+X
     {
       int new_x1 = (x1 + nb)% X1;
       if(x1+nb >=X1 && comm_dim_partitioned(0)) {
         // Ghost index: (layer beyond the boundary, xs, x4, x3, x2), halved.
         int offset = ((x1 + nb -X1)*Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
         return fwd_nbr_spinor[0] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
       break;
     }
   case 1://-X
     {
       int new_x1 = (x1 - nb + X1)% X1;
       if(x1 - nb < 0 && comm_dim_partitioned(0)) {
         // x1 - nb is negative here; adding nFace gives the layer index used.
         int offset = (( x1+nFace- nb)*Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
         return back_nbr_spinor[0] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
       break;
     }
   case 2://+Y
     {
       int new_x2 = (x2 + nb)% X2;
       if(x2+nb >=X2 && comm_dim_partitioned(1)) {
         int offset = (( x2 + nb -X2)*Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
         return fwd_nbr_spinor[1] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
       break;
     }
   case 3:// -Y
     {
       int new_x2 = (x2 - nb + X2)% X2;
       if(x2 - nb < 0 && comm_dim_partitioned(1)) {
         int offset = (( x2 + nFace -nb)*Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
         return back_nbr_spinor[1] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
       break;
     }
   case 4://+Z
     {
       int new_x3 = (x3 + nb)% X3;
       if(x3+nb >=X3 && comm_dim_partitioned(2)) {
         int offset = (( x3 + nb -X3)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
         return fwd_nbr_spinor[2] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
       break;
     }
   case 5://-Z
     {
       int new_x3 = (x3 - nb + X3)% X3;
       if(x3 - nb < 0 && comm_dim_partitioned(2)){
         int offset = (( x3 + nFace -nb)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
         return back_nbr_spinor[2] + offset*spinorSize;
       }
       j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
       break;
     }
   case 6://+T
     {
       // This x4 shadows the outer x4.
       int x4 = x4_5d_mgpu<type>(i, oddBit);
       if ( (x4 + nb) >= Z[3] && comm_dim_partitioned(3)) {
         int offset = ((x4 + nb - Z[3])*Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
         return fwd_nbr_spinor[3] + offset*spinorSize;
       }
       // Reached only when the hop stays local or T is not partitioned.
       j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, +nb, 0, 0, 0);
       break;
     }
   case 7://-T
     {
       // This x4 shadows the outer x4.
       int x4 = x4_5d_mgpu<type>(i, oddBit);
       if ( (x4 - nb) < 0 && comm_dim_partitioned(3)) {
         int offset = (( x4 - nb +nFace)*Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
         return back_nbr_spinor[3] + offset*spinorSize;
       }
       // Reached only when the hop stays local or T is not partitioned.
       j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, -nb, 0, 0, 0);
       break;
     }
   default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
   }

   // Neighbor is inside the local lattice (or the dimension is not partitioned).
   return &spinorField[j*(spinorSize)];
 }


 #endif // MULTI_GPU

 #endif // _DSLASH_UTIL_H
quda::neighborIndex
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
Definition: shift_quark_field.cu:41

sum
static void sum(Float *dst, Float *a, Float *b, int cnt)
Definition: dslash_util.h:8

spinorNeighbor_5d
Float * spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance=1, int siteSize=24)
Definition: dslash_util.h:184

norm2
static double norm2(Float *v, int len)
Definition: dslash_util.h:44

test_util.h

deg_tm_dslash_cuda_gen.a_re
def a_re(b, s, c)
Definition: deg_tm_dslash_cuda_gen.py:127

sub
static void sub(Float *dst, Float *a, Float *b, int cnt)
Definition: dslash_util.h:14

fullLatticeIndex_5d_4dpc
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
Definition: test_util.cpp:697

axmy
static void axmy(Float *x, Float a, Float *y, int len)
Definition: dslash_util.h:39

ret
return ret
Definition: CMakeCUDACompilerId.cpp1.ii:13161

cnt
static unsigned cnt
Definition: CMakeCUDACompilerId.cpp1.ii:12978

exit
void exit(int) __attribute__((noreturn))

axpby
static void axpby(Float a, Float *x, Float b, Float *y, int len)
Definition: dslash_util.h:33

spinorNeighbor
static Float * spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)
Definition: dslash_util.h:127

offset
size_t size_t offset
Definition: CMakeCUDACompilerId.cpp1.ii:2497

fullLatticeIndex
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: test_util.cpp:442

Ls
int Ls
Definition: test_util.cpp:39

comm_quda.h

b
#define b
Definition: dw_dslash4_core.h:83

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

Vsh_t
int Vsh_t
Definition: test_util.cpp:31

printf
int printf(const char *,...) __attribute__((__format__(__printf__

mySpinorSiteSize
#define mySpinorSiteSize
Definition: staggered_invert_test.cpp:23

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

Z
int Z[4]
Definition: test_util.cpp:27

ax
static void ax(Float *dst, Float a, Float *x, int cnt)
Definition: dslash_util.h:20

neighborIndex_5d
int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
Definition: dslash_util.h:158

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

neighborIndex_mg
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:527

gaugeLink
static Float * gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance)
Definition: dslash_util.h:104

fullLatticeIndex_5d
int fullLatticeIndex_5d(int i, int oddBit)
Definition: test_util.cpp:692

QUDA_5D_PC
Definition: enum_quda.h:361

n
int n
Definition: CMakeCUDACompilerId.cpp1.ii:8086

su3Mul
static void su3Mul(sFloat *res, gFloat *mat, sFloat *vec)
Definition: dslash_util.h:80

su3Tmul
static void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec)
Definition: dslash_util.h:85

mat
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: covdev_reference.cpp:117

d
static __inline__ size_t size_t d
Definition: CMakeCUDACompilerId.cpp1.ii:3019

axpy
static void axpy(Float a, Float *x, Float *y, int len)
Definition: dslash_util.h:27

a
#define a
Definition: dw_dslash4_core.h:82

deg_tm_dslash_cuda_gen.a_im
def a_im(b, s, c)
Definition: deg_tm_dslash_cuda_gen.py:128

comm_dim_partitioned
int comm_dim_partitioned(int dim)
Definition: comm_common.cpp:597

negx
static void negx(Float *x, int len)
Definition: dslash_util.h:51

su3Transpose
static void su3Transpose(Float *res, Float *mat)
Definition: dslash_util.h:69

y
int y
Definition: CMakeCUDACompilerId.cpp1.ii:2637

dot
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56

len
int len
Definition: CMakeCUDACompilerId.cpp1.ii:2352