QUDA  v0.7.0
A library for QCD on GPUs
test_util.cpp
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 #if defined(QMP_COMMS)
8 #include <qmp.h>
9 #elif defined(MPI_COMMS)
10 #include <mpi.h>
11 #endif
12 
14 #include <test_util.h>
15 
16 #include <face_quda.h>
17 #include <dslash_quda.h>
18 #include "misc.h"
19 
20 using namespace std;
21 
22 #define XUP 0
23 #define YUP 1
24 #define ZUP 2
25 #define TUP 3
26 
27 
28 int Z[4];
29 int V;
30 int Vh;
31 int Vs_x, Vs_y, Vs_z, Vs_t;
32 int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
33 int faceVolume[4];
34 
35 // extended volume: each dimension padded by 4 (a 2-site border on each side)
36 int E1, E1h, E2, E3, E4;
37 int E[4];
38 int V_ex, Vh_ex;
39 
40 int Ls;
41 int V5;
42 int V5h;
43 
44 int mySpinorSiteSize = 24;
45 
46 extern float fat_link_max;
47 
48 void initComms(int argc, char **argv, const int *commDims)
49 {
50 #if defined(QMP_COMMS)
51  QMP_thread_level_t tl;
52  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
53 
54  // FIXME? - tests crash without this
55  QMP_declare_logical_topology(commDims, 4);
56 
57 #elif defined(MPI_COMMS)
58  MPI_Init(&argc, &argv);
59 #endif
60  initCommsGridQuda(4, commDims, NULL, NULL);
61  initRand();
62 }
63 
64 
65 void finalizeComms()
66 {
67 #if defined(QMP_COMMS)
68  QMP_finalize_msg_passing();
69 #elif defined(MPI_COMMS)
70  MPI_Finalize();
71 #endif
72 }
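// Illustrative driver sketch (hypothetical, not part of this file): a typical QUDA
// test wires the helpers in this file together roughly as follows.
#if 0
int main(int argc, char **argv)
{
  // parse --xgridsize/--ygridsize/... first so gridsize_from_cmdline is filled in
  for (int i = 1; i < argc; i++) {
    if (process_command_line_option(argc, argv, &i) != 0) usage(argv);
  }
  initComms(argc, argv, gridsize_from_cmdline); // starts QMP/MPI and seeds rand()
  // ... set up QudaGaugeParam/QudaInvertParam and run the test ...
  finalizeComms();
  return 0;
}
#endif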
73 
74 
75 void initRand()
76 {
77  int rank = 0;
78 
79 #if defined(QMP_COMMS)
80  rank = QMP_get_node_number();
81 #elif defined(MPI_COMMS)
82  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
83 #endif
84 
85  srand(17*rank + 137);
86 }
87 
88 void setDims(int *X) {
89  V = 1;
90  for (int d=0; d< 4; d++) {
91  V *= X[d];
92  Z[d] = X[d];
93 
94  faceVolume[d] = 1;
95  for (int i=0; i<4; i++) {
96  if (i==d) continue;
97  faceVolume[d] *= X[i];
98  }
99  }
100  Vh = V/2;
101 
102  Vs_x = X[1]*X[2]*X[3];
103  Vs_y = X[0]*X[2]*X[3];
104  Vs_z = X[0]*X[1]*X[3];
105  Vs_t = X[0]*X[1]*X[2];
106 
107  Vsh_x = Vs_x/2;
108  Vsh_y = Vs_y/2;
109  Vsh_z = Vs_z/2;
110  Vsh_t = Vs_t/2;
111 
112 
113  E1=X[0]+4; E2=X[1]+4; E3=X[2]+4; E4=X[3]+4;
114  E1h=E1/2;
115  E[0] = E1;
116  E[1] = E2;
117  E[2] = E3;
118  E[3] = E4;
119  V_ex = E1*E2*E3*E4;
120  Vh_ex = V_ex/2;
121 
122 }
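// Worked example (illustrative): for a local 8x8x8x8 lattice, setDims() produces
// V = 4096, Vh = 2048, faceVolume[d] = 512 for every d, Vs_* = 512 and Vsh_* = 256
// in each direction, and extended dimensions E[d] = 12, hence V_ex = 20736 and
// Vh_ex = 10368.
#if 0
  int X[4] = {8, 8, 8, 8};
  setDims(X);
#endif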
123 
124 
125 void dw_setDims(int *X, const int L5)
126 {
127  V = 1;
128  for (int d=0; d< 4; d++)
129  {
130  V *= X[d];
131  Z[d] = X[d];
132 
133  faceVolume[d] = 1;
134  for (int i=0; i<4; i++) {
135  if (i==d) continue;
136  faceVolume[d] *= X[i];
137  }
138  }
139  Vh = V/2;
140 
141  Ls = L5;
142  V5 = V*Ls;
143  V5h = Vh*Ls;
144 
145  Vs_t = Z[0]*Z[1]*Z[2]*Ls;//?
146  Vsh_t = Vs_t/2; //?
147 }
148 
149 
150 void setSpinorSiteSize(int n)
151 {
152  mySpinorSiteSize = n;
153 }
154 
155 
156 template <typename Float>
157 static void printVector(Float *v) {
158  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
159 }
160 
161 // X indexes the lattice site
162 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
163  if (precision == QUDA_DOUBLE_PRECISION)
164  for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
165  else
166  for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
167 }
168 
169 // X indexes the full lattice
170 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
171  if (getOddBit(X) == 0) {
172  if (precision == QUDA_DOUBLE_PRECISION)
173  for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
174  else
175  for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
176 
177  } else {
178  if (precision == QUDA_DOUBLE_PRECISION)
179  for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
180  else
181  for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
182  }
183 }
184 
185 // returns 0 or 1 if the full lattice index Y is even or odd
186 int getOddBit(int Y) {
187  int x4 = Y/(Z[2]*Z[1]*Z[0]);
188  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
189  int x2 = (Y/Z[0]) % Z[1];
190  int x1 = Y % Z[0];
191  return (x4+x3+x2+x1) % 2;
192 }
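// Worked example (illustrative): with Z = {8,8,8,8}, the site (x1,x2,x3,x4) = (1,2,3,4)
// has full index Y = 4*512 + 3*64 + 2*8 + 1 = 2257, and getOddBit(2257) returns
// (1+2+3+4) % 2 = 0, i.e. the site is even.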
193 
194 // a+=b
195 template <typename Float>
196 inline void complexAddTo(Float *a, Float *b) {
197  a[0] += b[0];
198  a[1] += b[1];
199 }
200 
201 // a = b*c
202 template <typename Float>
203 inline void complexProduct(Float *a, Float *b, Float *c) {
204  a[0] = b[0]*c[0] - b[1]*c[1];
205  a[1] = b[0]*c[1] + b[1]*c[0];
206 }
207 
208 // a = conj(b)*conj(c)
209 template <typename Float>
210 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
211  a[0] = b[0]*c[0] - b[1]*c[1];
212  a[1] = -b[0]*c[1] - b[1]*c[0];
213 }
214 
215 // a = conj(b)*c
216 template <typename Float>
217 inline void complexDotProduct(Float *a, Float *b, Float *c) {
218  a[0] = b[0]*c[0] + b[1]*c[1];
219  a[1] = b[0]*c[1] - b[1]*c[0];
220 }
221 
222 // a += sign*b*c
223 template <typename Float>
224 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
225  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
226  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
227 }
228 
229 // a += conj(b)*c
230 template <typename Float>
231 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
232  a[0] += b[0]*c[0] + b[1]*c[1];
233  a[1] += b[0]*c[1] - b[1]*c[0];
234 }
235 
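// a += sign * conj(b)*conj(c) = sign * conj(b*c)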
236 template <typename Float>
237 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
238  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
239  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
240 }
241 
242 template <typename Float>
243 inline void su3Construct12(Float *mat) {
244  Float *w = mat+12;
245  w[0] = 0.0;
246  w[1] = 0.0;
247  w[2] = 0.0;
248  w[3] = 0.0;
249  w[4] = 0.0;
250  w[5] = 0.0;
251 }
252 
253 // Stabilized Bunk and Sommer
254 template <typename Float>
255 inline void su3Construct8(Float *mat) {
256  mat[0] = atan2(mat[1], mat[0]);
257  mat[1] = atan2(mat[13], mat[12]);
258  for (int i=8; i<18; i++) mat[i] = 0.0;
259 }
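// Packing summary (illustrative): after su3Construct8 the eight stored reals are the
// phases of U00 and U20 (kept in mat[0] and mat[1]) plus the complex elements U01, U02
// and U10 (mat[2..7]); the rest is zeroed. su3Reconstruct8 below recovers |U00| from
// unitarity of the first row, |U20| from unitarity of the first column, and the
// remaining 2x2 block from an SU(2) rotation.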
260 
261 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
262  if (reconstruct == QUDA_RECONSTRUCT_12) {
263  if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
264  else su3Construct12((float*)mat);
265  } else {
266  if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
267  else su3Construct8((float*)mat);
268  }
269 }
270 
271 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
272 // as the cross product of the conjugate vectors: w = u* x v*
273 //
274 // 48 flops
275 template <typename Float>
276 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
277  Float *u = &mat[0*(3*2)];
278  Float *v = &mat[1*(3*2)];
279  Float *w = &mat[2*(3*2)];
280  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
281  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
282  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
283  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
284  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
285  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
286  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
287  Float u0 = (dir < 3 ? param->anisotropy :
288  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
289  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
290 }
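// In components (illustrative): the six accumulateConjugateProduct calls above implement
//   w_i = conj(u_j*v_k - u_k*v_j)  for cyclic (i,j,k) = (0,1,2), (1,2,0), (2,0,1),
// i.e. w = (u x v)*, the unique third row that completes u and v to an SU(3) matrix.
// The final multiplication by u0 (the anisotropy, or the temporal boundary factor on
// the last time slice) keeps the reconstructed row consistent with the rescaling that
// applyGaugeFieldScaling() applied to the two stored rows.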
291 
292 template <typename Float>
293 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
294  // First reconstruct first row
295  Float row_sum = 0.0;
296  row_sum += mat[2]*mat[2];
297  row_sum += mat[3]*mat[3];
298  row_sum += mat[4]*mat[4];
299  row_sum += mat[5]*mat[5];
300  Float u0 = (dir < 3 ? param->anisotropy :
301  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
302  Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
303 
304  mat[14] = mat[0];
305  mat[15] = mat[1];
306 
307  mat[0] = U00_mag * cos(mat[14]);
308  mat[1] = U00_mag * sin(mat[14]);
309 
310  Float column_sum = 0.0;
311  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
312  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
313  Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
314 
315  mat[12] = U20_mag * cos(mat[15]);
316  mat[13] = U20_mag * sin(mat[15]);
317 
318  // First column now restored
319 
320  // finally reconstruct last elements from SU(2) rotation
321  Float r_inv2 = 1.0/(u0*row_sum);
322 
323  // U11
324  Float A[2];
325  complexDotProduct(A, mat+0, mat+6);
326  complexConjugateProduct(mat+8, mat+12, mat+4);
327  accumulateComplexProduct(mat+8, A, mat+2, u0);
328  mat[8] *= -r_inv2;
329  mat[9] *= -r_inv2;
330 
331  // U12
332  complexConjugateProduct(mat+10, mat+12, mat+2);
333  accumulateComplexProduct(mat+10, A, mat+4, -u0);
334  mat[10] *= r_inv2;
335  mat[11] *= r_inv2;
336 
337  // U21
338  complexDotProduct(A, mat+0, mat+12);
339  complexConjugateProduct(mat+14, mat+6, mat+4);
340  accumulateComplexProduct(mat+14, A, mat+2, -u0);
341  mat[14] *= r_inv2;
342  mat[15] *= r_inv2;
343 
344  // U12
345  complexConjugateProduct(mat+16, mat+6, mat+2);
346  accumulateComplexProduct(mat+16, A, mat+4, u0);
347  mat[16] *= -r_inv2;
348  mat[17] *= -r_inv2;
349 }
350 
351 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
352  if (reconstruct == QUDA_RECONSTRUCT_12) {
353  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
354  else su3Reconstruct12((float*)mat, dir, ga_idx, param);
355  } else {
356  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
357  else su3Reconstruct8((float*)mat, dir, ga_idx, param);
358  }
359 }
360 
361 /*
362  void su3_construct_8_half(float *mat, short *mat_half) {
363  su3Construct8(mat);
364 
365  mat_half[0] = floatToShort(mat[0] / M_PI);
366  mat_half[1] = floatToShort(mat[1] / M_PI);
367  for (int i=2; i<18; i++) {
368  mat_half[i] = floatToShort(mat[i]);
369  }
370  }
371 
372  void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
373 
374  for (int i=0; i<18; i++) {
375  mat[i] = shortToFloat(mat_half[i]);
376  }
377  mat[0] *= M_PI;
378  mat[1] *= M_PI;
379 
380  su3Reconstruct8(mat, dir, ga_idx, param);
381  }*/
382 
383 template <typename Float>
384 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
385  for (int i = 0; i < len; i++) {
386  double diff = fabs(a[i] - b[i]);
387  if (diff > epsilon) {
388  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
389  return 0;
390  }
391  }
392  return 1;
393 }
394 
395 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
396  if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
397  else return compareFloats((float*)a, (float*)b, len, epsilon);
398 }
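// Illustrative usage sketch (hostRef/gpuOut and the tolerances are hypothetical
// stand-ins): compare a CPU reference against data copied back from the device.
#if 0
  double tol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-12 : 1e-5;
  if (!compare_floats(hostRef, gpuOut, Vh*mySpinorSiteSize, tol, prec))
    printfQuda("Result mismatch\n");
#endif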
399 
400 int fullLatticeIndex(int dim[4], int index, int oddBit){
401 
402  int za = index/(dim[0]>>1);
403  int zb = za/dim[1];
404  int x2 = za - zb*dim[1];
405  int x4 = zb/dim[2];
406  int x3 = zb - x4*dim[2];
407 
408  return 2*index + ((x2 + x3 + x4 + oddBit) & 1);
409 }
410 
411 // given a "half index" i into either an even or odd half lattice (corresponding
412 // to oddBit = {0, 1}), returns the corresponding full lattice index.
413 int fullLatticeIndex(int i, int oddBit) {
414  /*
415  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
416  return 2*i + (boundaryCrossings + oddBit) % 2;
417  */
418 
419  int X1 = Z[0];
420  int X2 = Z[1];
421  int X3 = Z[2];
422  //int X4 = Z[3];
423  int X1h =X1/2;
424 
425  int sid =i;
426  int za = sid/X1h;
427  //int x1h = sid - za*X1h;
428  int zb = za/X2;
429  int x2 = za - zb*X2;
430  int x4 = zb/X3;
431  int x3 = zb - x4*X3;
432  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
433  //int x1 = 2*x1h + x1odd;
434  int X = 2*sid + x1odd;
435 
436  return X;
437 }
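// Worked example (illustrative): with Z = {8,8,8,8} and oddBit = 0, half index i = 5
// gives za = 1, x2 = 1, x3 = x4 = 0, so x1odd = 1 and X = 2*5 + 1 = 11; site 11 has
// coordinates (3,1,0,0), whose parity (3+1) % 2 = 0 indeed matches oddBit.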
438 
439 
440 // i represents a "half index" into an even or odd "half lattice".
441 // when oddBit={0,1} the half lattice is {even,odd}.
442 //
443 // the displacements, such as dx, refer to the full lattice coordinates.
444 //
445 // neighborIndex() takes a "half index", displaces it, and returns the
446 // new "half index", which can be an index into either the even or odd lattices.
447 // displacements of magnitude one always interchange odd and even lattices.
448 //
449 
450 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
451  int Y = fullLatticeIndex(i, oddBit);
452  int x4 = Y/(Z[2]*Z[1]*Z[0]);
453  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
454  int x2 = (Y/Z[0]) % Z[1];
455  int x1 = Y % Z[0];
456 
457  // assert (oddBit == (x+y+z+t)%2);
458 
459  x4 = (x4+dx4+Z[3]) % Z[3];
460  x3 = (x3+dx3+Z[2]) % Z[2];
461  x2 = (x2+dx2+Z[1]) % Z[1];
462  x1 = (x1+dx1+Z[0]) % Z[0];
463 
464  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
465 }
466 
467 
468 int neighborIndex(int dim[4], int index, int oddBit, int dx[4]){
469 
470  const int fullIndex = fullLatticeIndex(dim, index, oddBit);
471 
472  int x[4];
473  x[3] = fullIndex/(dim[2]*dim[1]*dim[0]);
474  x[2] = (fullIndex/(dim[1]*dim[0])) % dim[2];
475  x[1] = (fullIndex/dim[0]) % dim[1];
476  x[0] = fullIndex % dim[0];
477 
478  for(int dir=0; dir<4; ++dir)
479  x[dir] = (x[dir]+dx[dir]+dim[dir]) % dim[dir];
480 
481  return (((x[3]*dim[2] + x[2])*dim[1] + x[1])*dim[0] + x[0])/2;
482 }
483 
484 int
485 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
486 {
487  int ret;
488 
489  int Y = fullLatticeIndex(i, oddBit);
490  int x4 = Y/(Z[2]*Z[1]*Z[0]);
491  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
492  int x2 = (Y/Z[0]) % Z[1];
493  int x1 = Y % Z[0];
494 
495  int ghost_x4 = x4+ dx4;
496 
497  // assert (oddBit == (x+y+z+t)%2);
498 
499  x4 = (x4+dx4+Z[3]) % Z[3];
500  x3 = (x3+dx3+Z[2]) % Z[2];
501  x2 = (x2+dx2+Z[1]) % Z[1];
502  x1 = (x1+dx1+Z[0]) % Z[0];
503 
504  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
505  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
506  }else{
507  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
508  }
509 
510 
511  return ret;
512 }
513 
514 
515 /*
516  * This is a computation of neighbor using the full index and the displacement in each direction
517  *
518  */
519 
520 int
521 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
522 {
523  int oddBit = 0;
524  int half_idx = i;
525  if (i >= Vh){
526  oddBit =1;
527  half_idx = i - Vh;
528  }
529 
530  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
531  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
532  if (oddBitChanged){
533  oddBit = 1 - oddBit;
534  }
535  int ret = nbr_half_idx;
536  if (oddBit){
537  ret = Vh + nbr_half_idx;
538  }
539 
540  return ret;
541 }
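// Illustrative sketch: hopping one site in +x from the even-parity origin lands on the
// odd sublattice, so the result is offset into the odd half of the parity-blocked array.
#if 0
  int nbr = neighborIndexFullLattice(0, 0, 0, 0, +1);
  // nbr == Vh, i.e. site (1,0,0,0) is the first site of the odd half lattice
#endif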
542 
543 int
544 neighborIndexFullLattice(int dim[4], int index, int dx[4])
545 {
546  const int volume = dim[0]*dim[1]*dim[2]*dim[3];
547  const int halfVolume = volume/2;
548  int oddBit = 0;
549  int halfIndex = index;
550 
551  if(index >= halfVolume){
552  oddBit = 1;
553  halfIndex = index - halfVolume;
554  }
555 
556  int neighborHalfIndex = neighborIndex(dim, halfIndex, oddBit, dx);
557 
558  int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
559  if(oddBitChanged){
560  oddBit = 1 - oddBit;
561  }
562 
563  return neighborHalfIndex + oddBit*halfVolume;
564 }
565 
566 
567 
568 int
569 neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
570 {
571  int ret;
572  int oddBit = 0;
573  int half_idx = i;
574  if (i >= Vh){
575  oddBit =1;
576  half_idx = i - Vh;
577  }
578 
579  int Y = fullLatticeIndex(half_idx, oddBit);
580  int x4 = Y/(Z[2]*Z[1]*Z[0]);
581  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
582  int x2 = (Y/Z[0]) % Z[1];
583  int x1 = Y % Z[0];
584  int ghost_x4 = x4+ dx4;
585 
586  x4 = (x4+dx4+Z[3]) % Z[3];
587  x3 = (x3+dx3+Z[2]) % Z[2];
588  x2 = (x2+dx2+Z[1]) % Z[1];
589  x1 = (x1+dx1+Z[0]) % Z[0];
590 
591  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
592  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
593  }else{
594  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
595  return ret;
596  }
597 
598  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
599  if (oddBitChanged){
600  oddBit = 1 - oddBit;
601  }
602 
603  if (oddBit){
604  ret += Vh;
605  }
606 
607  return ret;
608 }
609 
610 
611 // 4d checkerboard.
612 // given a "half index" i into either an even or odd half lattice (corresponding
613 // to oddBit = {0, 1}), returns the corresponding full lattice index.
614 // Cf. GPGPU code in dslash_core_ante.h.
615 // There, i is the thread index.
616 int fullLatticeIndex_4d(int i, int oddBit) {
617  if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d\n"); exit(-1);}
618  /*
619  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
620  return 2*i + (boundaryCrossings + oddBit) % 2;
621  */
622 
623  int X1 = Z[0];
624  int X2 = Z[1];
625  int X3 = Z[2];
626  //int X4 = Z[3];
627  int X1h =X1/2;
628 
629  int sid =i;
630  int za = sid/X1h;
631  //int x1h = sid - za*X1h;
632  int zb = za/X2;
633  int x2 = za - zb*X2;
634  int x4 = zb/X3;
635  int x3 = zb - x4*X3;
636  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
637  //int x1 = 2*x1h + x1odd;
638  int X = 2*sid + x1odd;
639 
640  return X;
641 }
642 
643 // 5d checkerboard.
644 // given a "half index" i into either an even or odd half lattice (corresponding
645 // to oddBit = {0, 1}), returns the corresponding full lattice index.
646 // Cf. GPGPU code in dslash_core_ante.h.
647 // There, i is the thread index sid.
648 // This function is used by neighborIndex_5d in dslash_reference.cpp.
649 //ok
650 int fullLatticeIndex_5d(int i, int oddBit) {
651  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
652  return 2*i + (boundaryCrossings + oddBit) % 2;
653 }
654 
655 int fullLatticeIndex_5d_4dpc(int i, int oddBit) {
656  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
657  return 2*i + (boundaryCrossings + oddBit) % 2;
658 }
659 
660 int
661 x4_from_full_index(int i)
662 {
663  int oddBit = 0;
664  int half_idx = i;
665  if (i >= Vh){
666  oddBit =1;
667  half_idx = i - Vh;
668  }
669 
670  int Y = fullLatticeIndex(half_idx, oddBit);
671  int x4 = Y/(Z[2]*Z[1]*Z[0]);
672 
673  return x4;
674 }
675 
676 template <typename Float>
677 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
678  // Apply spatial scaling factor (u0) to spatial links
679  for (int d = 0; d < 3; d++) {
680  for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
681  gauge[d][i] /= param->anisotropy;
682  }
683  }
684 
685  // only apply T-boundary at edge nodes
686 #ifdef MULTI_GPU
687  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
688 #else
689  bool last_node_in_t = true;
690 #endif
691 
692  // Apply boundary conditions to temporal links
693  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t) {
694  for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
695  for (int i = 0; i < gaugeSiteSize; i++) {
696  gauge[3][j*gaugeSiteSize+i] *= -1.0;
697  gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
698  }
699  }
700  }
701 
702  if (param->gauge_fix) {
703  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
704  // to simulate fixing to the temporal gauge.
705  int iMax = ( last_node_in_t ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
706  int dir = 3; // time direction only
707  Float *even = gauge[dir];
708  Float *odd = gauge[dir]+Vh*gaugeSiteSize;
709  for (int i = 0; i< iMax; i++) {
710  for (int m = 0; m < 3; m++) {
711  for (int n = 0; n < 3; n++) {
712  even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
713  even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
714  odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
715  odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
716  }
717  }
718  }
719  }
720 }
721 
722 template <typename Float>
723 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
724 {
725 
726  int X1h=param->X[0]/2;
727  int X1 =param->X[0];
728  int X2 =param->X[1];
729  int X3 =param->X[2];
730  int X4 =param->X[3];
731 
732  // rescale long links by the appropriate coefficient
733  for(int d=0; d<4; d++){
734  for(int i=0; i < V*gaugeSiteSize; i++){
735  gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
736  }
737  }
738 
739  // apply the staggered phases
740  for (int d = 0; d < 3; d++) {
741 
742  //even
743  for (int i = 0; i < Vh; i++) {
744 
745  int index = fullLatticeIndex(i, 0);
746  int i4 = index /(X3*X2*X1);
747  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
748  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
749  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
750  int sign=1;
751 
752  if (d == 0) {
753  if (i4 % 2 == 1){
754  sign= -1;
755  }
756  }
757 
758  if (d == 1){
759  if ((i4+i1) % 2 == 1){
760  sign= -1;
761  }
762  }
763  if (d == 2){
764  if ( (i4+i1+i2) % 2 == 1){
765  sign= -1;
766  }
767  }
768 
769  for (int j=0;j < 6; j++){
770  gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
771  }
772  }
773  //odd
774  for (int i = 0; i < Vh; i++) {
775  int index = fullLatticeIndex(i, 1);
776  int i4 = index /(X3*X2*X1);
777  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
778  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
779  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
780  int sign=1;
781 
782  if (d == 0) {
783  if (i4 % 2 == 1){
784  sign= -1;
785  }
786  }
787 
788  if (d == 1){
789  if ((i4+i1) % 2 == 1){
790  sign= -1;
791  }
792  }
793  if (d == 2){
794  if ( (i4+i1+i2) % 2 == 1){
795  sign = -1;
796  }
797  }
798 
799  for (int j=0;j < 6; j++){
800  gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
801  }
802  }
803 
804  }
805 
806  // Apply boundary conditions to temporal links
807  if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
808  for (int j = 0; j < Vh; j++) {
809  int sign =1;
810  if (j >= (X4-3)*X1h*X2*X3 ){
811  sign= -1;
812  }
813 
814  for (int i = 0; i < 6; i++) {
815  gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
816  gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
817  }
818  }
819  }
820 }
821 
822 
823 
824 template <typename Float>
825 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
826  Float *resOdd[4], *resEven[4];
827  for (int dir = 0; dir < 4; dir++) {
828  resEven[dir] = res[dir];
829  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
830  }
831 
832  for (int dir = 0; dir < 4; dir++) {
833  for (int i = 0; i < Vh; i++) {
834  for (int m = 0; m < 3; m++) {
835  for (int n = 0; n < 3; n++) {
836  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
837  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
838  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
839  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
840  }
841  }
842  }
843  }
844 
845  applyGaugeFieldScaling(res, Vh, param);
846 }
847 
848 // normalize the vector a
849 template <typename Float>
850 static void normalize(complex<Float> *a, int len) {
851  double sum = 0.0;
852  for (int i=0; i<len; i++) sum += norm(a[i]);
853  for (int i=0; i<len; i++) a[i] /= sqrt(sum);
854 }
855 
856 // orthogonalize vector b to vector a
857 template <typename Float>
858 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
859  complex<double> dot = 0.0;
860  for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
861  for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
862 }
863 
864 template <typename Float>
865 static void constructGaugeField(Float **res, QudaGaugeParam *param) {
866  Float *resOdd[4], *resEven[4];
867  for (int dir = 0; dir < 4; dir++) {
868  resEven[dir] = res[dir];
869  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
870  }
871 
872  for (int dir = 0; dir < 4; dir++) {
873  for (int i = 0; i < Vh; i++) {
874  for (int m = 1; m < 3; m++) { // last 2 rows
875  for (int n = 0; n < 3; n++) { // 3 columns
876  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
877  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
878  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
879  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
880  }
881  }
882  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
883  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
884  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
885 
886  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
887  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
888  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
889 
890  {
891  Float *w = resEven[dir]+(i*3+0)*3*2;
892  Float *u = resEven[dir]+(i*3+1)*3*2;
893  Float *v = resEven[dir]+(i*3+2)*3*2;
894 
895  for (int n = 0; n < 6; n++) w[n] = 0.0;
896  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
897  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
898  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
899  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
900  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
901  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
902  }
903 
904  {
905  Float *w = resOdd[dir]+(i*3+0)*3*2;
906  Float *u = resOdd[dir]+(i*3+1)*3*2;
907  Float *v = resOdd[dir]+(i*3+2)*3*2;
908 
909  for (int n = 0; n < 6; n++) w[n] = 0.0;
910  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
911  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
912  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
913  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
914  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
915  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
916  }
917 
918  }
919  }
920 
921  if (param->type == QUDA_WILSON_LINKS){
922  applyGaugeFieldScaling(res, Vh, param);
923  } else if (param->type == QUDA_ASQTAD_LONG_LINKS){
924  applyGaugeFieldScaling_long(res, Vh, param);
925  } else if (param->type == QUDA_ASQTAD_FAT_LINKS){
926  for (int dir = 0; dir < 4; dir++){
927  for (int i = 0; i < Vh; i++) {
928  for (int m = 0; m < 3; m++) { // all 3 rows
929  for (int n = 0; n < 3; n++) { // 3 columns
930  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
931  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (Float)RAND_MAX;
932  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
933  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
934  }
935  }
936  }
937  }
938 
939  }
940 
941 }
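// Construction summary for the SU(3) loop above (illustrative): the last two rows of
// each link are filled with uniform random numbers, the second row is normalized, the
// third row is Gram-Schmidt orthogonalized against it and normalized, and the first row
// is set to the conjugate cross product of the other two, giving a unitary matrix with
// unit determinant.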
942 
943 template <typename Float>
944 void constructUnitaryGaugeField(Float **res)
945 {
946  Float *resOdd[4], *resEven[4];
947  for (int dir = 0; dir < 4; dir++) {
948  resEven[dir] = res[dir];
949  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
950  }
951 
952  for (int dir = 0; dir < 4; dir++) {
953  for (int i = 0; i < Vh; i++) {
954  for (int m = 1; m < 3; m++) { // last 2 rows
955  for (int n = 0; n < 3; n++) { // 3 columns
956  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
957  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
958  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
959  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
960  }
961  }
962  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
963  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
964  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
965 
966  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
967  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
968  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
969 
970  {
971  Float *w = resEven[dir]+(i*3+0)*3*2;
972  Float *u = resEven[dir]+(i*3+1)*3*2;
973  Float *v = resEven[dir]+(i*3+2)*3*2;
974 
975  for (int n = 0; n < 6; n++) w[n] = 0.0;
976  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
977  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
978  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
979  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
980  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
981  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
982  }
983 
984  {
985  Float *w = resOdd[dir]+(i*3+0)*3*2;
986  Float *u = resOdd[dir]+(i*3+1)*3*2;
987  Float *v = resOdd[dir]+(i*3+2)*3*2;
988 
989  for (int n = 0; n < 6; n++) w[n] = 0.0;
990  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
991  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
992  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
993  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
994  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
995  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
996  }
997 
998  }
999  }
1000 }
1001 
1002 
1003 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
1004  if (type == 0) {
1005  if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
1006  else constructUnitGaugeField((float**)gauge, param);
1007  } else if (type == 1) {
1008  if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
1009  else constructGaugeField((float**)gauge, param);
1010  } else {
1011  if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
1012  else applyGaugeFieldScaling((float**)gauge, Vh, param);
1013  }
1014 
1015 }
1016 
1017 void
1018 construct_fat_long_gauge_field(void **fatlink, void **longlink, int type,
1019  QudaPrecision precision, QudaGaugeParam* param,
1020  QudaDslashType dslash_type)
1021 {
1022  if (type == 0) {
1023  if (precision == QUDA_DOUBLE_PRECISION) {
1024  constructUnitGaugeField((double**)fatlink, param);
1025  constructUnitGaugeField((double**)longlink, param);
1026  }else {
1027  constructUnitGaugeField((float**)fatlink, param);
1028  constructUnitGaugeField((float**)longlink, param);
1029  }
1030  } else {
1031  if (precision == QUDA_DOUBLE_PRECISION) {
1032  param->type = QUDA_ASQTAD_FAT_LINKS;
1033  constructGaugeField((double**)fatlink, param);
1034  param->type = QUDA_ASQTAD_LONG_LINKS;
1035  constructGaugeField((double**)longlink, param);
1036  }else {
1037  param->type = QUDA_ASQTAD_FAT_LINKS;
1038  constructGaugeField((float**)fatlink, param);
1039  param->type = QUDA_ASQTAD_LONG_LINKS;
1040  constructGaugeField((float**)longlink, param);
1041  }
1042  }
1043 
1044  if(param->reconstruct == QUDA_RECONSTRUCT_9 ||
1045  param->reconstruct == QUDA_RECONSTRUCT_13){ // incorporate non-trivial phase into long links
1046  const double cos_pi_3 = 0.5; // Cos(pi/3)
1047  const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)
1048  for(int dir=0; dir<4; ++dir){
1049  for(int i=0; i<V; ++i){
1050  for(int j=0; j<gaugeSiteSize; j+=2){
1051  if(precision == QUDA_DOUBLE_PRECISION){
1052  const double real = ((double*)longlink[dir])[i*gaugeSiteSize + j];
1053  const double imag = ((double*)longlink[dir])[i*gaugeSiteSize + j + 1];
1054  ((double*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1055  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1056  }else{
1057  const float real = ((float*)longlink[dir])[i*gaugeSiteSize + j];
1058  const float imag = ((float*)longlink[dir])[i*gaugeSiteSize + j + 1];
1059  ((float*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1060  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1061  }
1062  }
1063  }
1064  }
1065  }
1066 
1067  if (dslash_type == QUDA_STAGGERED_DSLASH) { // set all links to zero to emulate the 1-link operator
1068  for(int dir=0; dir<4; ++dir){
1069  for(int i=0; i<V; ++i){
1070  for(int j=0; j<gaugeSiteSize; j+=2){
1071  if(precision == QUDA_DOUBLE_PRECISION){
1072  ((double*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1073  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1074  }else{
1075  ((float*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1076  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1077  }
1078  }
1079  }
1080  }
1081  }
1082 
1083 
1084 }
1085 
1086 
1087 template <typename Float>
1088 static void constructCloverField(Float *res, double norm, double diag) {
1089 
1090  Float c = 2.0 * norm / RAND_MAX;
1091 
1092  for(int i = 0; i < V; i++) {
1093  for (int j = 0; j < 72; j++) {
1094  res[i*72 + j] = c*rand() - norm;
1095  }
1096  for (int j = 0; j< 6; j++) {
1097  res[i*72 + j] += diag;
1098  res[i*72 + j+36] += diag;
1099  }
1100  }
1101 }
1102 
1103 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
1104 
1105  if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
1106  else constructCloverField((float *)clover, norm, diag);
1107 }
1108 
1109 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
1110  printf("Reference:\n");
1111  printSpinorElement(spinorRef, 0, prec); printf("...\n");
1112  printSpinorElement(spinorRef, len-1, prec); printf("\n");
1113 
1114  printf("\nCUDA:\n");
1115  printSpinorElement(spinorGPU, 0, prec); printf("...\n");
1116  printSpinorElement(spinorGPU, len-1, prec); printf("\n");
1117 
1118  compare_spinor(spinorRef, spinorGPU, len, prec);
1119  }*/
1120 
1121 template <typename Float>
1122 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
1123 
1124  const int fail_check = 17;
1125  int fail[4][fail_check];
1126  int iter[4][18];
1127  for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
1128  for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
1129 
1130  for (int d=0; d<4; d++) {
1131  for (int eo=0; eo<2; eo++) {
1132  for (int i=0; i<Vh; i++) {
1133  int ga_idx = (eo*Vh+i);
1134  for (int j=0; j<18; j++) {
1135  double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1136 
1137  for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1138  if (diff > epsilon) iter[d][j]++;
1139  }
1140  }
1141  }
1142  }
1143 
1144  printf("Component fails (X, Y, Z, T)\n");
1145  for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1146 
1147  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1148  for (int f=0; f<fail_check; f++) {
1149  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n", pow(10.0,-(f+1)),
1150  fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1151  fail[0][f]/(double)(V*18), fail[1][f]/(double)(V*18), fail[2][f]/(double)(V*18), fail[3][f]/(double)(V*18));
1152  }
1153 
1154 }
1155 
1156 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
1157  if (precision == QUDA_DOUBLE_PRECISION)
1158  checkGauge((double**)oldG, (double**)newG, epsilon);
1159  else
1160  checkGauge((float**)oldG, (float**)newG, epsilon);
1161 }
1162 
1163 
1164 
1165 void
1166 createSiteLinkCPU(void** link, QudaPrecision precision, int phase)
1167 {
1168 
1169  if (precision == QUDA_DOUBLE_PRECISION) {
1170  constructUnitaryGaugeField((double**)link);
1171  }else {
1172  constructUnitaryGaugeField((float**)link);
1173  }
1174 
1175  // only apply temporal boundary condition if I'm the last node in T
1176 #ifdef MULTI_GPU
1177  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
1178 #else
1179  bool last_node_in_t = true;
1180 #endif
1181 
1182  if(phase){
1183 
1184  for(int i=0;i < V;i++){
1185  for(int dir =XUP; dir <= TUP; dir++){
1186  int idx = i;
1187  int oddBit =0;
1188  if (i >= Vh) {
1189  idx = i - Vh;
1190  oddBit = 1;
1191  }
1192 
1193  int X1 = Z[0];
1194  int X2 = Z[1];
1195  int X3 = Z[2];
1196  int X4 = Z[3];
1197 
1198  int full_idx = fullLatticeIndex(idx, oddBit);
1199  int i4 = full_idx /(X3*X2*X1);
1200  int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
1201  int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1202  int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
1203 
1204  double coeff= 1.0;
1205  switch(dir){
1206  case XUP:
1207  if ( (i4 & 1) != 0){
1208  coeff *= -1;
1209  }
1210  break;
1211 
1212  case YUP:
1213  if ( ((i4+i1) & 1) != 0){
1214  coeff *= -1;
1215  }
1216  break;
1217 
1218  case ZUP:
1219  if ( ((i4+i1+i2) & 1) != 0){
1220  coeff *= -1;
1221  }
1222  break;
1223 
1224  case TUP:
1225  if (last_node_in_t && i4 == (X4-1)){
1226  coeff *= -1;
1227  }
1228  break;
1229 
1230  default:
1231  printf("ERROR: wrong dir(%d)\n", dir);
1232  exit(1);
1233  }
1234 
1235  if (precision == QUDA_DOUBLE_PRECISION){
1236  //double* mylink = (double*)link;
1237  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1238  double* mylink = (double*)link[dir];
1239  mylink = mylink + i*gaugeSiteSize;
1240 
1241  mylink[12] *= coeff;
1242  mylink[13] *= coeff;
1243  mylink[14] *= coeff;
1244  mylink[15] *= coeff;
1245  mylink[16] *= coeff;
1246  mylink[17] *= coeff;
1247 
1248  }else{
1249  //float* mylink = (float*)link;
1250  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1251  float* mylink = (float*)link[dir];
1252  mylink = mylink + i*gaugeSiteSize;
1253 
1254  mylink[12] *= coeff;
1255  mylink[13] *= coeff;
1256  mylink[14] *= coeff;
1257  mylink[15] *= coeff;
1258  mylink[16] *= coeff;
1259  mylink[17] *= coeff;
1260 
1261  }
1262  }
1263  }
1264  }
1265 
1266 
1267 #if 1
1268  for(int dir= 0;dir < 4;dir++){
1269  for(int i=0;i< V*gaugeSiteSize;i++){
1270  if (precision ==QUDA_SINGLE_PRECISION){
1271  float* f = (float*)link[dir];
1272  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1273  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1274  exit(1);
1275  }
1276  }else{
1277  double* f = (double*)link[dir];
1278  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1279  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1280  exit(1);
1281  }
1282 
1283  }
1284 
1285  }
1286  }
1287 #endif
1288 
1289  return;
1290 }
1291 
1292 
1293 
1294 
1295 template <typename Float>
1296 int compareLink(Float **linkA, Float **linkB, int len) {
1297  const int fail_check = 16;
1298  int fail[fail_check];
1299  for (int f=0; f<fail_check; f++) fail[f] = 0;
1300 
1301  int iter[18];
1302  for (int i=0; i<18; i++) iter[i] = 0;
1303 
1304  for(int dir=0;dir < 4; dir++){
1305  for (int i=0; i<len; i++) {
1306  for (int j=0; j<18; j++) {
1307  int is = i*18+j;
1308  double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1309  for (int f=0; f<fail_check; f++)
1310  if (diff > pow(10.0,-(f+1))) fail[f]++;
1311  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1312  if (diff > 1e-3) iter[j]++;
1313  }
1314  }
1315  }
1316 
1317  for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1318 
1319  int accuracy_level = 0;
1320  for(int f =0; f < fail_check; f++){
1321  if(fail[f] == 0){
1322  accuracy_level =f;
1323  }
1324  }
1325 
1326  for (int f=0; f<fail_check; f++) {
1327  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (double)(4*len*18));
1328  }
1329 
1330  return accuracy_level;
1331 }
1332 
1333 static int
1334 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1335 {
1336  int ret;
1337 
1338  if (precision == QUDA_DOUBLE_PRECISION){
1339  ret = compareLink((double**)linkA, (double**)linkB, len);
1340  }else {
1341  ret = compareLink((float**)linkA, (float**)linkB, len);
1342  }
1343 
1344  return ret;
1345 }
1346 
1347 
1348 // X indexes the lattice site
1349 static void
1350 printLinkElement(void *link, int X, QudaPrecision precision)
1351 {
1352  if (precision == QUDA_DOUBLE_PRECISION){
1353  for(int i=0; i < 3;i++){
1354  printVector((double*)link+ X*gaugeSiteSize + i*6);
1355  }
1356 
1357  }
1358  else{
1359  for(int i=0;i < 3;i++){
1360  printVector((float*)link+X*gaugeSiteSize + i*6);
1361  }
1362  }
1363 }
1364 
1365 int strong_check_link(void** linkA, const char* msgA,
1366  void **linkB, const char* msgB,
1367  int len, QudaPrecision prec)
1368 {
1369  printfQuda("%s\n", msgA);
1370  printLinkElement(linkA[0], 0, prec);
1371  printfQuda("\n");
1372  printLinkElement(linkA[0], 1, prec);
1373  printfQuda("...\n");
1374  printLinkElement(linkA[3], len-1, prec);
1375  printfQuda("\n");
1376 
1377  printfQuda("\n%s\n", msgB);
1378  printLinkElement(linkB[0], 0, prec);
1379  printfQuda("\n");
1380  printLinkElement(linkB[0], 1, prec);
1381  printfQuda("...\n");
1382  printLinkElement(linkB[3], len-1, prec);
1383  printfQuda("\n");
1384 
1385  int ret = compare_link(linkA, linkB, len, prec);
1386  return ret;
1387 }
1388 
1389 
1390 void
1391 createMomCPU(void* mom, QudaPrecision precision)
1392 {
1393  void* temp;
1394 
1395  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1396  temp = malloc(4*V*gaugeSiteSize*gSize);
1397  if (temp == NULL){
1398  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1399  exit(1);
1400  }
1401 
1402 
1403 
1404  for(int i=0;i < V;i++){
1405  if (precision == QUDA_DOUBLE_PRECISION){
1406  for(int dir=0;dir < 4;dir++){
1407  double* thismom = (double*)mom;
1408  for(int k=0; k < momSiteSize; k++){
1409  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1410  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1411  }
1412  }
1413  }else{
1414  for(int dir=0;dir < 4;dir++){
1415  float* thismom=(float*)mom;
1416  for(int k=0; k < momSiteSize; k++){
1417  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1418  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1419  }
1420  }
1421  }
1422  }
1423 
1424  free(temp);
1425  return;
1426 }
1427 
1428 void
1429 createHwCPU(void* hw, QudaPrecision precision)
1430 {
1431  for(int i=0;i < V;i++){
1432  if (precision == QUDA_DOUBLE_PRECISION){
1433  for(int dir=0;dir < 4;dir++){
1434  double* thishw = (double*)hw;
1435  for(int k=0; k < hwSiteSize; k++){
1436  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1437  }
1438  }
1439  }else{
1440  for(int dir=0;dir < 4;dir++){
1441  float* thishw=(float*)hw;
1442  for(int k=0; k < hwSiteSize; k++){
1443  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1444  }
1445  }
1446  }
1447  }
1448 
1449  return;
1450 }
1451 
1452 
1453 template <typename Float>
1454 int compare_mom(Float *momA, Float *momB, int len) {
1455  const int fail_check = 16;
1456  int fail[fail_check];
1457  for (int f=0; f<fail_check; f++) fail[f] = 0;
1458 
1459  int iter[momSiteSize];
1460  for (int i=0; i<momSiteSize; i++) iter[i] = 0;
1461 
1462  for (int i=0; i<len; i++) {
1463  for (int j=0; j<momSiteSize-1; j++) {
1464  int is = i*momSiteSize+j;
1465  double diff = fabs(momA[is]-momB[is]);
1466  for (int f=0; f<fail_check; f++)
1467  if (diff > pow(10.0,-(f+1))) fail[f]++;
1468  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1469  if (diff > 1e-3) iter[j]++;
1470  }
1471  }
1472 
1473  int accuracy_level = 0;
1474  for(int f =0; f < fail_check; f++){
1475  if(fail[f] == 0){
1476  accuracy_level =f+1;
1477  }
1478  }
1479 
1480  for (int i=0; i<momSiteSize; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1481 
1482  for (int f=0; f<fail_check; f++) {
1483  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*9, fail[f]/(double)(len*9));
1484  }
1485 
1486  return accuracy_level;
1487 }
1488 
1489 static void
1490 printMomElement(void *mom, int X, QudaPrecision precision)
1491 {
1492  if (precision == QUDA_DOUBLE_PRECISION){
1493  double* thismom = ((double*)mom)+ X*momSiteSize;
1494  printVector(thismom);
1495  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1496  }else{
1497  float* thismom = ((float*)mom)+ X*momSiteSize;
1498  printVector(thismom);
1499  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1500  }
1501 }
1502 int strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec)
1503 {
1504  printfQuda("mom:\n");
1505  printMomElement(momA, 0, prec);
1506  printfQuda("\n");
1507  printMomElement(momA, 1, prec);
1508  printfQuda("\n");
1509  printMomElement(momA, 2, prec);
1510  printfQuda("\n");
1511  printMomElement(momA, 3, prec);
1512  printfQuda("...\n");
1513 
1514  printfQuda("\nreference mom:\n");
1515  printMomElement(momB, 0, prec);
1516  printfQuda("\n");
1517  printMomElement(momB, 1, prec);
1518  printfQuda("\n");
1519  printMomElement(momB, 2, prec);
1520  printfQuda("\n");
1521  printMomElement(momB, 3, prec);
1522  printfQuda("\n");
1523 
1524  int ret;
1525  if (prec == QUDA_DOUBLE_PRECISION){
1526  ret = compare_mom((double*)momA, (double*)momB, len);
1527  }else{
1528  ret = compare_mom((float*)momA, (float*)momB, len);
1529  }
1530 
1531  return ret;
1532 }
1533 
1534 
1535 /************
1536  * return value
1537  *
1538  * 0: command line option matched and processed successfully
1539  * non-zero: command line option does not match
1540  *
1541  */
1542 
1543 #ifdef MULTI_GPU
1544 int device = -1;
1545 #else
1546 int device = 0;
1547 #endif
1548 
1553 int xdim = 24;
1554 int ydim = 24;
1555 int zdim = 24;
1556 int tdim = 24;
1557 int Lsdim = 16;
1559 int gridsize_from_cmdline[4] = {1,1,1,1};
1561 char latfile[256] = "";
1562 bool tune = true;
1563 int niter = 10;
1564 int test_type = 0;
1567 int multishift = 0;
1568 bool verify_results = true;
1569 double mass = 0.1;
1571 bool kernel_pack_t = false;
1574 
1575 static int dim_partitioned[4] = {0,0,0,0};
1576 
1577 int dimPartitioned(int dim)
1578 {
1579  return ((gridsize_from_cmdline[dim] > 1) || dim_partitioned[dim]);
1580 }
1581 
1582 void __attribute__((weak)) usage_extra(char** argv){};
1583 
1584 void usage(char** argv )
1585 {
1586  printf("Usage: %s [options]\n", argv[0]);
1587  printf("Common options: \n");
1588 #ifndef MULTI_GPU
1589  printf(" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1590 #endif
1591  printf(" --prec <double/single/half> # Precision in GPU\n");
1592  printf(" --prec_sloppy <double/single/half> # Sloppy precision in GPU\n");
1593  printf(" --recon <8/9/12/13/18> # Link reconstruction type\n");
1594  printf(" --recon_sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1595  printf(" --dagger # Set the dagger to 1 (default 0)\n");
1596  printf(" --sdim <n> # Set space dimention(X/Y/Z) size\n");
1597  printf(" --xdim <n> # Set X dimension size(default 24)\n");
1598  printf(" --ydim <n> # Set X dimension size(default 24)\n");
1599  printf(" --zdim <n> # Set X dimension size(default 24)\n");
1600  printf(" --tdim <n> # Set T dimension size(default 24)\n");
1601  printf(" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1602  printf(" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1603  printf(" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1604  printf(" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1605  printf(" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1606  printf(" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1607  printf(" --kernel-pack-t # Set T dimension kernel packing to be true (default false)\n");
1608  printf(" --dslash_type <type> # Set the dslash type, the following values are valid\n"
1609  " wilson/clover/twisted_mass/twisted_clover/staggered\n"
1610  " /asqtad/domain_wall/domain_wall_4d/mobius\n");
1611  printf(" --flavor <type> # Set the twisted mass flavor type (minus (default), plus, deg_doublet, nondeg_doublet)\n");
1612  printf(" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1613  printf(" --niter <n> # The number of iterations to perform (default 10)\n");
1614  printf(" --inv_type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1615  printf(" --precon_type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified))\n");
1616  printf(" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1617  printf(" --mass # Mass of Dirac operator (default 0.1)\n");
1618  printf(" --mass-normalization # Mass normalization (kappa (default) / mass)\n");
1619  printf(" --matpc # Matrix preconditioning type (even-even, odd_odd, even_even_asym, odd_odd_asym) \n");
1620  printf(" --tune <true/false> # Whether to autotune or not (default true)\n");
1621  printf(" --test # Test method (different for each test)\n");
1622  printf(" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1623  printf(" --help # Print out this message\n");
1624  usage_extra(argv);
1625 #ifdef MULTI_GPU
1626  char msg[]="multi";
1627 #else
1628  char msg[]="single";
1629 #endif
1630  printf("Note: this program is %s GPU build\n", msg);
1631  exit(1);
1632  return ;
1633 }
1634 
1635 int process_command_line_option(int argc, char** argv, int* idx)
1636 {
1637 #ifdef MULTI_GPU
1638  char msg[]="multi";
1639 #else
1640  char msg[]="single";
1641 #endif
1642 
1643  int ret = -1;
1644 
1645  int i = *idx;
1646 
1647  if( strcmp(argv[i], "--help")== 0){
1648  usage(argv);
1649  }
1650 
1651  if( strcmp(argv[i], "--verify") == 0){
1652  if (i+1 >= argc){
1653  usage(argv);
1654  }
1655 
1656  if (strcmp(argv[i+1], "true") == 0){
1657  verify_results = true;
1658  }else if (strcmp(argv[i+1], "false") == 0){
1659  verify_results = false;
1660  }else{
1661  fprintf(stderr, "ERROR: invalid verify type\n");
1662  exit(1);
1663  }
1664 
1665  i++;
1666  ret = 0;
1667  goto out;
1668  }
1669 
1670  if( strcmp(argv[i], "--device") == 0){
1671  if (i+1 >= argc){
1672  usage(argv);
1673  }
1674  device = atoi(argv[i+1]);
1675  if (device < 0 || device > 16){
1676  printf("ERROR: Invalid CUDA device number (%d)\n", device);
1677  usage(argv);
1678  }
1679  i++;
1680  ret = 0;
1681  goto out;
1682  }
1683 
1684  if( strcmp(argv[i], "--prec") == 0){
1685  if (i+1 >= argc){
1686  usage(argv);
1687  }
1688  prec = get_prec(argv[i+1]);
1689  i++;
1690  ret = 0;
1691  goto out;
1692  }
1693 
1694  if( strcmp(argv[i], "--prec_sloppy") == 0){
1695  if (i+1 >= argc){
1696  usage(argv);
1697  }
1698  prec_sloppy = get_prec(argv[i+1]);
1699  i++;
1700  ret = 0;
1701  goto out;
1702  }
1703 
1704  if( strcmp(argv[i], "--recon") == 0){
1705  if (i+1 >= argc){
1706  usage(argv);
1707  }
1708  link_recon = get_recon(argv[i+1]);
1709  i++;
1710  ret = 0;
1711  goto out;
1712  }
1713 
1714  if( strcmp(argv[i], "--recon_sloppy") == 0){
1715  if (i+1 >= argc){
1716  usage(argv);
1717  }
1718  link_recon_sloppy = get_recon(argv[i+1]);
1719  i++;
1720  ret = 0;
1721  goto out;
1722  }
1723 
1724  if( strcmp(argv[i], "--xdim") == 0){
1725  if (i+1 >= argc){
1726  usage(argv);
1727  }
1728  xdim= atoi(argv[i+1]);
1729  if (xdim < 0 || xdim > 512){
1730  printf("ERROR: invalid X dimension (%d)\n", xdim);
1731  usage(argv);
1732  }
1733  i++;
1734  ret = 0;
1735  goto out;
1736  }
1737 
1738  if( strcmp(argv[i], "--ydim") == 0){
1739  if (i+1 >= argc){
1740  usage(argv);
1741  }
1742  ydim= atoi(argv[i+1]);
1743  if (ydim < 0 || ydim > 512){
1744  printf("ERROR: invalid T dimension (%d)\n", ydim);
1745  usage(argv);
1746  }
1747  i++;
1748  ret = 0;
1749  goto out;
1750  }
1751 
1752 
1753  if( strcmp(argv[i], "--zdim") == 0){
1754  if (i+1 >= argc){
1755  usage(argv);
1756  }
1757  zdim= atoi(argv[i+1]);
1758  if (zdim < 0 || zdim > 512){
1759  printf("ERROR: invalid T dimension (%d)\n", zdim);
1760  usage(argv);
1761  }
1762  i++;
1763  ret = 0;
1764  goto out;
1765  }
1766 
1767  if( strcmp(argv[i], "--tdim") == 0){
1768  if (i+1 >= argc){
1769  usage(argv);
1770  }
1771  tdim = atoi(argv[i+1]);
1772  if (tdim < 0 || tdim > 512){
1773  errorQuda("Error: invalid t dimension");
1774  }
1775  i++;
1776  ret = 0;
1777  goto out;
1778  }
1779 
1780  if( strcmp(argv[i], "--sdim") == 0){
1781  if (i+1 >= argc){
1782  usage(argv);
1783  }
1784  int sdim = atoi(argv[i+1]);
1785  if (sdim < 0 || sdim > 512){
1786  printfQuda("ERROR: invalid S dimension\n");
1787  }
1788  xdim=ydim=zdim=sdim;
1789  i++;
1790  ret = 0;
1791  goto out;
1792  }
1793 
1794  if( strcmp(argv[i], "--Lsdim") == 0){
1795  if (i+1 >= argc){
1796  usage(argv);
1797  }
1798  int Ls = atoi(argv[i+1]);
1799  if (Ls < 0 || Ls > 128){
1800  printfQuda("ERROR: invalid Ls dimension\n");
1801  }
1802  Lsdim=Ls;
1803  i++;
1804  ret = 0;
1805  goto out;
1806  }
1807 
1808  if( strcmp(argv[i], "--dagger") == 0){
1809  dagger = QUDA_DAG_YES;
1810  ret = 0;
1811  goto out;
1812  }
1813 
1814  if( strcmp(argv[i], "--partition") == 0){
1815  if (i+1 >= argc){
1816  usage(argv);
1817  }
1818 #ifdef MULTI_GPU
1819  int value = atoi(argv[i+1]);
1820  for(int j=0; j < 4;j++){
1821  if (value & (1 << j)){
1822  commDimPartitionedSet(j);
1823  dim_partitioned[j] = 1;
1824  }
1825  }
1826 #else
1827  printfQuda("WARNING: Ignoring --partition option since this is a single-GPU build.\n");
1828 #endif
1829  i++;
1830  ret = 0;
1831  goto out;
1832  }
1833 
1834  if( strcmp(argv[i], "--kernel-pack-t") == 0){
1835  kernel_pack_t = true;
1836  ret= 0;
1837  goto out;
1838  }
1839 
1840 
1841  if( strcmp(argv[i], "--tune") == 0){
1842  if (i+1 >= argc){
1843  usage(argv);
1844  }
1845 
1846  if (strcmp(argv[i+1], "true") == 0){
1847  tune = true;
1848  }else if (strcmp(argv[i+1], "false") == 0){
1849  tune = false;
1850  }else{
1851  fprintf(stderr, "ERROR: invalid tuning type\n");
1852  exit(1);
1853  }
1854 
1855  i++;
1856  ret = 0;
1857  goto out;
1858  }
1859 
1860  if( strcmp(argv[i], "--multishift") == 0){
1861  if (i+1 >= argc){
1862  usage(argv);
1863  }
1864 
1865  if (strcmp(argv[i+1], "true") == 0){
1866  multishift = true;
1867  }else if (strcmp(argv[i+1], "false") == 0){
1868  multishift = false;
1869  }else{
1870  fprintf(stderr, "ERROR: invalid multishift boolean\n");
1871  exit(1);
1872  }
1873 
1874  i++;
1875  ret = 0;
1876  goto out;
1877  }
1878 
1879  if( strcmp(argv[i], "--xgridsize") == 0){
1880  if (i+1 >= argc){
1881  usage(argv);
1882  }
1883  int xsize = atoi(argv[i+1]);
1884  if (xsize <= 0 ){
1885  errorQuda("ERROR: invalid X grid size");
1886  }
1887  gridsize_from_cmdline[0] = xsize;
1888  i++;
1889  ret = 0;
1890  goto out;
1891  }
1892 
1893  if( strcmp(argv[i], "--ygridsize") == 0){
1894  if (i+1 >= argc){
1895  usage(argv);
1896  }
1897  int ysize = atoi(argv[i+1]);
1898  if (ysize <= 0 ){
1899  errorQuda("ERROR: invalid Y grid size");
1900  }
1901  gridsize_from_cmdline[1] = ysize;
1902  i++;
1903  ret = 0;
1904  goto out;
1905  }
1906 
1907  if( strcmp(argv[i], "--zgridsize") == 0){
1908  if (i+1 >= argc){
1909  usage(argv);
1910  }
1911  int zsize = atoi(argv[i+1]);
1912  if (zsize <= 0 ){
1913  errorQuda("ERROR: invalid Z grid size");
1914  }
1915  gridsize_from_cmdline[2] = zsize;
1916  i++;
1917  ret = 0;
1918  goto out;
1919  }
1920 
1921  if( strcmp(argv[i], "--tgridsize") == 0){
1922  if (i+1 >= argc){
1923  usage(argv);
1924  }
1925  int tsize = atoi(argv[i+1]);
1926  if (tsize <= 0 ){
1927  errorQuda("ERROR: invalid T grid size");
1928  }
1929  gridsize_from_cmdline[3] = tsize;
1930  i++;
1931  ret = 0;
1932  goto out;
1933  }
1934 
1935  if( strcmp(argv[i], "--dslash_type") == 0){
1936  if (i+1 >= argc){
1937  usage(argv);
1938  }
1939  dslash_type = get_dslash_type(argv[i+1]);
1940  i++;
1941  ret = 0;
1942  goto out;
1943  }
1944 
1945  if( strcmp(argv[i], "--flavor") == 0){
1946  if (i+1 >= argc){
1947  usage(argv);
1948  }
1949  twist_flavor = get_flavor_type(argv[i+1]);
1950  i++;
1951  ret = 0;
1952  goto out;
1953  }
1954 
1955  if( strcmp(argv[i], "--inv_type") == 0){
1956  if (i+1 >= argc){
1957  usage(argv);
1958  }
1959  inv_type = get_solver_type(argv[i+1]);
1960  i++;
1961  ret = 0;
1962  goto out;
1963  }
1964 
1965  if( strcmp(argv[i], "--precon_type") == 0){
1966  if (i+1 >= argc){
1967  usage(argv);
1968  }
1969  precon_type = get_solver_type(argv[i+1]);
1970  i++;
1971  ret = 0;
1972  goto out;
1973  }
1974 
1975  if( strcmp(argv[i], "--mass") == 0){
1976  if (i+1 >= argc){
1977  usage(argv);
1978  }
1979  mass= atof(argv[i+1]);
1980  i++;
1981  ret = 0;
1982  goto out;
1983  }
1984 
1985  if( strcmp(argv[i], "--mass-normalization") == 0){
1986  if (i+1 >= argc){
1987  usage(argv);
1988  }
1990  i++;
1991  ret = 0;
1992  goto out;
1993  }
1994 
1995  if( strcmp(argv[i], "--matpc") == 0){
1996  if (i+1 >= argc){
1997  usage(argv);
1998  }
1999  matpc_type = get_matpc_type(argv[i+1]);
2000  i++;
2001  ret = 0;
2002  goto out;
2003  }
2004 
2005  if( strcmp(argv[i], "--load-gauge") == 0){
2006  if (i+1 >= argc){
2007  usage(argv);
2008  }
2009  strcpy(latfile, argv[i+1]);
2010  i++;
2011  ret = 0;
2012  goto out;
2013  }
2014 
2015  if( strcmp(argv[i], "--test") == 0){
2016  if (i+1 >= argc){
2017  usage(argv);
2018  }
2019  test_type = atoi(argv[i+1]);
2020  i++;
2021  ret = 0;
2022  goto out;
2023  }
2024 
2025  if( strcmp(argv[i], "--niter") == 0){
2026  if (i+1 >= argc){
2027  usage(argv);
2028  }
2029  niter= atoi(argv[i+1]);
2030  if (niter < 1 || niter > 1e6){
2031  printf("ERROR: invalid number of iterations (%d)\n", niter);
2032  usage(argv);
2033  }
2034  i++;
2035  ret = 0;
2036  goto out;
2037  }
2038 
2039  if( strcmp(argv[i], "--version") == 0){
2040  printf("This program is linked with QUDA library, version %s,",
2041  get_quda_ver_str());
2042  printf(" %s GPU build\n", msg);
2043  exit(0);
2044  }
2045 
2046  out:
2047  *idx = i;
2048  return ret ;
2049 
2050 }
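
For context, the sketch below shows how a test program might drive this parser from its argument loop. It is illustrative only: the main() shell, the error message, and the placement of the test body are assumptions, while process_command_line_option(), usage(), initComms(), finalizeComms(), and gridsize_from_cmdline are the functions and globals defined in this file.

// Illustrative sketch (not part of test_util.cpp): a minimal driver loop.
// Recognized flags return 0 and advance i past their argument via *idx;
// anything unrecognized falls through to usage().
#include <stdio.h>

extern int process_command_line_option(int argc, char **argv, int *idx);
extern void usage(char **argv);
extern void initComms(int argc, char **argv, const int *commDims);
extern void finalizeComms();
extern int gridsize_from_cmdline[4];

int main(int argc, char **argv)
{
  for (int i = 1; i < argc; i++) {
    if (process_command_line_option(argc, argv, &i) == 0) continue;
    fprintf(stderr, "ERROR: unrecognized option %s\n", argv[i]);
    usage(argv);
  }

  initComms(argc, argv, gridsize_from_cmdline);
  // ... set up and run the actual test here ...
  finalizeComms();
  return 0;
}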
2051 
2052 
2053 static struct timeval startTime;
2054 
2055 void stopwatchStart() {
2056  gettimeofday(&startTime, NULL);
2057 }
2058 
2059 double stopwatchReadSeconds() {
2060  struct timeval endTime;
2061  gettimeofday(&endTime, NULL);
2062 
2063  long ds = endTime.tv_sec - startTime.tv_sec;
2064  long dus = endTime.tv_usec - startTime.tv_usec;
2065  return ds + 0.000001*dus;
2066 }
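
As a usage note, the two functions above share a single static startTime, so they time one interval at a time; nested or overlapping measurements are not supported. A minimal sketch follows (the timeKernel() wrapper and the printf are assumptions, not part of this file):

// Sketch only: time one region of work with the stopwatch above.
#include <stdio.h>

extern void stopwatchStart();
extern double stopwatchReadSeconds();

void timeKernel()
{
  stopwatchStart();                       // records startTime via gettimeofday()
  // ... work to be timed ...
  double secs = stopwatchReadSeconds();   // elapsed wall-clock seconds
  printf("elapsed: %g s\n", secs);
}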
2067 
2068 