QUDA: quda/tests/test_util.cpp Source File

QUDA v0.3.2
A library for QCD on GPUs
00001 #include <complex>
00002 #include <stdlib.h>
00003 #include <stdio.h>
00004 
00005 #include <short.h>
00006 
00007 #include <wilson_dslash_reference.h>
00008 #include <test_util.h>
00009 
00010 #define XUP 0
00011 #define YUP 1
00012 #define ZUP 2
00013 #define TUP 3
00014 
00015 using namespace std;
00016 
00017 template <typename Float>
00018 static void printVector(Float *v) {
00019   printf("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
00020 }
00021 
00022 // X indexes the lattice site
00023 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
00024   if (precision == QUDA_DOUBLE_PRECISION)
00025     for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
00026   else
00027     for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
00028 }
00029 
00030 // X indexes the full lattice
00031 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
00032   if (getOddBit(X) == 0) {
00033     if (precision == QUDA_DOUBLE_PRECISION)
00034       for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
00035     else
00036       for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
00037       
00038   } else {
00039     if (precision == QUDA_DOUBLE_PRECISION)
00040       for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
00041     else
00042       for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
00043   }
00044 }
00045 
00046 // returns 0 or 1 if the full lattice index X is even or odd
00047 int getOddBit(int Y) {
00048   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00049   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00050   int x2 = (Y/Z[0]) % Z[1];
00051   int x1 = Y % Z[0];
00052   return (x4+x3+x2+x1) % 2;
00053 }
00054 
00055 // a+=b
00056 template <typename Float>
00057 inline void complexAddTo(Float *a, Float *b) {
00058   a[0] += b[0];
00059   a[1] += b[1];
00060 }
00061 
00062 // a = b*c
00063 template <typename Float>
00064 inline void complexProduct(Float *a, Float *b, Float *c) {
00065     a[0] = b[0]*c[0] - b[1]*c[1];
00066     a[1] = b[0]*c[1] + b[1]*c[0];
00067 }
00068 
00069 // a = conj(b)*conj(c)
00070 template <typename Float>
00071 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
00072     a[0] = b[0]*c[0] - b[1]*c[1];
00073     a[1] = -b[0]*c[1] - b[1]*c[0];
00074 }
00075 
00076 // a = conj(b)*c
00077 template <typename Float>
00078 inline void complexDotProduct(Float *a, Float *b, Float *c) {
00079     a[0] = b[0]*c[0] + b[1]*c[1];
00080     a[1] = b[0]*c[1] - b[1]*c[0];
00081 }
00082 
00083 // a += b*c
00084 template <typename Float>
00085 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
00086   a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
00087   a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
00088 }
00089 
00090 // a += conj(b)*c)
00091 template <typename Float>
00092 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
00093     a[0] += b[0]*c[0] + b[1]*c[1];
00094     a[1] += b[0]*c[1] - b[1]*c[0];
00095 }
00096 
00097 template <typename Float>
00098 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
00099   a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
00100   a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
00101 }
00102 
00103 template <typename Float>
00104 inline void su3Construct12(Float *mat) {
00105   Float *w = mat+12;
00106   w[0] = 0.0;
00107   w[1] = 0.0;
00108   w[2] = 0.0;
00109   w[3] = 0.0;
00110   w[4] = 0.0;
00111   w[5] = 0.0;
00112 }
00113 
00114 // Stabilized Bunk and Sommer
00115 template <typename Float>
00116 inline void su3Construct8(Float *mat) {
00117   mat[0] = atan2(mat[1], mat[0]);
00118   mat[1] = atan2(mat[13], mat[12]);
00119   for (int i=8; i<18; i++) mat[i] = 0.0;
00120 }
00121 
00122 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
00123   if (reconstruct == QUDA_RECONSTRUCT_12) {
00124     if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
00125     else su3Construct12((float*)mat);
00126   } else {
00127     if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
00128     else su3Construct8((float*)mat);
00129   }
00130 }
00131 
00132 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
00133 // as the cross product of the conjugate vectors: w = u* x v*
00134 // 
00135 // 48 flops
00136 template <typename Float>
00137 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
00138   Float *u = &mat[0*(3*2)];
00139   Float *v = &mat[1*(3*2)];
00140   Float *w = &mat[2*(3*2)];
00141   w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
00142   accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00143   accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00144   accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00145   accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00146   accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00147   accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00148   Float u0 = (dir < 3 ? param->anisotropy :
00149               (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
00150   w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
00151 }
00152 
00153 template <typename Float>
00154 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
00155   // First reconstruct first row
00156   Float row_sum = 0.0;
00157   row_sum += mat[2]*mat[2];
00158   row_sum += mat[3]*mat[3];
00159   row_sum += mat[4]*mat[4];
00160   row_sum += mat[5]*mat[5];
00161   Float u0 = (dir < 3 ? param->anisotropy :
00162               (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
00163   Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
00164 
00165   mat[14] = mat[0];
00166   mat[15] = mat[1];
00167 
00168   mat[0] = U00_mag * cos(mat[14]);
00169   mat[1] = U00_mag * sin(mat[14]);
00170 
00171   Float column_sum = 0.0;
00172   for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
00173   for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
00174   Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
00175 
00176   mat[12] = U20_mag * cos(mat[15]);
00177   mat[13] = U20_mag * sin(mat[15]);
00178 
00179   // First column now restored
00180 
00181   // finally reconstruct last elements from SU(2) rotation
00182   Float r_inv2 = 1.0/(u0*row_sum);
00183 
00184   // U11
00185   Float A[2];
00186   complexDotProduct(A, mat+0, mat+6);
00187   complexConjugateProduct(mat+8, mat+12, mat+4);
00188   accumulateComplexProduct(mat+8, A, mat+2, u0);
00189   mat[8] *= -r_inv2;
00190   mat[9] *= -r_inv2;
00191 
00192   // U12
00193   complexConjugateProduct(mat+10, mat+12, mat+2);
00194   accumulateComplexProduct(mat+10, A, mat+4, -u0);
00195   mat[10] *= r_inv2;
00196   mat[11] *= r_inv2;
00197 
00198   // U21
00199   complexDotProduct(A, mat+0, mat+12);
00200   complexConjugateProduct(mat+14, mat+6, mat+4);
00201   accumulateComplexProduct(mat+14, A, mat+2, -u0);
00202   mat[14] *= r_inv2;
00203   mat[15] *= r_inv2;
00204 
00205   // U12
00206   complexConjugateProduct(mat+16, mat+6, mat+2);
00207   accumulateComplexProduct(mat+16, A, mat+4, u0);
00208   mat[16] *= -r_inv2;
00209   mat[17] *= -r_inv2;
00210 }
00211 
00212 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
00213   if (reconstruct == QUDA_RECONSTRUCT_12) {
00214     if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
00215     else su3Reconstruct12((float*)mat, dir, ga_idx, param);
00216   } else {
00217     if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
00218     else su3Reconstruct8((float*)mat, dir, ga_idx, param);
00219   }
00220 }
00221 
00222 /*
00223 void su3_construct_8_half(float *mat, short *mat_half) {
00224   su3Construct8(mat);
00225 
00226   mat_half[0] = floatToShort(mat[0] / M_PI);
00227   mat_half[1] = floatToShort(mat[1] / M_PI);
00228   for (int i=2; i<18; i++) {
00229     mat_half[i] = floatToShort(mat[i]);
00230   }
00231 }
00232 
00233 void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
00234 
00235   for (int i=0; i<18; i++) {
00236     mat[i] = shortToFloat(mat_half[i]);
00237   }
00238   mat[0] *= M_PI;
00239   mat[1] *= M_PI;
00240 
00241   su3Reconstruct8(mat, dir, ga_idx, param);
00242   }*/
00243 
00244 template <typename Float>
00245 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
00246   for (int i = 0; i < len; i++) {
00247     double diff = fabs(a[i] - b[i]);
00248     if (diff > epsilon) {
00249       printf("error: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
00250       return 0;
00251     }
00252   }
00253   return 1;
00254 }
00255 
00256 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
00257   if  (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
00258   else return compareFloats((float*)a, (float*)b, len, epsilon);
00259 }
00260 
00261 
00262 
00263 // given a "half index" i into either an even or odd half lattice (corresponding
00264 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00265 int fullLatticeIndex(int i, int oddBit) {
00266   int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
00267   return 2*i + (boundaryCrossings + oddBit) % 2;
00268 }
00269 
00270 
00271 // i represents a "half index" into an even or odd "half lattice".
00272 // when oddBit={0,1} the half lattice is {even,odd}.
00273 // 
00274 // the displacements, such as dx, refer to the full lattice coordinates. 
00275 //
00276 // neighborIndex() takes a "half index", displaces it, and returns the
00277 // new "half index", which can be an index into either the even or odd lattices.
00278 // displacements of magnitude one always interchange odd and even lattices.
00279 //
00280 
00281 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
00282   int Y = fullLatticeIndex(i, oddBit);
00283   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00284   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00285   int x2 = (Y/Z[0]) % Z[1];
00286   int x1 = Y % Z[0];
00287   
00288   // assert (oddBit == (x+y+z+t)%2);
00289   
00290   x4 = (x4+dx4+Z[3]) % Z[3];
00291   x3 = (x3+dx3+Z[2]) % Z[2];
00292   x2 = (x2+dx2+Z[1]) % Z[1];
00293   x1 = (x1+dx1+Z[0]) % Z[0];
00294   
00295   return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
00296 }
00297 
00298 /*  
00299  * This is a computation of neighbor using the full index and the displacement in each direction
00300  *
00301  */
00302 
00303 int
00304 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1) 
00305 {
00306     int oddBit = 0;
00307     int half_idx = i;
00308     if (i >= Vh){
00309         oddBit =1;
00310         half_idx = i - Vh;
00311     }
00312     
00313     int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
00314     int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
00315     if (oddBitChanged){
00316         oddBit = 1 - oddBit;
00317     }
00318     int ret = nbr_half_idx;
00319     if (oddBit){
00320         ret = Vh + nbr_half_idx;
00321     }
00322     
00323     return ret;
00324 }
00325 
00326 // 4d checkerboard.
00327 // given a "half index" i into either an even or odd half lattice (corresponding
00328 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00329 // Cf. GPGPU code in dslash_core_ante.h.
00330 // There, i is the thread index.
00331 int fullLatticeIndex_4d(int i, int oddBit) {
00332   if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);}
00333   int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
00334   return 2*i + (boundaryCrossings + oddBit) % 2;
00335 }
00336 
00337 // 5d checkerboard.
00338 // given a "half index" i into either an even or odd half lattice (corresponding
00339 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00340 // Cf. GPGPU code in dslash_core_ante.h.
00341 // There, i is the thread index sid.
00342 // This function is used by neighborIndex_5d in dslash_reference.cpp.
00343 //ok
00344 int fullLatticeIndex_5d(int i, int oddBit) {
00345   int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
00346   return 2*i + (boundaryCrossings + oddBit) % 2;
00347 }
00348 
00349 template <typename Float>
00350 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
00351   // Apply spatial scaling factor (u0) to spatial links
00352   for (int d = 0; d < 3; d++) {
00353     for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
00354       gauge[d][i] /= param->anisotropy;
00355     }
00356   }
00357     
00358   // Apply boundary conditions to temporal links
00359   if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
00360     for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
00361       for (int i = 0; i < gaugeSiteSize; i++) {
00362         gauge[3][j*gaugeSiteSize+i] *= -1.0;
00363         gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
00364       }
00365     }
00366   }
00367     
00368   if (param->gauge_fix) {
00369     // set all gauge links (except for the first Z[0]*Z[1]*Z[2]/2) to the identity,
00370     // to simulate fixing to the temporal gauge.
00371     int dir = 3; // time direction only
00372     Float *even = gauge[dir];
00373     Float *odd  = gauge[dir]+Vh*gaugeSiteSize;
00374     for (int i = Z[0]*Z[1]*Z[2]/2; i < Vh; i++) {
00375       for (int m = 0; m < 3; m++) {
00376         for (int n = 0; n < 3; n++) {
00377           even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00378           even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00379           odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00380           odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00381         }
00382       }
00383     }
00384   }
00385 }
00386 
00387 template <typename Float>
00388 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
00389 {
00390 
00391     int X1h=param->X[0]/2;
00392     int X1 =param->X[0];
00393     int X2 =param->X[1];
00394     int X3 =param->X[2];
00395     int X4 =param->X[3];
00396 
00397     // rescale long links by the appropriate coefficient
00398     for(int d=0; d<4; d++){
00399         for(int i=0; i < V*gaugeSiteSize; i++){
00400             gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
00401         }
00402     }
00403 
00404     // apply the staggered phases
00405     for (int d = 0; d < 3; d++) {
00406 
00407         //even
00408         for (int i = 0; i < Vh; i++) {
00409 
00410             int index = fullLatticeIndex(i, 0);
00411             int i4 = index /(X3*X2*X1);
00412             int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
00413             int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00414             int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
00415             int sign=1;
00416 
00417             if (d == 0) {
00418                 if (i4 % 2 == 1){
00419                     sign= -1;
00420                 }
00421             }
00422 
00423             if (d == 1){
00424                 if ((i4+i1) % 2 == 1){
00425                     sign= -1;
00426                 }
00427             }
00428             if (d == 2){
00429                 if ( (i4+i1+i2) % 2 == 1){
00430                     sign= -1;
00431                 }
00432             }
00433 
00434             for (int j=0;j < 6; j++){
00435                 gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
00436             }
00437         }
00438         //odd
00439         for (int i = 0; i < Vh; i++) {
00440             int index = fullLatticeIndex(i, 1);
00441             int i4 = index /(X3*X2*X1);
00442             int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
00443             int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00444             int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
00445             int sign=1;
00446 
00447             if (d == 0) {
00448                 if (i4 % 2 == 1){
00449                     sign= -1;
00450                 }
00451             }
00452 
00453             if (d == 1){
00454                 if ((i4+i1) % 2 == 1){
00455                     sign= -1;
00456                 }
00457             }
00458             if (d == 2){
00459                 if ( (i4+i1+i2) % 2 == 1){
00460                     sign = -1;
00461                 }
00462             }
00463 
00464             for (int j=0;j < 6; j++){
00465                 gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
00466             }
00467         }
00468 
00469     }
00470 
00471     // Apply boundary conditions to temporal links
00472     if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
00473         for (int j = 0; j < Vh; j++) {
00474             int sign =1;
00475             if (j >= (X4-3)*X1h*X2*X3 ){
00476                 sign= -1;
00477             }
00478 
00479             for (int i = 0; i < 6; i++) {
00480                 gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
00481                 gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
00482             }
00483         }
00484     }
00485 }
00486 
00487 
00488 
00489 template <typename Float>
00490 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
00491   Float *resOdd[4], *resEven[4];
00492   for (int dir = 0; dir < 4; dir++) {  
00493     resEven[dir] = res[dir];
00494     resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00495   }
00496     
00497   for (int dir = 0; dir < 4; dir++) {
00498     for (int i = 0; i < Vh; i++) {
00499       for (int m = 0; m < 3; m++) {
00500         for (int n = 0; n < 3; n++) {
00501           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00502           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00503           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00504           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00505         }
00506       }
00507     }
00508   }
00509     
00510   applyGaugeFieldScaling(res, Vh, param);
00511 }
00512 
00513 // normalize the vector a
00514 template <typename Float>
00515 static void normalize(complex<Float> *a, int len) {
00516   double sum = 0.0;
00517   for (int i=0; i<len; i++) sum += norm(a[i]);
00518   for (int i=0; i<len; i++) a[i] /= sqrt(sum);
00519 }
00520 
00521 // orthogonalize vector b to vector a
00522 template <typename Float>
00523 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
00524   complex<double> dot = 0.0;
00525   for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
00526   for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
00527 }
00528 
00529 template <typename Float> 
00530 static void constructGaugeField(Float **res, QudaGaugeParam *param) {
00531   Float *resOdd[4], *resEven[4];
00532   for (int dir = 0; dir < 4; dir++) {  
00533     resEven[dir] = res[dir];
00534     resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00535   }
00536     
00537   for (int dir = 0; dir < 4; dir++) {
00538     for (int i = 0; i < Vh; i++) {
00539       for (int m = 1; m < 3; m++) { // last 2 rows
00540         for (int n = 0; n < 3; n++) { // 3 columns
00541           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00542           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
00543           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00544           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;                    
00545         }
00546       }
00547       normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
00548       orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
00549       normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
00550       
00551       normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
00552       orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
00553       normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
00554 
00555       {
00556         Float *w = resEven[dir]+(i*3+0)*3*2;
00557         Float *u = resEven[dir]+(i*3+1)*3*2;
00558         Float *v = resEven[dir]+(i*3+2)*3*2;
00559         
00560         for (int n = 0; n < 6; n++) w[n] = 0.0;
00561         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00562         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00563         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00564         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00565         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00566         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00567       }
00568 
00569       {
00570         Float *w = resOdd[dir]+(i*3+0)*3*2;
00571         Float *u = resOdd[dir]+(i*3+1)*3*2;
00572         Float *v = resOdd[dir]+(i*3+2)*3*2;
00573         
00574         for (int n = 0; n < 6; n++) w[n] = 0.0;
00575         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00576         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00577         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00578         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00579         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00580         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00581       }
00582 
00583     }
00584   }
00585   if (param->type == QUDA_WILSON_LINKS){  
00586       applyGaugeFieldScaling(res, Vh, param);
00587   }else if (param->type == QUDA_ASQTAD_LONG_LINKS){
00588       applyGaugeFieldScaling_long(res, Vh, param);      
00589   }
00590   
00591   
00592 }
00593 
00594 template <typename Float> 
00595 void constructUnitaryGaugeField(Float **res) 
00596 {
00597     Float *resOdd[4], *resEven[4];
00598     for (int dir = 0; dir < 4; dir++) {  
00599         resEven[dir] = res[dir];
00600         resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00601     }
00602     
00603     for (int dir = 0; dir < 4; dir++) {
00604         for (int i = 0; i < Vh; i++) {
00605             for (int m = 1; m < 3; m++) { // last 2 rows
00606                 for (int n = 0; n < 3; n++) { // 3 columns
00607                     resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00608                     resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
00609                     resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00610                     resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;                    
00611                 }
00612             }
00613             normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
00614             orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
00615             normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
00616       
00617             normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
00618             orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
00619             normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
00620 
00621             {
00622                 Float *w = resEven[dir]+(i*3+0)*3*2;
00623                 Float *u = resEven[dir]+(i*3+1)*3*2;
00624                 Float *v = resEven[dir]+(i*3+2)*3*2;
00625         
00626                 for (int n = 0; n < 6; n++) w[n] = 0.0;
00627                 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00628                 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00629                 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00630                 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00631                 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00632                 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00633             }
00634 
00635             {
00636                 Float *w = resOdd[dir]+(i*3+0)*3*2;
00637                 Float *u = resOdd[dir]+(i*3+1)*3*2;
00638                 Float *v = resOdd[dir]+(i*3+2)*3*2;
00639         
00640                 for (int n = 0; n < 6; n++) w[n] = 0.0;
00641                 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00642                 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00643                 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00644                 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00645                 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00646                 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00647             }
00648 
00649         }
00650     }
00651 }
00652 
00653 
00654 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
00655   if (type == 0) {
00656     if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
00657     else constructUnitGaugeField((float**)gauge, param);
00658    } else {
00659     if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
00660     else constructGaugeField((float**)gauge, param);
00661   }
00662 }
00663 
00664 void
00665 construct_fat_long_gauge_field(void **fatlink, void** longlink,  
00666                                int type, QudaPrecision precision, QudaGaugeParam* param)
00667 {
00668     if (type == 0) {
00669         if (precision == QUDA_DOUBLE_PRECISION) {
00670             constructUnitGaugeField((double**)fatlink, param);
00671             constructUnitGaugeField((double**)longlink, param);
00672         }else {
00673             constructUnitGaugeField((float**)fatlink, param);
00674             constructUnitGaugeField((float**)longlink, param);
00675         }
00676     } else {
00677         if (precision == QUDA_DOUBLE_PRECISION) {
00678             param->type = QUDA_ASQTAD_FAT_LINKS;
00679             constructGaugeField((double**)fatlink, param);
00680             param->type = QUDA_ASQTAD_LONG_LINKS;
00681             constructGaugeField((double**)longlink, param);
00682         }else {
00683             param->type = QUDA_ASQTAD_FAT_LINKS;
00684             constructGaugeField((float**)fatlink, param);
00685             param->type = QUDA_ASQTAD_LONG_LINKS;
00686             constructGaugeField((float**)longlink, param);
00687         }
00688     }
00689 }
00690 
00691 
00692 template <typename Float>
00693 static void constructCloverField(Float *res, double norm, double diag) {
00694 
00695   Float c = 2.0 * norm / RAND_MAX;
00696 
00697   for(int i = 0; i < V; i++) {
00698     for (int j = 0; j < 72; j++) {
00699       res[i*72 + j] = c*rand() - norm;
00700     }
00701     for (int j = 0; j< 6; j++) {
00702       res[i*72 + j] += diag;
00703       res[i*72 + j+36] += diag;
00704     }
00705   }
00706 }
00707 
00708 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
00709 
00710   if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
00711   else constructCloverField((float *)clover, norm, diag);
00712 }
00713 
00714 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
00715   printf("Reference:\n");
00716   printSpinorElement(spinorRef, 0, prec); printf("...\n");
00717   printSpinorElement(spinorRef, len-1, prec); printf("\n");    
00718     
00719   printf("\nCUDA:\n");
00720   printSpinorElement(spinorGPU, 0, prec); printf("...\n");
00721   printSpinorElement(spinorGPU, len-1, prec); printf("\n");
00722 
00723   compare_spinor(spinorRef, spinorGPU, len, prec);
00724   }*/
00725 
00726 template <typename Float>
00727 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
00728 
00729   int fail_check = 17;
00730   int *fail = new int[fail_check];
00731   int iter[18];
00732   for (int i=0; i<fail_check; i++) fail[i] = 0;
00733   for (int i=0; i<18; i++) iter[i] = 0;
00734 
00735   for (int d=0; d<4; d++) {
00736     for (int eo=0; eo<2; eo++) {
00737       for (int i=0; i<Vh; i++) {
00738         int ga_idx = (eo*Vh+i);
00739         for (int j=0; j<18; j++) {
00740           double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
00741 
00742           for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[f]++;
00743           if (diff > epsilon) iter[j]++;
00744         }
00745       }
00746     }
00747   }
00748 
00749   for (int i=0; i<18; i++) printf("%d fails = %d\n", i, iter[i]);
00750 
00751   for (int f=0; f<fail_check; f++) {
00752     printf("%e Failures = %d / %d  = %e\n", pow(10.0,-(f+1)), fail[f], V*4*18, fail[f] / (double)(4*V*18));
00753   }
00754 
00755   delete []fail;
00756 
00757 }
00758 
00759 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
00760   if (precision == QUDA_DOUBLE_PRECISION) 
00761     checkGauge((double**)oldG, (double**)newG, epsilon);
00762   else 
00763     checkGauge((float**)oldG, (float**)newG, epsilon);
00764 }
00765 
00766 
00767 void 
00768 createSiteLinkCPU(void* link,  QudaPrecision precision, int phase) 
00769 {
00770     void* temp[4];
00771     
00772     size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00773     for(int i=0;i < 4;i++){
00774         temp[i] = malloc(V*gaugeSiteSize*gSize);
00775         if (temp[i] == NULL){
00776             fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
00777             exit(1);
00778         }
00779     }
00780     
00781     if (precision == QUDA_DOUBLE_PRECISION) {
00782         constructUnitaryGaugeField((double**)temp);
00783     }else {
00784         constructUnitaryGaugeField((float**)temp);
00785     }
00786         
00787     for(int i=0;i < V;i++){
00788         for(int dir=0;dir < 4;dir++){
00789             if (precision == QUDA_DOUBLE_PRECISION){
00790                 double** src= (double**)temp;
00791                 double* dst = (double*)link;
00792                 for(int k=0; k < gaugeSiteSize; k++){
00793                     dst[ (4*i+dir)*gaugeSiteSize + k ] = src[dir][i*gaugeSiteSize + k];
00794 
00795                 }
00796             }else{
00797                 float** src= (float**)temp;
00798                 float* dst = (float*)link;
00799                 for(int k=0; k < gaugeSiteSize; k++){
00800                     dst[ (4*i+dir)*gaugeSiteSize + k ] = src[dir][i*gaugeSiteSize + k];
00801                     
00802                 }
00803                 
00804             }
00805         }
00806     }
00807 
00808     if(phase){
00809         
00810         for(int i=0;i < V;i++){
00811             for(int dir =XUP; dir <= TUP; dir++){
00812                 int idx = i;
00813                 int oddBit =0;
00814                 if (i >= Vh) {
00815                     idx = i - Vh;
00816                     oddBit = 1;
00817                 }
00818 
00819                 int X1 = Z[0];
00820                 int X2 = Z[1];
00821                 int X3 = Z[2];
00822                 int X4 = Z[3];
00823 
00824                 int full_idx = fullLatticeIndex(idx, oddBit);
00825                 int i4 = full_idx /(X3*X2*X1);
00826                 int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
00827                 int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00828                 int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;     
00829 
00830                 double coeff= 1.0;
00831                 switch(dir){
00832                 case XUP:
00833                     if ( (i4 & 1) == 1){
00834                         coeff *= -1;
00835                     }
00836                     break;
00837 
00838                 case YUP:
00839                     if ( ((i4+i1) & 1) == 1){
00840                         coeff *= -1;
00841                     }
00842                     break;
00843 
00844                 case ZUP:
00845                     if ( ((i4+i1+i2) & 1) == 1){
00846                         coeff *= -1;
00847                     }
00848                     break;
00849                 
00850                 case TUP:
00851                     if (i4 == (X4-1) ){
00852                         coeff *= -1;
00853                     }
00854                     break;
00855 
00856                 default:
00857                     printf("ERROR: wrong dir(%d)\n", dir);
00858                     exit(1);
00859                 }
00860             
00861             
00862                 if (precision == QUDA_DOUBLE_PRECISION){
00863                     double* mylink = (double*)link;
00864                     mylink = mylink + (4*i + dir)*gaugeSiteSize;
00865                 
00866                     mylink[12] *= coeff;
00867                     mylink[13] *= coeff;
00868                     mylink[14] *= coeff;
00869                     mylink[15] *= coeff;
00870                     mylink[16] *= coeff;
00871                     mylink[17] *= coeff;
00872                 
00873                 }else{
00874                     float* mylink = (float*)link;
00875                     mylink = mylink + (4*i + dir)*gaugeSiteSize;
00876                 
00877                     mylink[12] *= coeff;
00878                     mylink[13] *= coeff;
00879                     mylink[14] *= coeff;
00880                     mylink[15] *= coeff;
00881                     mylink[16] *= coeff;
00882                     mylink[17] *= coeff;
00883                 
00884                 }
00885             }
00886         }
00887     }    
00888 
00889     
00890 #if 1
00891     for(int i=0;i< 4*V*gaugeSiteSize;i++){
00892         if (precision ==QUDA_SINGLE_PRECISION){
00893             float* f = (float*)link;
00894             if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
00895                 fprintf(stderr, "ERROR:  %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
00896                 exit(1);
00897             }
00898         }else{
00899             double* f = (double*)link;
00900             if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
00901                 fprintf(stderr, "ERROR:  %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
00902                 exit(1);
00903             }
00904             
00905         }
00906         
00907     }
00908 #endif
00909 
00910     for(int i=0;i < 4;i++){
00911         free(temp[i]);
00912     }
00913     return;
00914 }
00915 
00916 
00917 
00918 
00919 
00920 template <typename Float>
00921 void compareLink(Float *linkA, Float *linkB, int len) {
00922   int fail_check = 16;
00923   int fail[fail_check];
00924   for (int f=0; f<fail_check; f++) fail[f] = 0;
00925 
00926   int iter[18];
00927   for (int i=0; i<18; i++) iter[i] = 0;
00928   
00929   for (int i=0; i<len; i++) {
00930       for (int j=0; j<18; j++) {
00931           int is = i*18+j;
00932           double diff = fabs(linkA[is]-linkB[is]);
00933           for (int f=0; f<fail_check; f++)
00934               if (diff > pow(10.0,-(f+1))) fail[f]++;
00935           //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
00936           if (diff > 1e-3) iter[j]++;
00937       }
00938   }
00939   
00940   for (int i=0; i<18; i++) printf("%d fails = %d\n", i, iter[i]);
00941   
00942   for (int f=0; f<fail_check; f++) {
00943       printf("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)), fail[f], len*gaugeSiteSize, fail[f] / (double)(len*6));
00944   }
00945   
00946 }
00947 
00948 static void 
00949 compare_link(void *linkA, void *linkB, int len, QudaPrecision precision)
00950 {
00951     if (precision == QUDA_DOUBLE_PRECISION) compareLink((double*)linkA, (double*)linkB, len);
00952     else compareLink((float*)linkA, (float*)linkB, len);
00953     
00954     return;
00955 }
00956 
00957 
00958 // X indexes the lattice site
00959 static void 
00960 printLinkElement(void *link, int X, QudaPrecision precision) 
00961 {
00962     if (precision == QUDA_DOUBLE_PRECISION){
00963         for(int i=0; i < 3;i++){
00964             printVector((double*)link+ X*gaugeSiteSize + i*6);
00965         }
00966         
00967     }
00968     else{
00969         for(int i=0;i < 3;i++){
00970             printVector((float*)link+X*gaugeSiteSize + i*6);
00971         }
00972     }
00973 }
00974 
00975 void strong_check_link(void * linkA, void *linkB, int len, QudaPrecision prec) 
00976 {
00977     printf("LinkA:\n");
00978     printLinkElement(linkA, 0, prec); 
00979     printf("\n");
00980     printLinkElement(linkA, 1, prec); 
00981     printf("...\n");
00982     printLinkElement(linkA, len-1, prec); 
00983     printf("\n");    
00984     
00985     printf("\nlinkB:\n");
00986     printLinkElement(linkB, 0, prec); 
00987     printf("\n");
00988     printLinkElement(linkB, 1, prec); 
00989     printf("...\n");
00990     printLinkElement(linkB, len-1, prec); 
00991     printf("\n");
00992     
00993     compare_link(linkA, linkB, len, prec);
00994 }
00995 
00996 
00997 
00998 void 
00999 createMomCPU(void* mom,  QudaPrecision precision) 
01000 {
01001     void* temp;
01002     
01003     size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
01004     temp = malloc(4*V*gaugeSiteSize*gSize);
01005     if (temp == NULL){
01006         fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
01007         exit(1);
01008     }
01009     
01010     
01011     
01012     for(int i=0;i < V;i++){
01013         if (precision == QUDA_DOUBLE_PRECISION){
01014             for(int dir=0;dir < 4;dir++){
01015                 double* thismom = (double*)mom;     
01016                 for(int k=0; k < momSiteSize; k++){
01017                     thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;                                
01018                 }           
01019             }       
01020         }else{
01021             for(int dir=0;dir < 4;dir++){
01022                 float* thismom=(float*)mom;
01023                 for(int k=0; k < momSiteSize; k++){
01024                     thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;                
01025                 }           
01026             }
01027         }
01028     }
01029     
01030     free(temp);
01031     return;
01032 }
01033 
01034 void
01035 createHwCPU(void* hw,  QudaPrecision precision)
01036 {
01037     for(int i=0;i < V;i++){
01038         if (precision == QUDA_DOUBLE_PRECISION){
01039             for(int dir=0;dir < 4;dir++){
01040                 double* thishw = (double*)hw;
01041                 for(int k=0; k < hwSiteSize; k++){
01042                     thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
01043                 }
01044             }
01045         }else{
01046             for(int dir=0;dir < 4;dir++){
01047                 float* thishw=(float*)hw;
01048                 for(int k=0; k < hwSiteSize; k++){
01049                     thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
01050                 }
01051             }
01052         }
01053     }
01054 
01055     return;
01056 }
01057 
01058 
01059 template <typename Float>
01060 void compare_mom(Float *momA, Float *momB, int len) {
01061   int fail_check = 16;
01062   int fail[fail_check];
01063   for (int f=0; f<fail_check; f++) fail[f] = 0;
01064 
01065   int iter[momSiteSize];
01066   for (int i=0; i<momSiteSize; i++) iter[i] = 0;
01067   
01068   for (int i=0; i<len; i++) {
01069       for (int j=0; j<momSiteSize; j++) {
01070           int is = i*momSiteSize+j;
01071           double diff = fabs(momA[is]-momB[is]);
01072           for (int f=0; f<fail_check; f++)
01073               if (diff > pow(10.0,-(f+1))) fail[f]++;
01074           //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
01075           if (diff > 1e-3) iter[j]++;
01076       }
01077   }
01078   
01079   for (int i=0; i<momSiteSize; i++) printf("%d fails = %d\n", i, iter[i]);
01080   
01081   for (int f=0; f<fail_check; f++) {
01082       printf("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)), fail[f], len*momSiteSize, fail[f] / (double)(len*6));
01083   }
01084   
01085 }
01086 
01087 static void 
01088 printMomElement(void *mom, int X, QudaPrecision precision) 
01089 {
01090     if (precision == QUDA_DOUBLE_PRECISION){
01091         double* thismom = ((double*)mom)+ X*momSiteSize;
01092         printVector(thismom);
01093         printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
01094     }else{
01095         float* thismom = ((float*)mom)+ X*momSiteSize;
01096         printVector(thismom);
01097         printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);        
01098     }
01099 }
01100 void strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec) 
01101 {    
01102     printf("mom:\n");
01103     printMomElement(momA, 0, prec); 
01104     printf("\n");
01105     printMomElement(momA, 1, prec); 
01106     printf("\n");
01107     printMomElement(momA, 2, prec); 
01108     printf("\n");
01109     printMomElement(momA, 3, prec); 
01110     printf("...\n");
01111 
01112     printf("\nreference mom:\n");
01113     printMomElement(momB, 0, prec); 
01114     printf("\n");
01115     printMomElement(momB, 1, prec); 
01116     printf("\n");
01117     printMomElement(momB, 2, prec); 
01118     printf("\n");
01119     printMomElement(momB, 3, prec); 
01120     printf("\n");
01121 
01122     
01123     if (prec == QUDA_DOUBLE_PRECISION){
01124         compare_mom((double*)momA, (double*)momB, len);
01125     }else{
01126         compare_mom((float*)momA, (float*)momB, len);
01127     }
01128 }