QUDA: quda/tests/test_util.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <complex>
00002 #include <stdlib.h>
00003 #include <stdio.h>
00004 #include <string.h>
00005 
00006 #include <short.h>
00007 
00008 #include <wilson_dslash_reference.h>
00009 #include <test_util.h>
00010 
00011 #include <face_quda.h>
00012 #include "misc.h"
00013 
00014 #define XUP 0
00015 #define YUP 1
00016 #define ZUP 2
00017 #define TUP 3
00018 
00019 extern float fat_link_max;
00020 using namespace std;
00021 
00022 template <typename Float>
00023 static void printVector(Float *v) {
00024   printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
00025 }
00026 
00027 // X indexes the lattice site
00028 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
00029   if (precision == QUDA_DOUBLE_PRECISION)
00030     for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
00031   else
00032     for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
00033 }
00034 
00035 // X indexes the full lattice
00036 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
00037   if (getOddBit(X) == 0) {
00038     if (precision == QUDA_DOUBLE_PRECISION)
00039       for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
00040     else
00041       for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
00042       
00043   } else {
00044     if (precision == QUDA_DOUBLE_PRECISION)
00045       for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
00046     else
00047       for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
00048   }
00049 }
00050 
00051 // returns 0 or 1 if the full lattice index X is even or odd
00052 int getOddBit(int Y) {
00053   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00054   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00055   int x2 = (Y/Z[0]) % Z[1];
00056   int x1 = Y % Z[0];
00057   return (x4+x3+x2+x1) % 2;
00058 }
00059 
00060 // a+=b
00061 template <typename Float>
00062 inline void complexAddTo(Float *a, Float *b) {
00063   a[0] += b[0];
00064   a[1] += b[1];
00065 }
00066 
00067 // a = b*c
00068 template <typename Float>
00069 inline void complexProduct(Float *a, Float *b, Float *c) {
00070   a[0] = b[0]*c[0] - b[1]*c[1];
00071   a[1] = b[0]*c[1] + b[1]*c[0];
00072 }
00073 
00074 // a = conj(b)*conj(c)
00075 template <typename Float>
00076 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
00077   a[0] = b[0]*c[0] - b[1]*c[1];
00078   a[1] = -b[0]*c[1] - b[1]*c[0];
00079 }
00080 
00081 // a = conj(b)*c
00082 template <typename Float>
00083 inline void complexDotProduct(Float *a, Float *b, Float *c) {
00084   a[0] = b[0]*c[0] + b[1]*c[1];
00085   a[1] = b[0]*c[1] - b[1]*c[0];
00086 }
00087 
00088 // a += b*c
00089 template <typename Float>
00090 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
00091   a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
00092   a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
00093 }
00094 
00095 // a += conj(b)*c)
00096 template <typename Float>
00097 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
00098   a[0] += b[0]*c[0] + b[1]*c[1];
00099   a[1] += b[0]*c[1] - b[1]*c[0];
00100 }
00101 
00102 template <typename Float>
00103 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
00104   a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
00105   a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
00106 }
00107 
00108 template <typename Float>
00109 inline void su3Construct12(Float *mat) {
00110   Float *w = mat+12;
00111   w[0] = 0.0;
00112   w[1] = 0.0;
00113   w[2] = 0.0;
00114   w[3] = 0.0;
00115   w[4] = 0.0;
00116   w[5] = 0.0;
00117 }
00118 
00119 // Stabilized Bunk and Sommer
00120 template <typename Float>
00121 inline void su3Construct8(Float *mat) {
00122   mat[0] = atan2(mat[1], mat[0]);
00123   mat[1] = atan2(mat[13], mat[12]);
00124   for (int i=8; i<18; i++) mat[i] = 0.0;
00125 }
00126 
00127 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
00128   if (reconstruct == QUDA_RECONSTRUCT_12) {
00129     if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
00130     else su3Construct12((float*)mat);
00131   } else {
00132     if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
00133     else su3Construct8((float*)mat);
00134   }
00135 }
00136 
00137 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
00138 // as the cross product of the conjugate vectors: w = u* x v*
00139 // 
00140 // 48 flops
00141 template <typename Float>
00142 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
00143   Float *u = &mat[0*(3*2)];
00144   Float *v = &mat[1*(3*2)];
00145   Float *w = &mat[2*(3*2)];
00146   w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
00147   accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00148   accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00149   accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00150   accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00151   accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00152   accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00153   Float u0 = (dir < 3 ? param->anisotropy :
00154               (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
00155   w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
00156 }
00157 
00158 template <typename Float>
00159 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
00160   // First reconstruct first row
00161   Float row_sum = 0.0;
00162   row_sum += mat[2]*mat[2];
00163   row_sum += mat[3]*mat[3];
00164   row_sum += mat[4]*mat[4];
00165   row_sum += mat[5]*mat[5];
00166   Float u0 = (dir < 3 ? param->anisotropy :
00167               (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
00168   Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
00169 
00170   mat[14] = mat[0];
00171   mat[15] = mat[1];
00172 
00173   mat[0] = U00_mag * cos(mat[14]);
00174   mat[1] = U00_mag * sin(mat[14]);
00175 
00176   Float column_sum = 0.0;
00177   for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
00178   for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
00179   Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
00180 
00181   mat[12] = U20_mag * cos(mat[15]);
00182   mat[13] = U20_mag * sin(mat[15]);
00183 
00184   // First column now restored
00185 
00186   // finally reconstruct last elements from SU(2) rotation
00187   Float r_inv2 = 1.0/(u0*row_sum);
00188 
00189   // U11
00190   Float A[2];
00191   complexDotProduct(A, mat+0, mat+6);
00192   complexConjugateProduct(mat+8, mat+12, mat+4);
00193   accumulateComplexProduct(mat+8, A, mat+2, u0);
00194   mat[8] *= -r_inv2;
00195   mat[9] *= -r_inv2;
00196 
00197   // U12
00198   complexConjugateProduct(mat+10, mat+12, mat+2);
00199   accumulateComplexProduct(mat+10, A, mat+4, -u0);
00200   mat[10] *= r_inv2;
00201   mat[11] *= r_inv2;
00202 
00203   // U21
00204   complexDotProduct(A, mat+0, mat+12);
00205   complexConjugateProduct(mat+14, mat+6, mat+4);
00206   accumulateComplexProduct(mat+14, A, mat+2, -u0);
00207   mat[14] *= r_inv2;
00208   mat[15] *= r_inv2;
00209 
00210   // U12
00211   complexConjugateProduct(mat+16, mat+6, mat+2);
00212   accumulateComplexProduct(mat+16, A, mat+4, u0);
00213   mat[16] *= -r_inv2;
00214   mat[17] *= -r_inv2;
00215 }
00216 
00217 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
00218   if (reconstruct == QUDA_RECONSTRUCT_12) {
00219     if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
00220     else su3Reconstruct12((float*)mat, dir, ga_idx, param);
00221   } else {
00222     if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
00223     else su3Reconstruct8((float*)mat, dir, ga_idx, param);
00224   }
00225 }
00226 
00227 /*
00228   void su3_construct_8_half(float *mat, short *mat_half) {
00229   su3Construct8(mat);
00230 
00231   mat_half[0] = floatToShort(mat[0] / M_PI);
00232   mat_half[1] = floatToShort(mat[1] / M_PI);
00233   for (int i=2; i<18; i++) {
00234   mat_half[i] = floatToShort(mat[i]);
00235   }
00236   }
00237 
00238   void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
00239 
00240   for (int i=0; i<18; i++) {
00241   mat[i] = shortToFloat(mat_half[i]);
00242   }
00243   mat[0] *= M_PI;
00244   mat[1] *= M_PI;
00245 
00246   su3Reconstruct8(mat, dir, ga_idx, param);
00247   }*/
00248 
00249 template <typename Float>
00250 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
00251   for (int i = 0; i < len; i++) {
00252     double diff = fabs(a[i] - b[i]);
00253     if (diff > epsilon) {
00254       printfQuda("error: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
00255       return 0;
00256     }
00257   }
00258   return 1;
00259 }
00260 
00261 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
00262   if  (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
00263   else return compareFloats((float*)a, (float*)b, len, epsilon);
00264 }
00265 
00266 
00267 
00268 // given a "half index" i into either an even or odd half lattice (corresponding
00269 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00270 int fullLatticeIndex(int i, int oddBit) {
00271   /*
00272     int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
00273     return 2*i + (boundaryCrossings + oddBit) % 2;
00274   */
00275 
00276   int X1 = Z[0];  
00277   int X2 = Z[1];
00278   int X3 = Z[2];
00279   //int X4 = Z[3];
00280   int X1h =X1/2;
00281 
00282   int sid =i;
00283   int za = sid/X1h;
00284   //int x1h = sid - za*X1h;
00285   int zb = za/X2;
00286   int x2 = za - zb*X2;
00287   int x4 = zb/X3;
00288   int x3 = zb - x4*X3;
00289   int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00290   //int x1 = 2*x1h + x1odd;
00291   int X = 2*sid + x1odd; 
00292 
00293   return X;
00294 }
00295 
00296 
00297 // i represents a "half index" into an even or odd "half lattice".
00298 // when oddBit={0,1} the half lattice is {even,odd}.
00299 // 
00300 // the displacements, such as dx, refer to the full lattice coordinates. 
00301 //
00302 // neighborIndex() takes a "half index", displaces it, and returns the
00303 // new "half index", which can be an index into either the even or odd lattices.
00304 // displacements of magnitude one always interchange odd and even lattices.
00305 //
00306 
00307 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
00308   int Y = fullLatticeIndex(i, oddBit);
00309   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00310   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00311   int x2 = (Y/Z[0]) % Z[1];
00312   int x1 = Y % Z[0];
00313   
00314   // assert (oddBit == (x+y+z+t)%2);
00315   
00316   x4 = (x4+dx4+Z[3]) % Z[3];
00317   x3 = (x3+dx3+Z[2]) % Z[2];
00318   x2 = (x2+dx2+Z[1]) % Z[1];
00319   x1 = (x1+dx1+Z[0]) % Z[0];
00320   
00321   return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
00322 }
00323 
00324 int
00325 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
00326 {
00327   int ret;
00328   
00329   int Y = fullLatticeIndex(i, oddBit);
00330   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00331   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00332   int x2 = (Y/Z[0]) % Z[1];
00333   int x1 = Y % Z[0];
00334   
00335   int ghost_x4 = x4+ dx4;
00336   
00337   // assert (oddBit == (x+y+z+t)%2);
00338   
00339   x4 = (x4+dx4+Z[3]) % Z[3];
00340   x3 = (x3+dx3+Z[2]) % Z[2];
00341   x2 = (x2+dx2+Z[1]) % Z[1];
00342   x1 = (x1+dx1+Z[0]) % Z[0];
00343   
00344   if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
00345     ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
00346   }else{
00347     ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;    
00348   }
00349 
00350   
00351   return ret;
00352 }
00353 
00354 
00355 /*  
00356  * This is a computation of neighbor using the full index and the displacement in each direction
00357  *
00358  */
00359 
00360 int
00361 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1) 
00362 {
00363   int oddBit = 0;
00364   int half_idx = i;
00365   if (i >= Vh){
00366     oddBit =1;
00367     half_idx = i - Vh;
00368   }
00369     
00370   int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
00371   int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
00372   if (oddBitChanged){
00373     oddBit = 1 - oddBit;
00374   }
00375   int ret = nbr_half_idx;
00376   if (oddBit){
00377     ret = Vh + nbr_half_idx;
00378   }
00379     
00380   return ret;
00381 }
00382 
00383 
00384 int
00385 neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1) 
00386 {
00387   int ret;
00388   int oddBit = 0;
00389   int half_idx = i;
00390   if (i >= Vh){
00391     oddBit =1;
00392     half_idx = i - Vh;
00393   }
00394     
00395   int Y = fullLatticeIndex(half_idx, oddBit);
00396   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00397   int x3 = (Y/(Z[1]*Z[0])) % Z[2];
00398   int x2 = (Y/Z[0]) % Z[1];
00399   int x1 = Y % Z[0];
00400   int ghost_x4 = x4+ dx4;
00401     
00402   x4 = (x4+dx4+Z[3]) % Z[3];
00403   x3 = (x3+dx3+Z[2]) % Z[2];
00404   x2 = (x2+dx2+Z[1]) % Z[1];
00405   x1 = (x1+dx1+Z[0]) % Z[0];
00406 
00407   if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
00408     ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
00409   }else{
00410     ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;    
00411     return ret;
00412   }
00413 
00414   int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
00415   if (oddBitChanged){
00416     oddBit = 1 - oddBit;
00417   }
00418     
00419   if (oddBit){
00420     ret += Vh;
00421   }
00422     
00423   return ret;
00424 }
00425 
00426 
00427 // 4d checkerboard.
00428 // given a "half index" i into either an even or odd half lattice (corresponding
00429 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00430 // Cf. GPGPU code in dslash_core_ante.h.
00431 // There, i is the thread index.
00432 int fullLatticeIndex_4d(int i, int oddBit) {
00433   if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);}
00434   /*
00435     int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
00436     return 2*i + (boundaryCrossings + oddBit) % 2;
00437   */
00438 
00439   int X1 = Z[0];  
00440   int X2 = Z[1];
00441   int X3 = Z[2];
00442   //int X4 = Z[3];
00443   int X1h =X1/2;
00444 
00445   int sid =i;
00446   int za = sid/X1h;
00447   //int x1h = sid - za*X1h;
00448   int zb = za/X2;
00449   int x2 = za - zb*X2;
00450   int x4 = zb/X3;
00451   int x3 = zb - x4*X3;
00452   int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00453   //int x1 = 2*x1h + x1odd;
00454   int X = 2*sid + x1odd; 
00455 
00456   return X;
00457 }
00458 
00459 // 5d checkerboard.
00460 // given a "half index" i into either an even or odd half lattice (corresponding
00461 // to oddBit = {0, 1}), returns the corresponding full lattice index.
00462 // Cf. GPGPU code in dslash_core_ante.h.
00463 // There, i is the thread index sid.
00464 // This function is used by neighborIndex_5d in dslash_reference.cpp.
00465 //ok
00466 int fullLatticeIndex_5d(int i, int oddBit) {
00467   int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
00468   return 2*i + (boundaryCrossings + oddBit) % 2;
00469 }
00470 
00471 int 
00472 x4_from_full_index(int i)
00473 {
00474   int oddBit = 0;
00475   int half_idx = i;
00476   if (i >= Vh){
00477     oddBit =1;
00478     half_idx = i - Vh;
00479   }
00480   
00481   int Y = fullLatticeIndex(half_idx, oddBit);
00482   int x4 = Y/(Z[2]*Z[1]*Z[0]);
00483   
00484   return x4;
00485 }
00486 
00487 template <typename Float>
00488 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
00489   // Apply spatial scaling factor (u0) to spatial links
00490   for (int d = 0; d < 3; d++) {
00491     for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
00492       gauge[d][i] /= param->anisotropy;
00493     }
00494   }
00495     
00496   // only apply T-boundary at edge nodes
00497   bool Ntm1 = (commCoords(3) == commDim(3)-1) ? true : false;
00498 
00499   // Apply boundary conditions to temporal links
00500   if (param->t_boundary == QUDA_ANTI_PERIODIC_T && Ntm1) {
00501     for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
00502       for (int i = 0; i < gaugeSiteSize; i++) {
00503         gauge[3][j*gaugeSiteSize+i] *= -1.0;
00504         gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
00505       }
00506     }
00507   }
00508     
00509   if (param->gauge_fix) {
00510     // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
00511     // to simulate fixing to the temporal gauge.
00512     int iMax = ( Ntm1 ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
00513     int dir = 3; // time direction only
00514     Float *even = gauge[dir];
00515     Float *odd  = gauge[dir]+Vh*gaugeSiteSize;
00516     for (int i = 0; i< iMax; i++) {
00517       for (int m = 0; m < 3; m++) {
00518         for (int n = 0; n < 3; n++) {
00519           even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00520           even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00521           odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00522           odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00523         }
00524       }
00525     }
00526   }
00527 }
00528 
00529 template <typename Float>
00530 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
00531 {
00532 
00533   int X1h=param->X[0]/2;
00534   int X1 =param->X[0];
00535   int X2 =param->X[1];
00536   int X3 =param->X[2];
00537   int X4 =param->X[3];
00538 
00539   // rescale long links by the appropriate coefficient
00540   for(int d=0; d<4; d++){
00541     for(int i=0; i < V*gaugeSiteSize; i++){
00542       gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
00543     }
00544   }
00545 
00546   // apply the staggered phases
00547   for (int d = 0; d < 3; d++) {
00548 
00549     //even
00550     for (int i = 0; i < Vh; i++) {
00551 
00552       int index = fullLatticeIndex(i, 0);
00553       int i4 = index /(X3*X2*X1);
00554       int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
00555       int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00556       int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
00557       int sign=1;
00558 
00559       if (d == 0) {
00560         if (i4 % 2 == 1){
00561           sign= -1;
00562         }
00563       }
00564 
00565       if (d == 1){
00566         if ((i4+i1) % 2 == 1){
00567           sign= -1;
00568         }
00569       }
00570       if (d == 2){
00571         if ( (i4+i1+i2) % 2 == 1){
00572           sign= -1;
00573         }
00574       }
00575 
00576       for (int j=0;j < 6; j++){
00577         gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
00578       }
00579     }
00580     //odd
00581     for (int i = 0; i < Vh; i++) {
00582       int index = fullLatticeIndex(i, 1);
00583       int i4 = index /(X3*X2*X1);
00584       int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
00585       int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00586       int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
00587       int sign=1;
00588 
00589       if (d == 0) {
00590         if (i4 % 2 == 1){
00591           sign= -1;
00592         }
00593       }
00594 
00595       if (d == 1){
00596         if ((i4+i1) % 2 == 1){
00597           sign= -1;
00598         }
00599       }
00600       if (d == 2){
00601         if ( (i4+i1+i2) % 2 == 1){
00602           sign = -1;
00603         }
00604       }
00605 
00606       for (int j=0;j < 6; j++){
00607         gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
00608       }
00609     }
00610 
00611   }
00612 
00613   // Apply boundary conditions to temporal links
00614   if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
00615     for (int j = 0; j < Vh; j++) {
00616       int sign =1;
00617       if (j >= (X4-3)*X1h*X2*X3 ){
00618         sign= -1;
00619       }
00620 
00621       for (int i = 0; i < 6; i++) {
00622         gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
00623         gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
00624       }
00625     }
00626   }
00627 }
00628 
00629 
00630 
00631 template <typename Float>
00632 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
00633   Float *resOdd[4], *resEven[4];
00634   for (int dir = 0; dir < 4; dir++) {  
00635     resEven[dir] = res[dir];
00636     resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00637   }
00638     
00639   for (int dir = 0; dir < 4; dir++) {
00640     for (int i = 0; i < Vh; i++) {
00641       for (int m = 0; m < 3; m++) {
00642         for (int n = 0; n < 3; n++) {
00643           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00644           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00645           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
00646           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
00647         }
00648       }
00649     }
00650   }
00651     
00652   applyGaugeFieldScaling(res, Vh, param);
00653 }
00654 
00655 // normalize the vector a
00656 template <typename Float>
00657 static void normalize(complex<Float> *a, int len) {
00658   double sum = 0.0;
00659   for (int i=0; i<len; i++) sum += norm(a[i]);
00660   for (int i=0; i<len; i++) a[i] /= sqrt(sum);
00661 }
00662 
00663 // orthogonalize vector b to vector a
00664 template <typename Float>
00665 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
00666   complex<double> dot = 0.0;
00667   for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
00668   for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
00669 }
00670 
00671 template <typename Float> 
00672 static void constructGaugeField(Float **res, QudaGaugeParam *param) {
00673   Float *resOdd[4], *resEven[4];
00674   for (int dir = 0; dir < 4; dir++) {  
00675     resEven[dir] = res[dir];
00676     resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00677   }
00678     
00679   for (int dir = 0; dir < 4; dir++) {
00680     for (int i = 0; i < Vh; i++) {
00681       for (int m = 1; m < 3; m++) { // last 2 rows
00682         for (int n = 0; n < 3; n++) { // 3 columns
00683           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00684           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
00685           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00686           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;                    
00687         }
00688       }
00689       normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
00690       orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
00691       normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
00692       
00693       normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
00694       orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
00695       normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
00696 
00697       {
00698         Float *w = resEven[dir]+(i*3+0)*3*2;
00699         Float *u = resEven[dir]+(i*3+1)*3*2;
00700         Float *v = resEven[dir]+(i*3+2)*3*2;
00701         
00702         for (int n = 0; n < 6; n++) w[n] = 0.0;
00703         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00704         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00705         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00706         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00707         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00708         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00709       }
00710 
00711       {
00712         Float *w = resOdd[dir]+(i*3+0)*3*2;
00713         Float *u = resOdd[dir]+(i*3+1)*3*2;
00714         Float *v = resOdd[dir]+(i*3+2)*3*2;
00715         
00716         for (int n = 0; n < 6; n++) w[n] = 0.0;
00717         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00718         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00719         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00720         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00721         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00722         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00723       }
00724 
00725     }
00726   }
00727 
00728   if (param->type == QUDA_WILSON_LINKS){  
00729     applyGaugeFieldScaling(res, Vh, param);
00730   } else if (param->type == QUDA_ASQTAD_LONG_LINKS){
00731     applyGaugeFieldScaling_long(res, Vh, param);      
00732   } else if (param->type == QUDA_ASQTAD_FAT_LINKS){
00733     for (int dir = 0; dir < 4; dir++){ 
00734       for (int i = 0; i < Vh; i++) {
00735         for (int m = 0; m < 3; m++) { // last 2 rows
00736           for (int n = 0; n < 3; n++) { // 3 columns
00737             resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
00738             resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (Float)RAND_MAX;
00739             resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
00740             resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
00741           }
00742         }
00743       }
00744     }
00745     
00746   }
00747 
00748 }
00749 
00750 template <typename Float> 
00751 void constructUnitaryGaugeField(Float **res) 
00752 {
00753   Float *resOdd[4], *resEven[4];
00754   for (int dir = 0; dir < 4; dir++) {  
00755     resEven[dir] = res[dir];
00756     resOdd[dir]  = res[dir]+Vh*gaugeSiteSize;
00757   }
00758   
00759   for (int dir = 0; dir < 4; dir++) {
00760     for (int i = 0; i < Vh; i++) {
00761       for (int m = 1; m < 3; m++) { // last 2 rows
00762         for (int n = 0; n < 3; n++) { // 3 columns
00763           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00764           resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
00765           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
00766           resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;                    
00767         }
00768       }
00769       normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
00770       orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
00771       normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
00772       
00773       normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
00774       orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
00775       normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
00776 
00777       {
00778         Float *w = resEven[dir]+(i*3+0)*3*2;
00779         Float *u = resEven[dir]+(i*3+1)*3*2;
00780         Float *v = resEven[dir]+(i*3+2)*3*2;
00781         
00782         for (int n = 0; n < 6; n++) w[n] = 0.0;
00783         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00784         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00785         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00786         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00787         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00788         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00789       }
00790       
00791       {
00792         Float *w = resOdd[dir]+(i*3+0)*3*2;
00793         Float *u = resOdd[dir]+(i*3+1)*3*2;
00794         Float *v = resOdd[dir]+(i*3+2)*3*2;
00795         
00796         for (int n = 0; n < 6; n++) w[n] = 0.0;
00797         accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
00798         accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
00799         accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
00800         accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
00801         accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
00802         accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
00803       }
00804       
00805     }
00806   }
00807 }
00808 
00809 
00810 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
00811   if (type == 0) {
00812     if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
00813     else constructUnitGaugeField((float**)gauge, param);
00814   } else if (type == 1) {
00815     if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
00816     else constructGaugeField((float**)gauge, param);
00817   } else {
00818     if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
00819     else applyGaugeFieldScaling((float**)gauge, Vh, param);    
00820   }
00821 
00822 }
00823 
00824 void
00825 construct_fat_long_gauge_field(void **fatlink, void** longlink,  
00826                                int type, QudaPrecision precision, QudaGaugeParam* param)
00827 {
00828   if (type == 0) {
00829     if (precision == QUDA_DOUBLE_PRECISION) {
00830       constructUnitGaugeField((double**)fatlink, param);
00831       constructUnitGaugeField((double**)longlink, param);
00832     }else {
00833       constructUnitGaugeField((float**)fatlink, param);
00834       constructUnitGaugeField((float**)longlink, param);
00835     }
00836   } else {
00837     if (precision == QUDA_DOUBLE_PRECISION) {
00838       param->type = QUDA_ASQTAD_FAT_LINKS;
00839       constructGaugeField((double**)fatlink, param);
00840       param->type = QUDA_ASQTAD_LONG_LINKS;
00841       constructGaugeField((double**)longlink, param);
00842     }else {
00843       param->type = QUDA_ASQTAD_FAT_LINKS;
00844       constructGaugeField((float**)fatlink, param);
00845       param->type = QUDA_ASQTAD_LONG_LINKS;
00846       constructGaugeField((float**)longlink, param);
00847     }
00848   }
00849 }
00850 
00851 
00852 template <typename Float>
00853 static void constructCloverField(Float *res, double norm, double diag) {
00854 
00855   Float c = 2.0 * norm / RAND_MAX;
00856 
00857   for(int i = 0; i < V; i++) {
00858     for (int j = 0; j < 72; j++) {
00859       res[i*72 + j] = c*rand() - norm;
00860     }
00861     for (int j = 0; j< 6; j++) {
00862       res[i*72 + j] += diag;
00863       res[i*72 + j+36] += diag;
00864     }
00865   }
00866 }
00867 
00868 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
00869 
00870   if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
00871   else constructCloverField((float *)clover, norm, diag);
00872 }
00873 
00874 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
00875   printf("Reference:\n");
00876   printSpinorElement(spinorRef, 0, prec); printf("...\n");
00877   printSpinorElement(spinorRef, len-1, prec); printf("\n");    
00878     
00879   printf("\nCUDA:\n");
00880   printSpinorElement(spinorGPU, 0, prec); printf("...\n");
00881   printSpinorElement(spinorGPU, len-1, prec); printf("\n");
00882 
00883   compare_spinor(spinorRef, spinorGPU, len, prec);
00884   }*/
00885 
00886 template <typename Float>
00887 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
00888 
00889   const int fail_check = 17;
00890   int fail[4][fail_check];
00891   int iter[4][18];
00892   for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
00893   for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
00894 
00895   for (int d=0; d<4; d++) {
00896     for (int eo=0; eo<2; eo++) {
00897       for (int i=0; i<Vh; i++) {
00898         int ga_idx = (eo*Vh+i);
00899         for (int j=0; j<18; j++) {
00900           double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
00901 
00902           for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
00903           if (diff > epsilon) iter[d][j]++;
00904         }
00905       }
00906     }
00907   }
00908 
00909   printf("Component fails (X, Y, Z, T)\n");
00910   for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
00911 
00912   printf("\nDeviation Failures = (X, Y, Z, T)\n");
00913   for (int f=0; f<fail_check; f++) {
00914     printf("%e Failures = (%9d, %9d, %9d, %9d) = (%e, %e, %e, %e)\n", pow(10.0,-(f+1)), 
00915            fail[0][f], fail[1][f], fail[2][f], fail[3][f],
00916            fail[0][f]/(double)(V*18), fail[1][f]/(double)(V*18), fail[2][f]/(double)(V*18), fail[3][f]/(double)(V*18));
00917   }
00918 
00919 }
00920 
00921 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
00922   if (precision == QUDA_DOUBLE_PRECISION) 
00923     checkGauge((double**)oldG, (double**)newG, epsilon);
00924   else 
00925     checkGauge((float**)oldG, (float**)newG, epsilon);
00926 }
00927 
00928 
00929 
00930 void 
00931 createSiteLinkCPU(void** link,  QudaPrecision precision, int phase) 
00932 {
00933     
00934   if (precision == QUDA_DOUBLE_PRECISION) {
00935     constructUnitaryGaugeField((double**)link);
00936   }else {
00937     constructUnitaryGaugeField((float**)link);
00938   }
00939 
00940   if(phase){
00941         
00942     for(int i=0;i < V;i++){
00943       for(int dir =XUP; dir <= TUP; dir++){
00944         int idx = i;
00945         int oddBit =0;
00946         if (i >= Vh) {
00947           idx = i - Vh;
00948           oddBit = 1;
00949         }
00950 
00951         int X1 = Z[0];
00952         int X2 = Z[1];
00953         int X3 = Z[2];
00954         int X4 = Z[3];
00955 
00956         int full_idx = fullLatticeIndex(idx, oddBit);
00957         int i4 = full_idx /(X3*X2*X1);
00958         int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
00959         int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
00960         int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;     
00961 
00962         double coeff= 1.0;
00963         switch(dir){
00964         case XUP:
00965           if ( (i4 & 1) != 0){
00966             coeff *= -1;
00967           }
00968           break;
00969 
00970         case YUP:
00971           if ( ((i4+i1) & 1) != 0){
00972             coeff *= -1;
00973           }
00974           break;
00975 
00976         case ZUP:
00977           if ( ((i4+i1+i2) & 1) != 0){
00978             coeff *= -1;
00979           }
00980           break;
00981                 
00982         case TUP:
00983           if ((commCoords(3) == commDim(3) -1) && i4 == (X4-1) ){
00984             coeff *= -1;
00985           }
00986           break;
00987 
00988         default:
00989           printf("ERROR: wrong dir(%d)\n", dir);
00990           exit(1);
00991         }
00992             
00993         if (precision == QUDA_DOUBLE_PRECISION){
00994           //double* mylink = (double*)link;
00995           //mylink = mylink + (4*i + dir)*gaugeSiteSize;
00996           double* mylink = (double*)link[dir];
00997           mylink = mylink + i*gaugeSiteSize;
00998 
00999           mylink[12] *= coeff;
01000           mylink[13] *= coeff;
01001           mylink[14] *= coeff;
01002           mylink[15] *= coeff;
01003           mylink[16] *= coeff;
01004           mylink[17] *= coeff;
01005                 
01006         }else{
01007           //float* mylink = (float*)link;
01008           //mylink = mylink + (4*i + dir)*gaugeSiteSize;
01009           float* mylink = (float*)link[dir];
01010           mylink = mylink + i*gaugeSiteSize;
01011                   
01012           mylink[12] *= coeff;
01013           mylink[13] *= coeff;
01014           mylink[14] *= coeff;
01015           mylink[15] *= coeff;
01016           mylink[16] *= coeff;
01017           mylink[17] *= coeff;
01018                 
01019         }
01020       }
01021     }
01022   }    
01023 
01024     
01025 #if 1
01026   for(int dir= 0;dir < 4;dir++){
01027     for(int i=0;i< V*gaugeSiteSize;i++){
01028       if (precision ==QUDA_SINGLE_PRECISION){
01029         float* f = (float*)link[dir];
01030         if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
01031           fprintf(stderr, "ERROR:  %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
01032           exit(1);
01033         }
01034       }else{
01035         double* f = (double*)link[dir];
01036         if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
01037           fprintf(stderr, "ERROR:  %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
01038           exit(1);
01039         }
01040           
01041       }
01042         
01043     }
01044   }
01045 #endif
01046 
01047   return;
01048 }
01049 
01050 
01051 
01052 
01053 template <typename Float>
01054 int compareLink(Float **linkA, Float **linkB, int len) {
01055   const int fail_check = 16;
01056   int fail[fail_check];
01057   for (int f=0; f<fail_check; f++) fail[f] = 0;
01058 
01059   int iter[18];
01060   for (int i=0; i<18; i++) iter[i] = 0;
01061   
01062   for(int dir=0;dir < 4; dir++){
01063     for (int i=0; i<len; i++) {
01064       for (int j=0; j<18; j++) {
01065         int is = i*18+j;
01066         double diff = fabs(linkA[dir][is]-linkB[dir][is]);
01067         for (int f=0; f<fail_check; f++)
01068           if (diff > pow(10.0,-(f+1))) fail[f]++;
01069         //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
01070         if (diff > 1e-3) iter[j]++;
01071       }
01072     }
01073   }
01074   
01075   for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
01076   
01077   int accuracy_level = 0;
01078   for(int f =0; f < fail_check; f++){
01079     if(fail[f] == 0){
01080       accuracy_level =f;
01081     }
01082   }
01083 
01084   for (int f=0; f<fail_check; f++) {
01085     printfQuda("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)), fail[f], len*gaugeSiteSize, fail[f] / (double)(len*6));
01086   }
01087   
01088   return accuracy_level;
01089 }
01090 
01091 static int
01092 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
01093 {
01094   int ret;
01095 
01096   if (precision == QUDA_DOUBLE_PRECISION){    
01097     ret = compareLink((double**)linkA, (double**)linkB, len);
01098   }else {
01099     ret = compareLink((float**)linkA, (float**)linkB, len);
01100   }    
01101 
01102   return ret;
01103 }
01104 
01105 
01106 // X indexes the lattice site
01107 static void 
01108 printLinkElement(void *link, int X, QudaPrecision precision) 
01109 {
01110   if (precision == QUDA_DOUBLE_PRECISION){
01111     for(int i=0; i < 3;i++){
01112       printVector((double*)link+ X*gaugeSiteSize + i*6);
01113     }
01114         
01115   }
01116   else{
01117     for(int i=0;i < 3;i++){
01118       printVector((float*)link+X*gaugeSiteSize + i*6);
01119     }
01120   }
01121 }
01122 
01123 int strong_check_link(void** linkA, const char* msgA, 
01124                       void **linkB, const char* msgB, 
01125                       int len, QudaPrecision prec) 
01126 {
01127   printfQuda("%s\n", msgA);
01128   printLinkElement(linkA[0], 0, prec); 
01129   printfQuda("\n");
01130   printLinkElement(linkA[0], 1, prec); 
01131   printfQuda("...\n");
01132   printLinkElement(linkA[3], len-1, prec); 
01133   printfQuda("\n");    
01134     
01135   printfQuda("\n%s\n", msgB);
01136   printLinkElement(linkB[0], 0, prec); 
01137   printfQuda("\n");
01138   printLinkElement(linkB[0], 1, prec); 
01139   printfQuda("...\n");
01140   printLinkElement(linkB[3], len-1, prec); 
01141   printfQuda("\n");
01142     
01143   int ret = compare_link(linkA, linkB, len, prec);
01144   return ret;
01145 }
01146 
01147 
01148 void 
01149 createMomCPU(void* mom,  QudaPrecision precision) 
01150 {
01151   void* temp;
01152     
01153   size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
01154   temp = malloc(4*V*gaugeSiteSize*gSize);
01155   if (temp == NULL){
01156     fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
01157     exit(1);
01158   }
01159     
01160     
01161     
01162   for(int i=0;i < V;i++){
01163     if (precision == QUDA_DOUBLE_PRECISION){
01164       for(int dir=0;dir < 4;dir++){
01165         double* thismom = (double*)mom;     
01166         for(int k=0; k < momSiteSize; k++){
01167           thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;                          
01168           if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
01169         }           
01170       }     
01171     }else{
01172       for(int dir=0;dir < 4;dir++){
01173         float* thismom=(float*)mom;
01174         for(int k=0; k < momSiteSize; k++){
01175           thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;          
01176           if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
01177         }           
01178       }
01179     }
01180   }
01181     
01182   free(temp);
01183   return;
01184 }
01185 
01186 void
01187 createHwCPU(void* hw,  QudaPrecision precision)
01188 {
01189   for(int i=0;i < V;i++){
01190     if (precision == QUDA_DOUBLE_PRECISION){
01191       for(int dir=0;dir < 4;dir++){
01192         double* thishw = (double*)hw;
01193         for(int k=0; k < hwSiteSize; k++){
01194           thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
01195         }
01196       }
01197     }else{
01198       for(int dir=0;dir < 4;dir++){
01199         float* thishw=(float*)hw;
01200         for(int k=0; k < hwSiteSize; k++){
01201           thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
01202         }
01203       }
01204     }
01205   }
01206 
01207   return;
01208 }
01209 
01210 
01211 template <typename Float>
01212 int compare_mom(Float *momA, Float *momB, int len) {
01213   const int fail_check = 16;
01214   int fail[fail_check];
01215   for (int f=0; f<fail_check; f++) fail[f] = 0;
01216 
01217   int iter[momSiteSize];
01218   for (int i=0; i<momSiteSize; i++) iter[i] = 0;
01219   
01220   for (int i=0; i<len; i++) {
01221     for (int j=0; j<momSiteSize; j++) {
01222       int is = i*momSiteSize+j;
01223       double diff = fabs(momA[is]-momB[is]);
01224       for (int f=0; f<fail_check; f++)
01225         if (diff > pow(10.0,-(f+1))) fail[f]++;
01226       //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
01227       if (diff > 1e-3) iter[j]++;
01228     }
01229   }
01230   
01231   int accuracy_level = 0;
01232   for(int f =0; f < fail_check; f++){
01233     if(fail[f] == 0){
01234       accuracy_level =f+1;
01235     }
01236   }
01237 
01238   for (int i=0; i<momSiteSize; i++) printf("%d fails = %d\n", i, iter[i]);
01239   
01240   for (int f=0; f<fail_check; f++) {
01241     printf("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)), fail[f], len*momSiteSize, fail[f] / (double)(len*6));
01242   }
01243   
01244   return accuracy_level;
01245 }
01246 
01247 static void 
01248 printMomElement(void *mom, int X, QudaPrecision precision) 
01249 {
01250   if (precision == QUDA_DOUBLE_PRECISION){
01251     double* thismom = ((double*)mom)+ X*momSiteSize;
01252     printVector(thismom);
01253     printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
01254   }else{
01255     float* thismom = ((float*)mom)+ X*momSiteSize;
01256     printVector(thismom);
01257     printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);    
01258   }
01259 }
01260 int strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec) 
01261 {    
01262   printf("mom:\n");
01263   printMomElement(momA, 0, prec); 
01264   printf("\n");
01265   printMomElement(momA, 1, prec); 
01266   printf("\n");
01267   printMomElement(momA, 2, prec); 
01268   printf("\n");
01269   printMomElement(momA, 3, prec); 
01270   printf("...\n");
01271   
01272   printf("\nreference mom:\n");
01273   printMomElement(momB, 0, prec); 
01274   printf("\n");
01275   printMomElement(momB, 1, prec); 
01276   printf("\n");
01277   printMomElement(momB, 2, prec); 
01278   printf("\n");
01279   printMomElement(momB, 3, prec); 
01280   printf("\n");
01281   
01282   int ret;
01283   if (prec == QUDA_DOUBLE_PRECISION){
01284     ret = compare_mom((double*)momA, (double*)momB, len);
01285   }else{
01286     ret = compare_mom((float*)momA, (float*)momB, len);
01287   }
01288   
01289   return ret;
01290 }
01291 
01292 
01293 /************
01294  * return value
01295  *
01296  * 0: command line option matched and processed sucessfully
01297  * non-zero: command line option does not match
01298  *
01299  */
01300 
01301 #ifdef MULTI_GPU
01302 int device = -1;
01303 #else
01304 int device = 0;
01305 #endif
01306 
01307 QudaReconstructType link_recon = QUDA_RECONSTRUCT_12;
01308 QudaReconstructType link_recon_sloppy = QUDA_RECONSTRUCT_INVALID;
01309 QudaPrecision prec = QUDA_SINGLE_PRECISION;
01310 QudaPrecision  prec_sloppy = QUDA_INVALID_PRECISION;
01311 int xdim = 24;
01312 int ydim = 24;
01313 int zdim = 24;
01314 int tdim = 24;
01315 QudaDagType dagger = QUDA_DAG_NO;
01316 extern bool kernelPackT;
01317 int gridsize_from_cmdline[4]={1,1,1,1};
01318 QudaDslashType dslash_type = QUDA_WILSON_DSLASH;
01319 char latfile[256] = "";
01320 bool tune = true;
01321 
01322 void __attribute__((weak)) usage_extra(char** argv){};
01323 
01324 void usage(char** argv )
01325 {
01326   printf("Usage: %s [options]\n", argv[0]);
01327   printf("Common options: \n");
01328 #ifndef MULTI_GPU
01329   printf("    --device <n>                              # Set the CUDA device to use (default 0, single GPU only)\n");     
01330 #endif
01331   printf("    --prec <double/single/half>               # Precision in GPU\n"); 
01332   printf("    --prec_sloppy <double/single/half>        # Sloppy precision in GPU\n"); 
01333   printf("    --recon <8/12/18>                         # Link reconstruction type\n"); 
01334   printf("    --recon_sloppy <8/12/18>                  # Sloppy link reconstruction type\n"); 
01335   printf("    --dagger                                  # Set the dagger to 1 (default 0)\n"); 
01336   printf("    --sdim <n>                                # Set space dimention(X/Y/Z) size\n"); 
01337   printf("    --xdim <n>                                # Set X dimension size(default 24)\n");     
01338   printf("    --ydim <n>                                # Set X dimension size(default 24)\n");     
01339   printf("    --zdim <n>                                # Set X dimension size(default 24)\n");     
01340   printf("    --tdim <n>                                # Set T dimension size(default 24)\n");  
01341   printf("    --xgridsize <n>                           # Set grid size in X dimension (default 1)\n");
01342   printf("    --ygridsize <n>                           # Set grid size in Y dimension (default 1)\n");
01343   printf("    --zgridsize <n>                           # Set grid size in Z dimension (default 1)\n");
01344   printf("    --tgridsize <n>                           # Set grid size in T dimension (default 1)\n");
01345   printf("    --partition <mask>                        # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
01346   printf("    --kernel_pack_t                           # Set T dimension kernel packing to be true (default false)\n");
01347   printf("    --dslash_type <type>                      # Set the dslash type, the following values are valid\n"
01348          "                                                  wilson/clover/twisted_mass/asqtad/domain_wall\n");
01349   printf("    --load-gauge file                         # Load gauge field \"file\" for the test (requires QIO)\n");
01350   printf("    --tune <true/false>                       # Whether to autotune or not (default true)\n");     
01351   printf("    --help                                    # Print out this message\n"); 
01352   usage_extra(argv); 
01353 #ifdef MULTI_GPU
01354   char msg[]="multi";
01355 #else
01356   char msg[]="single";
01357 #endif  
01358   printf("Note: this program is %s GPU build\n", msg);
01359   exit(1);
01360   return ;
01361 }
01362 
01363 int process_command_line_option(int argc, char** argv, int* idx)
01364 {
01365 #ifdef MULTI_GPU
01366   char msg[]="multi";
01367 #else
01368   char msg[]="single";
01369 #endif
01370 
01371   int ret = -1;
01372   
01373   int i = *idx;
01374 
01375   if( strcmp(argv[i], "--help")== 0){
01376     usage(argv);
01377   }
01378 
01379   if( strcmp(argv[i], "--device") == 0){
01380     if (i+1 >= argc){
01381       usage(argv);
01382     }
01383 #ifdef MULTI_GPU
01384     printf("Warning: Ignoring --device argument since this is a multi-GPU build.\n");
01385 #else
01386     device = atoi(argv[i+1]);
01387     if (device < 0 || device > 16){
01388       printf("Error: Invalid CUDA device number (%d)\n", device);
01389       usage(argv);
01390     }
01391 #endif
01392     i++;
01393     ret = 0;
01394     goto out;
01395   }
01396 
01397   if( strcmp(argv[i], "--prec") == 0){
01398     if (i+1 >= argc){
01399       usage(argv);
01400     }       
01401     prec =  get_prec(argv[i+1]);
01402     i++;
01403     ret = 0;
01404     goto out;
01405   }
01406 
01407   if( strcmp(argv[i], "--prec_sloppy") == 0){
01408     if (i+1 >= argc){
01409       usage(argv);
01410     }       
01411     prec_sloppy =  get_prec(argv[i+1]);
01412     i++;
01413     ret = 0;
01414     goto out;
01415   }
01416   
01417   if( strcmp(argv[i], "--recon") == 0){
01418     if (i+1 >= argc){
01419       usage(argv);
01420     }       
01421     link_recon =  get_recon(argv[i+1]);
01422     i++;
01423     ret = 0;
01424     goto out;
01425   }
01426 
01427   if( strcmp(argv[i], "--recon_sloppy") == 0){
01428     if (i+1 >= argc){
01429       usage(argv);
01430     }       
01431     link_recon_sloppy =  get_recon(argv[i+1]);
01432     i++;
01433     ret = 0;
01434     goto out;
01435   }
01436   
01437   if( strcmp(argv[i], "--xdim") == 0){
01438     if (i+1 >= argc){
01439       usage(argv);
01440     }
01441     xdim= atoi(argv[i+1]);
01442     if (xdim < 0 || xdim > 128){
01443       printf("ERROR: invalid X dimension (%d)\n", xdim);
01444       usage(argv);
01445     }
01446     i++;
01447     ret = 0;
01448     goto out;
01449   }
01450 
01451   if( strcmp(argv[i], "--ydim") == 0){
01452     if (i+1 >= argc){
01453       usage(argv);
01454     }
01455     ydim= atoi(argv[i+1]);
01456     if (ydim < 0 || ydim > 128){
01457       printf("ERROR: invalid T dimension (%d)\n", ydim);
01458       usage(argv);
01459     }
01460     i++;
01461     ret = 0;
01462     goto out;
01463   }
01464 
01465 
01466   if( strcmp(argv[i], "--zdim") == 0){
01467     if (i+1 >= argc){
01468       usage(argv);
01469     }
01470     zdim= atoi(argv[i+1]);
01471     if (zdim < 0 || zdim > 128){
01472       printf("ERROR: invalid T dimension (%d)\n", zdim);
01473       usage(argv);
01474     }
01475     i++;
01476     ret = 0;
01477     goto out;
01478   }
01479 
01480   if( strcmp(argv[i], "--tdim") == 0){
01481     if (i+1 >= argc){
01482       usage(argv);
01483     }       
01484     tdim =  atoi(argv[i+1]);
01485     if (tdim < 0 || tdim > 128){
01486       errorQuda("Error: invalid t dimension");
01487     }
01488     i++;
01489     ret = 0;
01490     goto out;
01491   }
01492 
01493   if( strcmp(argv[i], "--sdim") == 0){
01494     if (i+1 >= argc){
01495       usage(argv);
01496     }       
01497     int sdim =  atoi(argv[i+1]);
01498     if (sdim < 0 || sdim > 128){
01499       printfQuda("Error: invalid S dimension\n");
01500     }
01501     xdim=ydim=zdim=sdim;
01502     i++;
01503     ret = 0;
01504     goto out;
01505   }
01506   
01507   if( strcmp(argv[i], "--dagger") == 0){
01508     dagger = QUDA_DAG_YES;
01509     ret = 0;
01510     goto out;
01511   }     
01512   
01513   if( strcmp(argv[i], "--partition") == 0){
01514     if (i+1 >= argc){
01515       usage(argv);
01516     }     
01517     int value  =  atoi(argv[i+1]);
01518     for(int j=0; j < 4;j++){
01519       if (value &  (1 << j)){
01520         commDimPartitionedSet(j);
01521       }
01522     }
01523     i++;
01524     ret = 0;
01525     goto out;
01526   }
01527   
01528   if( strcmp(argv[i], "--kernel_pack_t") == 0){
01529     kernelPackT = true;
01530     ret= 0;
01531     goto out;
01532   }
01533 
01534 
01535   if( strcmp(argv[i], "--tune") == 0){
01536     if (i+1 >= argc){
01537       usage(argv);
01538     }       
01539 
01540     if (strcmp(argv[i+1], "true") == 0){
01541       tune = true;
01542     }else if (strcmp(argv[i+1], "false") == 0){
01543       tune = false;
01544     }else{
01545       fprintf(stderr, "Error: invalid tuning type\n");  
01546       exit(1);
01547     }
01548 
01549     i++;
01550     ret = 0;
01551     goto out;
01552   }
01553 
01554   if( strcmp(argv[i], "--xgridsize") == 0){
01555     if (i+1 >= argc){ 
01556       usage(argv);
01557     }     
01558     int xsize =  atoi(argv[i+1]);
01559     if (xsize <= 0 ){
01560       errorQuda("Error: invalid X grid size");
01561     }
01562     gridsize_from_cmdline[0] = xsize;
01563     i++;
01564     ret = 0;
01565     goto out;
01566   }
01567 
01568   if( strcmp(argv[i], "--ygridsize") == 0){
01569     if (i+1 >= argc){
01570       usage(argv);
01571     }     
01572     int ysize =  atoi(argv[i+1]);
01573     if (ysize <= 0 ){
01574       errorQuda("Error: invalid Y grid size");
01575     }
01576     gridsize_from_cmdline[1] = ysize;
01577     i++;
01578     ret = 0;
01579     goto out;
01580   }
01581 
01582   if( strcmp(argv[i], "--zgridsize") == 0){
01583     if (i+1 >= argc){
01584       usage(argv);
01585     }     
01586     int zsize =  atoi(argv[i+1]);
01587     if (zsize <= 0 ){
01588       errorQuda("Error: invalid Z grid size");
01589     }
01590     gridsize_from_cmdline[2] = zsize;
01591     i++;
01592     ret = 0;
01593     goto out;
01594   }
01595   
01596   if( strcmp(argv[i], "--tgridsize") == 0){
01597     if (i+1 >= argc){
01598       usage(argv);
01599     }     
01600     int tsize =  atoi(argv[i+1]);
01601     if (tsize <= 0 ){
01602       errorQuda("Error: invalid T grid size");
01603     }
01604     gridsize_from_cmdline[3] = tsize;
01605     i++;
01606     ret = 0;
01607     goto out;
01608   }
01609   
01610   if( strcmp(argv[i], "--dslash_type") == 0){
01611     if (i+1 >= argc){
01612       usage(argv);
01613     }     
01614     dslash_type =  get_dslash_type(argv[i+1]);
01615     i++;
01616     ret = 0;
01617     goto out;
01618   }
01619   
01620   if( strcmp(argv[i], "--load-gauge") == 0){
01621     if (i+1 >= argc){
01622       usage(argv);
01623     }     
01624     strcpy(latfile, argv[i+1]);
01625     i++;
01626     ret = 0;
01627     goto out;
01628   }
01629   
01630   if( strcmp(argv[i], "--version") == 0){
01631     printf("This program is linked with QUDA library, version %s,", 
01632            get_quda_ver_str());
01633     printf(" %s GPU build\n", msg);
01634     exit(0);
01635   }
01636 
01637  out:
01638   *idx = i;
01639   return ret ;
01640 
01641 }