QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
test_util.cpp
Go to the documentation of this file.
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 #if defined(QMP_COMMS)
8 #include <qmp.h>
9 #elif defined(MPI_COMMS)
10 #include <mpi.h>
11 #endif
12 
14 #include <test_util.h>
15 
16 #include <face_quda.h>
17 #include <dslash_quda.h>
18 #include "misc.h"
19 
20 using namespace std;
21 
22 #define XUP 0
23 #define YUP 1
24 #define ZUP 2
25 #define TUP 3
26 
27 
28 int Z[4];
29 int V;
30 int Vh;
31 int Vs_x, Vs_y, Vs_z, Vs_t;
33 int faceVolume[4];
34 
35 //extended volume, +4
36 int E1, E1h, E2, E3, E4;
37 int E[4];
38 int V_ex, Vh_ex;
39 
40 int Ls;
41 int V5;
42 int V5h;
43 
45 
46 extern float fat_link_max;
47 
48 
49 void initComms(int argc, char **argv, const int *commDims)
50 {
51 #if defined(QMP_COMMS)
52  QMP_thread_level_t tl;
53  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
54 #elif defined(MPI_COMMS)
55  MPI_Init(&argc, &argv);
56 #endif
57  initCommsGridQuda(4, commDims, NULL, NULL);
58  initRand();
59 }
60 
61 
63 {
64 #if defined(QMP_COMMS)
65  QMP_finalize_msg_passing();
66 #elif defined(MPI_COMMS)
67  MPI_Finalize();
68 #endif
69 }
70 
71 
72 void initRand()
73 {
74  int rank = 0;
75 
76 #if defined(QMP_COMMS)
77  rank = QMP_get_node_number();
78 #elif defined(MPI_COMMS)
79  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
80 #endif
81 
82  srand(17*rank + 137);
83 }
84 
85 
86 void setDims(int *X) {
87  V = 1;
88  for (int d=0; d< 4; d++) {
89  V *= X[d];
90  Z[d] = X[d];
91 
92  faceVolume[d] = 1;
93  for (int i=0; i<4; i++) {
94  if (i==d) continue;
95  faceVolume[d] *= X[i];
96  }
97  }
98  Vh = V/2;
99 
100  Vs_x = X[1]*X[2]*X[3];
101  Vs_y = X[0]*X[2]*X[3];
102  Vs_z = X[0]*X[1]*X[3];
103  Vs_t = X[0]*X[1]*X[2];
104 
105  Vsh_x = Vs_x/2;
106  Vsh_y = Vs_y/2;
107  Vsh_z = Vs_z/2;
108  Vsh_t = Vs_t/2;
109 
110 
111  E1=X[0]+4; E2=X[1]+4; E3=X[2]+4; E4=X[3]+4;
112  E1h=E1/2;
113  E[0] = E1;
114  E[1] = E2;
115  E[2] = E3;
116  E[3] = E4;
117  V_ex = E1*E2*E3*E4;
118  Vh_ex = V_ex/2;
119 
120 }
121 
122 
123 void dw_setDims(int *X, const int L5)
124 {
125  V = 1;
126  for (int d=0; d< 4; d++)
127  {
128  V *= X[d];
129  Z[d] = X[d];
130 
131  faceVolume[d] = 1;
132  for (int i=0; i<4; i++) {
133  if (i==d) continue;
134  faceVolume[d] *= X[i];
135  }
136  }
137  Vh = V/2;
138 
139  Ls = L5;
140  V5 = V*Ls;
141  V5h = Vh*Ls;
142 
143  Vs_t = Z[0]*Z[1]*Z[2]*Ls;//?
144  Vsh_t = Vs_t/2; //?
145 }
146 
147 
148 void setSpinorSiteSize(int n)
149 {
150  mySpinorSiteSize = n;
151 }
152 
153 
154 template <typename Float>
155 static void printVector(Float *v) {
156  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
157 }
158 
159 // X indexes the lattice site
160 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
161  if (precision == QUDA_DOUBLE_PRECISION)
162  for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
163  else
164  for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
165 }
166 
167 // X indexes the full lattice
168 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
169  if (getOddBit(X) == 0) {
170  if (precision == QUDA_DOUBLE_PRECISION)
171  for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
172  else
173  for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
174 
175  } else {
176  if (precision == QUDA_DOUBLE_PRECISION)
177  for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
178  else
179  for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
180  }
181 }
182 
183 // returns 0 or 1 if the full lattice index X is even or odd
184 int getOddBit(int Y) {
185  int x4 = Y/(Z[2]*Z[1]*Z[0]);
186  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
187  int x2 = (Y/Z[0]) % Z[1];
188  int x1 = Y % Z[0];
189  return (x4+x3+x2+x1) % 2;
190 }
191 
192 // a+=b
193 template <typename Float>
194 inline void complexAddTo(Float *a, Float *b) {
195  a[0] += b[0];
196  a[1] += b[1];
197 }
198 
199 // a = b*c
200 template <typename Float>
201 inline void complexProduct(Float *a, Float *b, Float *c) {
202  a[0] = b[0]*c[0] - b[1]*c[1];
203  a[1] = b[0]*c[1] + b[1]*c[0];
204 }
205 
206 // a = conj(b)*conj(c)
207 template <typename Float>
208 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
209  a[0] = b[0]*c[0] - b[1]*c[1];
210  a[1] = -b[0]*c[1] - b[1]*c[0];
211 }
212 
213 // a = conj(b)*c
214 template <typename Float>
215 inline void complexDotProduct(Float *a, Float *b, Float *c) {
216  a[0] = b[0]*c[0] + b[1]*c[1];
217  a[1] = b[0]*c[1] - b[1]*c[0];
218 }
219 
220 // a += b*c
221 template <typename Float>
222 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
223  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
224  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
225 }
226 
227 // a += conj(b)*c)
228 template <typename Float>
229 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
230  a[0] += b[0]*c[0] + b[1]*c[1];
231  a[1] += b[0]*c[1] - b[1]*c[0];
232 }
233 
234 template <typename Float>
235 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
236  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
237  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
238 }
239 
240 template <typename Float>
241 inline void su3Construct12(Float *mat) {
242  Float *w = mat+12;
243  w[0] = 0.0;
244  w[1] = 0.0;
245  w[2] = 0.0;
246  w[3] = 0.0;
247  w[4] = 0.0;
248  w[5] = 0.0;
249 }
250 
251 // Stabilized Bunk and Sommer
252 template <typename Float>
253 inline void su3Construct8(Float *mat) {
254  mat[0] = atan2(mat[1], mat[0]);
255  mat[1] = atan2(mat[13], mat[12]);
256  for (int i=8; i<18; i++) mat[i] = 0.0;
257 }
258 
259 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
260  if (reconstruct == QUDA_RECONSTRUCT_12) {
261  if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
262  else su3Construct12((float*)mat);
263  } else {
264  if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
265  else su3Construct8((float*)mat);
266  }
267 }
268 
269 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
270 // as the cross product of the conjugate vectors: w = u* x v*
271 //
272 // 48 flops
273 template <typename Float>
274 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
275  Float *u = &mat[0*(3*2)];
276  Float *v = &mat[1*(3*2)];
277  Float *w = &mat[2*(3*2)];
278  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
279  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
280  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
281  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
282  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
283  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
284  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
285  Float u0 = (dir < 3 ? param->anisotropy :
286  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
287  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
288 }
289 
290 template <typename Float>
291 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
292  // First reconstruct first row
293  Float row_sum = 0.0;
294  row_sum += mat[2]*mat[2];
295  row_sum += mat[3]*mat[3];
296  row_sum += mat[4]*mat[4];
297  row_sum += mat[5]*mat[5];
298  Float u0 = (dir < 3 ? param->anisotropy :
299  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
300  Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
301 
302  mat[14] = mat[0];
303  mat[15] = mat[1];
304 
305  mat[0] = U00_mag * cos(mat[14]);
306  mat[1] = U00_mag * sin(mat[14]);
307 
308  Float column_sum = 0.0;
309  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
310  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
311  Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
312 
313  mat[12] = U20_mag * cos(mat[15]);
314  mat[13] = U20_mag * sin(mat[15]);
315 
316  // First column now restored
317 
318  // finally reconstruct last elements from SU(2) rotation
319  Float r_inv2 = 1.0/(u0*row_sum);
320 
321  // U11
322  Float A[2];
323  complexDotProduct(A, mat+0, mat+6);
324  complexConjugateProduct(mat+8, mat+12, mat+4);
325  accumulateComplexProduct(mat+8, A, mat+2, u0);
326  mat[8] *= -r_inv2;
327  mat[9] *= -r_inv2;
328 
329  // U12
330  complexConjugateProduct(mat+10, mat+12, mat+2);
331  accumulateComplexProduct(mat+10, A, mat+4, -u0);
332  mat[10] *= r_inv2;
333  mat[11] *= r_inv2;
334 
335  // U21
336  complexDotProduct(A, mat+0, mat+12);
337  complexConjugateProduct(mat+14, mat+6, mat+4);
338  accumulateComplexProduct(mat+14, A, mat+2, -u0);
339  mat[14] *= r_inv2;
340  mat[15] *= r_inv2;
341 
342  // U12
343  complexConjugateProduct(mat+16, mat+6, mat+2);
344  accumulateComplexProduct(mat+16, A, mat+4, u0);
345  mat[16] *= -r_inv2;
346  mat[17] *= -r_inv2;
347 }
348 
349 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
350  if (reconstruct == QUDA_RECONSTRUCT_12) {
351  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
352  else su3Reconstruct12((float*)mat, dir, ga_idx, param);
353  } else {
354  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
355  else su3Reconstruct8((float*)mat, dir, ga_idx, param);
356  }
357 }
358 
359 /*
360  void su3_construct_8_half(float *mat, short *mat_half) {
361  su3Construct8(mat);
362 
363  mat_half[0] = floatToShort(mat[0] / M_PI);
364  mat_half[1] = floatToShort(mat[1] / M_PI);
365  for (int i=2; i<18; i++) {
366  mat_half[i] = floatToShort(mat[i]);
367  }
368  }
369 
370  void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
371 
372  for (int i=0; i<18; i++) {
373  mat[i] = shortToFloat(mat_half[i]);
374  }
375  mat[0] *= M_PI;
376  mat[1] *= M_PI;
377 
378  su3Reconstruct8(mat, dir, ga_idx, param);
379  }*/
380 
381 template <typename Float>
382 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
383  for (int i = 0; i < len; i++) {
384  double diff = fabs(a[i] - b[i]);
385  if (diff > epsilon) {
386  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
387  return 0;
388  }
389  }
390  return 1;
391 }
392 
393 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
394  if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
395  else return compareFloats((float*)a, (float*)b, len, epsilon);
396 }
397 
398 
399 
400 // given a "half index" i into either an even or odd half lattice (corresponding
401 // to oddBit = {0, 1}), returns the corresponding full lattice index.
402 int fullLatticeIndex(int i, int oddBit) {
403  /*
404  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
405  return 2*i + (boundaryCrossings + oddBit) % 2;
406  */
407 
408  int X1 = Z[0];
409  int X2 = Z[1];
410  int X3 = Z[2];
411  //int X4 = Z[3];
412  int X1h =X1/2;
413 
414  int sid =i;
415  int za = sid/X1h;
416  //int x1h = sid - za*X1h;
417  int zb = za/X2;
418  int x2 = za - zb*X2;
419  int x4 = zb/X3;
420  int x3 = zb - x4*X3;
421  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
422  //int x1 = 2*x1h + x1odd;
423  int X = 2*sid + x1odd;
424 
425  return X;
426 }
427 
428 
429 // i represents a "half index" into an even or odd "half lattice".
430 // when oddBit={0,1} the half lattice is {even,odd}.
431 //
432 // the displacements, such as dx, refer to the full lattice coordinates.
433 //
434 // neighborIndex() takes a "half index", displaces it, and returns the
435 // new "half index", which can be an index into either the even or odd lattices.
436 // displacements of magnitude one always interchange odd and even lattices.
437 //
438 
439 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
440  int Y = fullLatticeIndex(i, oddBit);
441  int x4 = Y/(Z[2]*Z[1]*Z[0]);
442  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
443  int x2 = (Y/Z[0]) % Z[1];
444  int x1 = Y % Z[0];
445 
446  // assert (oddBit == (x+y+z+t)%2);
447 
448  x4 = (x4+dx4+Z[3]) % Z[3];
449  x3 = (x3+dx3+Z[2]) % Z[2];
450  x2 = (x2+dx2+Z[1]) % Z[1];
451  x1 = (x1+dx1+Z[0]) % Z[0];
452 
453  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
454 }
455 
456 int
457 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
458 {
459  int ret;
460 
461  int Y = fullLatticeIndex(i, oddBit);
462  int x4 = Y/(Z[2]*Z[1]*Z[0]);
463  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
464  int x2 = (Y/Z[0]) % Z[1];
465  int x1 = Y % Z[0];
466 
467  int ghost_x4 = x4+ dx4;
468 
469  // assert (oddBit == (x+y+z+t)%2);
470 
471  x4 = (x4+dx4+Z[3]) % Z[3];
472  x3 = (x3+dx3+Z[2]) % Z[2];
473  x2 = (x2+dx2+Z[1]) % Z[1];
474  x1 = (x1+dx1+Z[0]) % Z[0];
475 
476  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
477  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
478  }else{
479  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
480  }
481 
482 
483  return ret;
484 }
485 
486 
487 /*
488  * This is a computation of neighbor using the full index and the displacement in each direction
489  *
490  */
491 
492 int
493 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
494 {
495  int oddBit = 0;
496  int half_idx = i;
497  if (i >= Vh){
498  oddBit =1;
499  half_idx = i - Vh;
500  }
501 
502  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
503  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
504  if (oddBitChanged){
505  oddBit = 1 - oddBit;
506  }
507  int ret = nbr_half_idx;
508  if (oddBit){
509  ret = Vh + nbr_half_idx;
510  }
511 
512  return ret;
513 }
514 
515 
516 int
517 neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
518 {
519  int ret;
520  int oddBit = 0;
521  int half_idx = i;
522  if (i >= Vh){
523  oddBit =1;
524  half_idx = i - Vh;
525  }
526 
527  int Y = fullLatticeIndex(half_idx, oddBit);
528  int x4 = Y/(Z[2]*Z[1]*Z[0]);
529  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
530  int x2 = (Y/Z[0]) % Z[1];
531  int x1 = Y % Z[0];
532  int ghost_x4 = x4+ dx4;
533 
534  x4 = (x4+dx4+Z[3]) % Z[3];
535  x3 = (x3+dx3+Z[2]) % Z[2];
536  x2 = (x2+dx2+Z[1]) % Z[1];
537  x1 = (x1+dx1+Z[0]) % Z[0];
538 
539  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
540  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
541  }else{
542  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
543  return ret;
544  }
545 
546  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
547  if (oddBitChanged){
548  oddBit = 1 - oddBit;
549  }
550 
551  if (oddBit){
552  ret += Vh;
553  }
554 
555  return ret;
556 }
557 
558 
559 // 4d checkerboard.
560 // given a "half index" i into either an even or odd half lattice (corresponding
561 // to oddBit = {0, 1}), returns the corresponding full lattice index.
562 // Cf. GPGPU code in dslash_core_ante.h.
563 // There, i is the thread index.
564 int fullLatticeIndex_4d(int i, int oddBit) {
565  if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);}
566  /*
567  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
568  return 2*i + (boundaryCrossings + oddBit) % 2;
569  */
570 
571  int X1 = Z[0];
572  int X2 = Z[1];
573  int X3 = Z[2];
574  //int X4 = Z[3];
575  int X1h =X1/2;
576 
577  int sid =i;
578  int za = sid/X1h;
579  //int x1h = sid - za*X1h;
580  int zb = za/X2;
581  int x2 = za - zb*X2;
582  int x4 = zb/X3;
583  int x3 = zb - x4*X3;
584  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
585  //int x1 = 2*x1h + x1odd;
586  int X = 2*sid + x1odd;
587 
588  return X;
589 }
590 
591 // 5d checkerboard.
592 // given a "half index" i into either an even or odd half lattice (corresponding
593 // to oddBit = {0, 1}), returns the corresponding full lattice index.
594 // Cf. GPGPU code in dslash_core_ante.h.
595 // There, i is the thread index sid.
596 // This function is used by neighborIndex_5d in dslash_reference.cpp.
597 //ok
598 int fullLatticeIndex_5d(int i, int oddBit) {
599  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
600  return 2*i + (boundaryCrossings + oddBit) % 2;
601 }
602 
603 int
605 {
606  int oddBit = 0;
607  int half_idx = i;
608  if (i >= Vh){
609  oddBit =1;
610  half_idx = i - Vh;
611  }
612 
613  int Y = fullLatticeIndex(half_idx, oddBit);
614  int x4 = Y/(Z[2]*Z[1]*Z[0]);
615 
616  return x4;
617 }
618 
619 template <typename Float>
620 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
621  // Apply spatial scaling factor (u0) to spatial links
622  for (int d = 0; d < 3; d++) {
623  for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
624  gauge[d][i] /= param->anisotropy;
625  }
626  }
627 
628  // only apply T-boundary at edge nodes
629 #ifdef MULTI_GPU
630  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
631 #else
632  bool last_node_in_t = true;
633 #endif
634 
635  // Apply boundary conditions to temporal links
636  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t) {
637  for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
638  for (int i = 0; i < gaugeSiteSize; i++) {
639  gauge[3][j*gaugeSiteSize+i] *= -1.0;
640  gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
641  }
642  }
643  }
644 
645  if (param->gauge_fix) {
646  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
647  // to simulate fixing to the temporal gauge.
648  int iMax = ( last_node_in_t ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
649  int dir = 3; // time direction only
650  Float *even = gauge[dir];
651  Float *odd = gauge[dir]+Vh*gaugeSiteSize;
652  for (int i = 0; i< iMax; i++) {
653  for (int m = 0; m < 3; m++) {
654  for (int n = 0; n < 3; n++) {
655  even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
656  even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
657  odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
658  odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
659  }
660  }
661  }
662  }
663 }
664 
665 template <typename Float>
666 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
667 {
668 
669  int X1h=param->X[0]/2;
670  int X1 =param->X[0];
671  int X2 =param->X[1];
672  int X3 =param->X[2];
673  int X4 =param->X[3];
674 
675  // rescale long links by the appropriate coefficient
676  for(int d=0; d<4; d++){
677  for(int i=0; i < V*gaugeSiteSize; i++){
678  gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
679  }
680  }
681 
682  // apply the staggered phases
683  for (int d = 0; d < 3; d++) {
684 
685  //even
686  for (int i = 0; i < Vh; i++) {
687 
688  int index = fullLatticeIndex(i, 0);
689  int i4 = index /(X3*X2*X1);
690  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
691  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
692  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
693  int sign=1;
694 
695  if (d == 0) {
696  if (i4 % 2 == 1){
697  sign= -1;
698  }
699  }
700 
701  if (d == 1){
702  if ((i4+i1) % 2 == 1){
703  sign= -1;
704  }
705  }
706  if (d == 2){
707  if ( (i4+i1+i2) % 2 == 1){
708  sign= -1;
709  }
710  }
711 
712  for (int j=0;j < 6; j++){
713  gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
714  }
715  }
716  //odd
717  for (int i = 0; i < Vh; i++) {
718  int index = fullLatticeIndex(i, 1);
719  int i4 = index /(X3*X2*X1);
720  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
721  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
722  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
723  int sign=1;
724 
725  if (d == 0) {
726  if (i4 % 2 == 1){
727  sign= -1;
728  }
729  }
730 
731  if (d == 1){
732  if ((i4+i1) % 2 == 1){
733  sign= -1;
734  }
735  }
736  if (d == 2){
737  if ( (i4+i1+i2) % 2 == 1){
738  sign = -1;
739  }
740  }
741 
742  for (int j=0;j < 6; j++){
743  gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
744  }
745  }
746 
747  }
748 
749  // Apply boundary conditions to temporal links
750  if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
751  for (int j = 0; j < Vh; j++) {
752  int sign =1;
753  if (j >= (X4-3)*X1h*X2*X3 ){
754  sign= -1;
755  }
756 
757  for (int i = 0; i < 6; i++) {
758  gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
759  gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
760  }
761  }
762  }
763 }
764 
765 
766 
767 template <typename Float>
768 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
769  Float *resOdd[4], *resEven[4];
770  for (int dir = 0; dir < 4; dir++) {
771  resEven[dir] = res[dir];
772  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
773  }
774 
775  for (int dir = 0; dir < 4; dir++) {
776  for (int i = 0; i < Vh; i++) {
777  for (int m = 0; m < 3; m++) {
778  for (int n = 0; n < 3; n++) {
779  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
780  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
781  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
782  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
783  }
784  }
785  }
786  }
787 
788  applyGaugeFieldScaling(res, Vh, param);
789 }
790 
791 // normalize the vector a
792 template <typename Float>
793 static void normalize(complex<Float> *a, int len) {
794  double sum = 0.0;
795  for (int i=0; i<len; i++) sum += norm(a[i]);
796  for (int i=0; i<len; i++) a[i] /= sqrt(sum);
797 }
798 
799 // orthogonalize vector b to vector a
800 template <typename Float>
801 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
802  complex<double> dot = 0.0;
803  for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
804  for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
805 }
806 
807 template <typename Float>
808 static void constructGaugeField(Float **res, QudaGaugeParam *param) {
809  Float *resOdd[4], *resEven[4];
810  for (int dir = 0; dir < 4; dir++) {
811  resEven[dir] = res[dir];
812  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
813  }
814 
815  for (int dir = 0; dir < 4; dir++) {
816  for (int i = 0; i < Vh; i++) {
817  for (int m = 1; m < 3; m++) { // last 2 rows
818  for (int n = 0; n < 3; n++) { // 3 columns
819  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
820  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
821  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
822  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
823  }
824  }
825  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
826  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
827  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
828 
829  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
830  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
831  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
832 
833  {
834  Float *w = resEven[dir]+(i*3+0)*3*2;
835  Float *u = resEven[dir]+(i*3+1)*3*2;
836  Float *v = resEven[dir]+(i*3+2)*3*2;
837 
838  for (int n = 0; n < 6; n++) w[n] = 0.0;
839  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
840  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
841  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
842  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
843  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
844  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
845  }
846 
847  {
848  Float *w = resOdd[dir]+(i*3+0)*3*2;
849  Float *u = resOdd[dir]+(i*3+1)*3*2;
850  Float *v = resOdd[dir]+(i*3+2)*3*2;
851 
852  for (int n = 0; n < 6; n++) w[n] = 0.0;
853  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
854  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
855  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
856  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
857  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
858  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
859  }
860 
861  }
862  }
863 
864  if (param->type == QUDA_WILSON_LINKS){
865  applyGaugeFieldScaling(res, Vh, param);
866  } else if (param->type == QUDA_ASQTAD_LONG_LINKS){
867  applyGaugeFieldScaling_long(res, Vh, param);
868  } else if (param->type == QUDA_ASQTAD_FAT_LINKS){
869  for (int dir = 0; dir < 4; dir++){
870  for (int i = 0; i < Vh; i++) {
871  for (int m = 0; m < 3; m++) { // last 2 rows
872  for (int n = 0; n < 3; n++) { // 3 columns
873  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
874  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (Float)RAND_MAX;
875  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
876  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
877  }
878  }
879  }
880  }
881 
882  }
883 
884 }
885 
886 template <typename Float>
888 {
889  Float *resOdd[4], *resEven[4];
890  for (int dir = 0; dir < 4; dir++) {
891  resEven[dir] = res[dir];
892  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
893  }
894 
895  for (int dir = 0; dir < 4; dir++) {
896  for (int i = 0; i < Vh; i++) {
897  for (int m = 1; m < 3; m++) { // last 2 rows
898  for (int n = 0; n < 3; n++) { // 3 columns
899  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
900  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
901  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
902  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
903  }
904  }
905  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
906  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
907  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
908 
909  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
910  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
911  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
912 
913  {
914  Float *w = resEven[dir]+(i*3+0)*3*2;
915  Float *u = resEven[dir]+(i*3+1)*3*2;
916  Float *v = resEven[dir]+(i*3+2)*3*2;
917 
918  for (int n = 0; n < 6; n++) w[n] = 0.0;
919  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
920  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
921  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
922  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
923  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
924  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
925  }
926 
927  {
928  Float *w = resOdd[dir]+(i*3+0)*3*2;
929  Float *u = resOdd[dir]+(i*3+1)*3*2;
930  Float *v = resOdd[dir]+(i*3+2)*3*2;
931 
932  for (int n = 0; n < 6; n++) w[n] = 0.0;
933  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
934  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
935  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
936  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
937  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
938  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
939  }
940 
941  }
942  }
943 }
944 
945 
946 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
947  if (type == 0) {
948  if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
949  else constructUnitGaugeField((float**)gauge, param);
950  } else if (type == 1) {
951  if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
952  else constructGaugeField((float**)gauge, param);
953  } else {
954  if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
955  else applyGaugeFieldScaling((float**)gauge, Vh, param);
956  }
957 
958 }
959 
960 void
962  int type, QudaPrecision precision, QudaGaugeParam* param)
963 {
964  if (type == 0) {
965  if (precision == QUDA_DOUBLE_PRECISION) {
966  constructUnitGaugeField((double**)fatlink, param);
967  constructUnitGaugeField((double**)longlink, param);
968  }else {
969  constructUnitGaugeField((float**)fatlink, param);
970  constructUnitGaugeField((float**)longlink, param);
971  }
972  } else {
973  if (precision == QUDA_DOUBLE_PRECISION) {
974  param->type = QUDA_ASQTAD_FAT_LINKS;
975  constructGaugeField((double**)fatlink, param);
976  param->type = QUDA_ASQTAD_LONG_LINKS;
977  constructGaugeField((double**)longlink, param);
978  }else {
979  param->type = QUDA_ASQTAD_FAT_LINKS;
980  constructGaugeField((float**)fatlink, param);
981  param->type = QUDA_ASQTAD_LONG_LINKS;
982  constructGaugeField((float**)longlink, param);
983  }
984  }
985 }
986 
987 
988 template <typename Float>
989 static void constructCloverField(Float *res, double norm, double diag) {
990 
991  Float c = 2.0 * norm / RAND_MAX;
992 
993  for(int i = 0; i < V; i++) {
994  for (int j = 0; j < 72; j++) {
995  res[i*72 + j] = c*rand() - norm;
996  }
997  for (int j = 0; j< 6; j++) {
998  res[i*72 + j] += diag;
999  res[i*72 + j+36] += diag;
1000  }
1001  }
1002 }
1003 
1004 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
1005 
1006  if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
1007  else constructCloverField((float *)clover, norm, diag);
1008 }
1009 
1010 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
1011  printf("Reference:\n");
1012  printSpinorElement(spinorRef, 0, prec); printf("...\n");
1013  printSpinorElement(spinorRef, len-1, prec); printf("\n");
1014 
1015  printf("\nCUDA:\n");
1016  printSpinorElement(spinorGPU, 0, prec); printf("...\n");
1017  printSpinorElement(spinorGPU, len-1, prec); printf("\n");
1018 
1019  compare_spinor(spinorRef, spinorGPU, len, prec);
1020  }*/
1021 
1022 template <typename Float>
1023 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
1024 
1025  const int fail_check = 17;
1026  int fail[4][fail_check];
1027  int iter[4][18];
1028  for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
1029  for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
1030 
1031  for (int d=0; d<4; d++) {
1032  for (int eo=0; eo<2; eo++) {
1033  for (int i=0; i<Vh; i++) {
1034  int ga_idx = (eo*Vh+i);
1035  for (int j=0; j<18; j++) {
1036  double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1037 
1038  for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1039  if (diff > epsilon) iter[d][j]++;
1040  }
1041  }
1042  }
1043  }
1044 
1045  printf("Component fails (X, Y, Z, T)\n");
1046  for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1047 
1048  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1049  for (int f=0; f<fail_check; f++) {
1050  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%e, %e, %e, %e)\n", pow(10.0,-(f+1)),
1051  fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1052  fail[0][f]/(double)(V*18), fail[1][f]/(double)(V*18), fail[2][f]/(double)(V*18), fail[3][f]/(double)(V*18));
1053  }
1054 
1055 }
1056 
1057 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
1058  if (precision == QUDA_DOUBLE_PRECISION)
1059  checkGauge((double**)oldG, (double**)newG, epsilon);
1060  else
1061  checkGauge((float**)oldG, (float**)newG, epsilon);
1062 }
1063 
1064 
1065 
1066 void
1067 createSiteLinkCPU(void** link, QudaPrecision precision, int phase)
1068 {
1069 
1070  if (precision == QUDA_DOUBLE_PRECISION) {
1071  constructUnitaryGaugeField((double**)link);
1072  }else {
1073  constructUnitaryGaugeField((float**)link);
1074  }
1075 
1076  // only apply temporal boundary condition if I'm the last node in T
1077 #ifdef MULTI_GPU
1078  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
1079 #else
1080  bool last_node_in_t = true;
1081 #endif
1082 
1083  if(phase){
1084 
1085  for(int i=0;i < V;i++){
1086  for(int dir =XUP; dir <= TUP; dir++){
1087  int idx = i;
1088  int oddBit =0;
1089  if (i >= Vh) {
1090  idx = i - Vh;
1091  oddBit = 1;
1092  }
1093 
1094  int X1 = Z[0];
1095  int X2 = Z[1];
1096  int X3 = Z[2];
1097  int X4 = Z[3];
1098 
1099  int full_idx = fullLatticeIndex(idx, oddBit);
1100  int i4 = full_idx /(X3*X2*X1);
1101  int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
1102  int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1103  int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
1104 
1105  double coeff= 1.0;
1106  switch(dir){
1107  case XUP:
1108  if ( (i4 & 1) != 0){
1109  coeff *= -1;
1110  }
1111  break;
1112 
1113  case YUP:
1114  if ( ((i4+i1) & 1) != 0){
1115  coeff *= -1;
1116  }
1117  break;
1118 
1119  case ZUP:
1120  if ( ((i4+i1+i2) & 1) != 0){
1121  coeff *= -1;
1122  }
1123  break;
1124 
1125  case TUP:
1126  if (last_node_in_t && i4 == (X4-1)){
1127  coeff *= -1;
1128  }
1129  break;
1130 
1131  default:
1132  printf("ERROR: wrong dir(%d)\n", dir);
1133  exit(1);
1134  }
1135 
1136  if (precision == QUDA_DOUBLE_PRECISION){
1137  //double* mylink = (double*)link;
1138  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1139  double* mylink = (double*)link[dir];
1140  mylink = mylink + i*gaugeSiteSize;
1141 
1142  mylink[12] *= coeff;
1143  mylink[13] *= coeff;
1144  mylink[14] *= coeff;
1145  mylink[15] *= coeff;
1146  mylink[16] *= coeff;
1147  mylink[17] *= coeff;
1148 
1149  }else{
1150  //float* mylink = (float*)link;
1151  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1152  float* mylink = (float*)link[dir];
1153  mylink = mylink + i*gaugeSiteSize;
1154 
1155  mylink[12] *= coeff;
1156  mylink[13] *= coeff;
1157  mylink[14] *= coeff;
1158  mylink[15] *= coeff;
1159  mylink[16] *= coeff;
1160  mylink[17] *= coeff;
1161 
1162  }
1163  }
1164  }
1165  }
1166 
1167 
1168 #if 1
1169  for(int dir= 0;dir < 4;dir++){
1170  for(int i=0;i< V*gaugeSiteSize;i++){
1171  if (precision ==QUDA_SINGLE_PRECISION){
1172  float* f = (float*)link[dir];
1173  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1174  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1175  exit(1);
1176  }
1177  }else{
1178  double* f = (double*)link[dir];
1179  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1180  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1181  exit(1);
1182  }
1183 
1184  }
1185 
1186  }
1187  }
1188 #endif
1189 
1190  return;
1191 }
1192 
1193 
1194 
1195 
1196 template <typename Float>
1197 int compareLink(Float **linkA, Float **linkB, int len) {
1198  const int fail_check = 16;
1199  int fail[fail_check];
1200  for (int f=0; f<fail_check; f++) fail[f] = 0;
1201 
1202  int iter[18];
1203  for (int i=0; i<18; i++) iter[i] = 0;
1204 
1205  for(int dir=0;dir < 4; dir++){
1206  for (int i=0; i<len; i++) {
1207  for (int j=0; j<18; j++) {
1208  int is = i*18+j;
1209  double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1210  for (int f=0; f<fail_check; f++)
1211  if (diff > pow(10.0,-(f+1))) fail[f]++;
1212  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1213  if (diff > 1e-3) iter[j]++;
1214  }
1215  }
1216  }
1217 
1218  for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1219 
1220  int accuracy_level = 0;
1221  for(int f =0; f < fail_check; f++){
1222  if(fail[f] == 0){
1223  accuracy_level =f;
1224  }
1225  }
1226 
1227  for (int f=0; f<fail_check; f++) {
1228  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (double)(4*len*18));
1229  }
1230 
1231  return accuracy_level;
1232 }
1233 
1234 static int
1235 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1236 {
1237  int ret;
1238 
1239  if (precision == QUDA_DOUBLE_PRECISION){
1240  ret = compareLink((double**)linkA, (double**)linkB, len);
1241  }else {
1242  ret = compareLink((float**)linkA, (float**)linkB, len);
1243  }
1244 
1245  return ret;
1246 }
1247 
1248 
1249 // X indexes the lattice site
1250 static void
1251 printLinkElement(void *link, int X, QudaPrecision precision)
1252 {
1253  if (precision == QUDA_DOUBLE_PRECISION){
1254  for(int i=0; i < 3;i++){
1255  printVector((double*)link+ X*gaugeSiteSize + i*6);
1256  }
1257 
1258  }
1259  else{
1260  for(int i=0;i < 3;i++){
1261  printVector((float*)link+X*gaugeSiteSize + i*6);
1262  }
1263  }
1264 }
1265 
1266 int strong_check_link(void** linkA, const char* msgA,
1267  void **linkB, const char* msgB,
1268  int len, QudaPrecision prec)
1269 {
1270  printfQuda("%s\n", msgA);
1271  printLinkElement(linkA[0], 0, prec);
1272  printfQuda("\n");
1273  printLinkElement(linkA[0], 1, prec);
1274  printfQuda("...\n");
1275  printLinkElement(linkA[3], len-1, prec);
1276  printfQuda("\n");
1277 
1278  printfQuda("\n%s\n", msgB);
1279  printLinkElement(linkB[0], 0, prec);
1280  printfQuda("\n");
1281  printLinkElement(linkB[0], 1, prec);
1282  printfQuda("...\n");
1283  printLinkElement(linkB[3], len-1, prec);
1284  printfQuda("\n");
1285 
1286  int ret = compare_link(linkA, linkB, len, prec);
1287  return ret;
1288 }
1289 
1290 
1291 void
1292 createMomCPU(void* mom, QudaPrecision precision)
1293 {
1294  void* temp;
1295 
1296  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1297  temp = malloc(4*V*gaugeSiteSize*gSize);
1298  if (temp == NULL){
1299  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1300  exit(1);
1301  }
1302 
1303 
1304 
1305  for(int i=0;i < V;i++){
1306  if (precision == QUDA_DOUBLE_PRECISION){
1307  for(int dir=0;dir < 4;dir++){
1308  double* thismom = (double*)mom;
1309  for(int k=0; k < momSiteSize; k++){
1310  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1311  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1312  }
1313  }
1314  }else{
1315  for(int dir=0;dir < 4;dir++){
1316  float* thismom=(float*)mom;
1317  for(int k=0; k < momSiteSize; k++){
1318  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1319  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1320  }
1321  }
1322  }
1323  }
1324 
1325  free(temp);
1326  return;
1327 }
1328 
1329 void
1330 createHwCPU(void* hw, QudaPrecision precision)
1331 {
1332  for(int i=0;i < V;i++){
1333  if (precision == QUDA_DOUBLE_PRECISION){
1334  for(int dir=0;dir < 4;dir++){
1335  double* thishw = (double*)hw;
1336  for(int k=0; k < hwSiteSize; k++){
1337  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1338  }
1339  }
1340  }else{
1341  for(int dir=0;dir < 4;dir++){
1342  float* thishw=(float*)hw;
1343  for(int k=0; k < hwSiteSize; k++){
1344  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1345  }
1346  }
1347  }
1348  }
1349 
1350  return;
1351 }
1352 
1353 
1354 template <typename Float>
1355 int compare_mom(Float *momA, Float *momB, int len) {
1356  const int fail_check = 16;
1357  int fail[fail_check];
1358  for (int f=0; f<fail_check; f++) fail[f] = 0;
1359 
1360  int iter[momSiteSize];
1361  for (int i=0; i<momSiteSize; i++) iter[i] = 0;
1362 
1363  for (int i=0; i<len; i++) {
1364  for (int j=0; j<momSiteSize; j++) {
1365  int is = i*momSiteSize+j;
1366  double diff = fabs(momA[is]-momB[is]);
1367  for (int f=0; f<fail_check; f++)
1368  if (diff > pow(10.0,-(f+1))) fail[f]++;
1369  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1370  if (diff > 1e-3) iter[j]++;
1371  }
1372  }
1373 
1374  int accuracy_level = 0;
1375  for(int f =0; f < fail_check; f++){
1376  if(fail[f] == 0){
1377  accuracy_level =f+1;
1378  }
1379  }
1380 
1381  for (int i=0; i<momSiteSize; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1382 
1383  for (int f=0; f<fail_check; f++) {
1384  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*momSiteSize, fail[f] / (double)(len*6));
1385  }
1386 
1387  return accuracy_level;
1388 }
1389 
1390 static void
1391 printMomElement(void *mom, int X, QudaPrecision precision)
1392 {
1393  if (precision == QUDA_DOUBLE_PRECISION){
1394  double* thismom = ((double*)mom)+ X*momSiteSize;
1395  printVector(thismom);
1396  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1397  }else{
1398  float* thismom = ((float*)mom)+ X*momSiteSize;
1399  printVector(thismom);
1400  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1401  }
1402 }
1403 int strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec)
1404 {
1405  printfQuda("mom:\n");
1406  printMomElement(momA, 0, prec);
1407  printfQuda("\n");
1408  printMomElement(momA, 1, prec);
1409  printfQuda("\n");
1410  printMomElement(momA, 2, prec);
1411  printfQuda("\n");
1412  printMomElement(momA, 3, prec);
1413  printfQuda("...\n");
1414 
1415  printfQuda("\nreference mom:\n");
1416  printMomElement(momB, 0, prec);
1417  printfQuda("\n");
1418  printMomElement(momB, 1, prec);
1419  printfQuda("\n");
1420  printMomElement(momB, 2, prec);
1421  printfQuda("\n");
1422  printMomElement(momB, 3, prec);
1423  printfQuda("\n");
1424 
1425  int ret;
1426  if (prec == QUDA_DOUBLE_PRECISION){
1427  ret = compare_mom((double*)momA, (double*)momB, len);
1428  }else{
1429  ret = compare_mom((float*)momA, (float*)momB, len);
1430  }
1431 
1432  return ret;
1433 }
1434 
1435 
1436 /************
1437  * return value
1438  *
1439  * 0: command line option matched and processed sucessfully
1440  * non-zero: command line option does not match
1441  *
1442  */
1443 
1444 #ifdef MULTI_GPU
1445 int device = -1;
1446 #else
1447 int device = 0;
1448 #endif
1449 
1454 int xdim = 24;
1455 int ydim = 24;
1456 int zdim = 24;
1457 int tdim = 24;
1458 int Lsdim = 16;
1460 int gridsize_from_cmdline[4] = {1,1,1,1};
1462 char latfile[256] = "";
1463 bool tune = true;
1464 int niter = 10;
1465 int test_type = 0;
1466 
1467 static int dim_partitioned[4] = {0,0,0,0};
1468 
1469 int dimPartitioned(int dim)
1470 {
1471  return ((gridsize_from_cmdline[dim] > 1) || dim_partitioned[dim]);
1472 }
1473 
1474 
1475 void __attribute__((weak)) usage_extra(char** argv){};
1476 
1477 void usage(char** argv )
1478 {
1479  printf("Usage: %s [options]\n", argv[0]);
1480  printf("Common options: \n");
1481 #ifndef MULTI_GPU
1482  printf(" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1483 #endif
1484  printf(" --prec <double/single/half> # Precision in GPU\n");
1485  printf(" --prec_sloppy <double/single/half> # Sloppy precision in GPU\n");
1486  printf(" --recon <8/12/18> # Link reconstruction type\n");
1487  printf(" --recon_sloppy <8/12/18> # Sloppy link reconstruction type\n");
1488  printf(" --dagger # Set the dagger to 1 (default 0)\n");
1489  printf(" --sdim <n> # Set space dimention(X/Y/Z) size\n");
1490  printf(" --xdim <n> # Set X dimension size(default 24)\n");
1491  printf(" --ydim <n> # Set X dimension size(default 24)\n");
1492  printf(" --zdim <n> # Set X dimension size(default 24)\n");
1493  printf(" --tdim <n> # Set T dimension size(default 24)\n");
1494  printf(" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1495  printf(" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1496  printf(" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1497  printf(" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1498  printf(" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1499  printf(" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1500  printf(" --kernel_pack_t # Set T dimension kernel packing to be true (default false)\n");
1501  printf(" --dslash_type <type> # Set the dslash type, the following values are valid\n"
1502  " wilson/clover/twisted_mass/asqtad/domain_wall\n");
1503  printf(" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1504  printf(" --niter <n> # The number of iterations to perform (default 10)\n");
1505  printf(" --tune <true/false> # Whether to autotune or not (default true)\n");
1506  printf(" --test # Test method (different for each test)\n");
1507  printf(" --help # Print out this message\n");
1508  usage_extra(argv);
1509 #ifdef MULTI_GPU
1510  char msg[]="multi";
1511 #else
1512  char msg[]="single";
1513 #endif
1514  printf("Note: this program is %s GPU build\n", msg);
1515  exit(1);
1516  return ;
1517 }
1518 
1519 int process_command_line_option(int argc, char** argv, int* idx)
1520 {
1521 #ifdef MULTI_GPU
1522  char msg[]="multi";
1523 #else
1524  char msg[]="single";
1525 #endif
1526 
1527  int ret = -1;
1528 
1529  int i = *idx;
1530 
1531  if( strcmp(argv[i], "--help")== 0){
1532  usage(argv);
1533  }
1534 
1535  if( strcmp(argv[i], "--device") == 0){
1536  if (i+1 >= argc){
1537  usage(argv);
1538  }
1539  device = atoi(argv[i+1]);
1540  if (device < 0 || device > 16){
1541  printf("ERROR: Invalid CUDA device number (%d)\n", device);
1542  usage(argv);
1543  }
1544  i++;
1545  ret = 0;
1546  goto out;
1547  }
1548 
1549  if( strcmp(argv[i], "--prec") == 0){
1550  if (i+1 >= argc){
1551  usage(argv);
1552  }
1553  prec = get_prec(argv[i+1]);
1554  i++;
1555  ret = 0;
1556  goto out;
1557  }
1558 
1559  if( strcmp(argv[i], "--prec_sloppy") == 0){
1560  if (i+1 >= argc){
1561  usage(argv);
1562  }
1563  prec_sloppy = get_prec(argv[i+1]);
1564  i++;
1565  ret = 0;
1566  goto out;
1567  }
1568 
1569  if( strcmp(argv[i], "--recon") == 0){
1570  if (i+1 >= argc){
1571  usage(argv);
1572  }
1573  link_recon = get_recon(argv[i+1]);
1574  i++;
1575  ret = 0;
1576  goto out;
1577  }
1578 
1579  if( strcmp(argv[i], "--recon_sloppy") == 0){
1580  if (i+1 >= argc){
1581  usage(argv);
1582  }
1583  link_recon_sloppy = get_recon(argv[i+1]);
1584  i++;
1585  ret = 0;
1586  goto out;
1587  }
1588 
1589  if( strcmp(argv[i], "--xdim") == 0){
1590  if (i+1 >= argc){
1591  usage(argv);
1592  }
1593  xdim= atoi(argv[i+1]);
1594  if (xdim < 0 || xdim > 512){
1595  printf("ERROR: invalid X dimension (%d)\n", xdim);
1596  usage(argv);
1597  }
1598  i++;
1599  ret = 0;
1600  goto out;
1601  }
1602 
1603  if( strcmp(argv[i], "--ydim") == 0){
1604  if (i+1 >= argc){
1605  usage(argv);
1606  }
1607  ydim= atoi(argv[i+1]);
1608  if (ydim < 0 || ydim > 512){
1609  printf("ERROR: invalid T dimension (%d)\n", ydim);
1610  usage(argv);
1611  }
1612  i++;
1613  ret = 0;
1614  goto out;
1615  }
1616 
1617 
1618  if( strcmp(argv[i], "--zdim") == 0){
1619  if (i+1 >= argc){
1620  usage(argv);
1621  }
1622  zdim= atoi(argv[i+1]);
1623  if (zdim < 0 || zdim > 512){
1624  printf("ERROR: invalid T dimension (%d)\n", zdim);
1625  usage(argv);
1626  }
1627  i++;
1628  ret = 0;
1629  goto out;
1630  }
1631 
1632  if( strcmp(argv[i], "--tdim") == 0){
1633  if (i+1 >= argc){
1634  usage(argv);
1635  }
1636  tdim = atoi(argv[i+1]);
1637  if (tdim < 0 || tdim > 512){
1638  errorQuda("Error: invalid t dimension");
1639  }
1640  i++;
1641  ret = 0;
1642  goto out;
1643  }
1644 
1645  if( strcmp(argv[i], "--sdim") == 0){
1646  if (i+1 >= argc){
1647  usage(argv);
1648  }
1649  int sdim = atoi(argv[i+1]);
1650  if (sdim < 0 || sdim > 512){
1651  printfQuda("ERROR: invalid S dimension\n");
1652  }
1653  xdim=ydim=zdim=sdim;
1654  i++;
1655  ret = 0;
1656  goto out;
1657  }
1658 
1659  if( strcmp(argv[i], "--Lsdim") == 0){
1660  if (i+1 >= argc){
1661  usage(argv);
1662  }
1663  int Ls = atoi(argv[i+1]);
1664  if (Ls < 0 || Ls > 128){
1665  printfQuda("ERROR: invalid Ls dimension\n");
1666  }
1667  Lsdim=Ls;
1668  i++;
1669  ret = 0;
1670  goto out;
1671  }
1672 
1673  if( strcmp(argv[i], "--dagger") == 0){
1674  dagger = QUDA_DAG_YES;
1675  ret = 0;
1676  goto out;
1677  }
1678 
1679  if( strcmp(argv[i], "--partition") == 0){
1680  if (i+1 >= argc){
1681  usage(argv);
1682  }
1683 #ifdef MULTI_GPU
1684  int value = atoi(argv[i+1]);
1685  for(int j=0; j < 4;j++){
1686  if (value & (1 << j)){
1688  dim_partitioned[j] = 1;
1689  }
1690  }
1691 #else
1692  printfQuda("WARNING: Ignoring --partition option since this is a single-GPU build.\n");
1693 #endif
1694  i++;
1695  ret = 0;
1696  goto out;
1697  }
1698 
1699  if( strcmp(argv[i], "--kernel_pack_t") == 0){
1700  quda::setKernelPackT(true);
1701  ret= 0;
1702  goto out;
1703  }
1704 
1705 
1706  if( strcmp(argv[i], "--tune") == 0){
1707  if (i+1 >= argc){
1708  usage(argv);
1709  }
1710 
1711  if (strcmp(argv[i+1], "true") == 0){
1712  tune = true;
1713  }else if (strcmp(argv[i+1], "false") == 0){
1714  tune = false;
1715  }else{
1716  fprintf(stderr, "ERROR: invalid tuning type\n");
1717  exit(1);
1718  }
1719 
1720  i++;
1721  ret = 0;
1722  goto out;
1723  }
1724 
1725  if( strcmp(argv[i], "--xgridsize") == 0){
1726  if (i+1 >= argc){
1727  usage(argv);
1728  }
1729  int xsize = atoi(argv[i+1]);
1730  if (xsize <= 0 ){
1731  errorQuda("ERROR: invalid X grid size");
1732  }
1733  gridsize_from_cmdline[0] = xsize;
1734  i++;
1735  ret = 0;
1736  goto out;
1737  }
1738 
1739  if( strcmp(argv[i], "--ygridsize") == 0){
1740  if (i+1 >= argc){
1741  usage(argv);
1742  }
1743  int ysize = atoi(argv[i+1]);
1744  if (ysize <= 0 ){
1745  errorQuda("ERROR: invalid Y grid size");
1746  }
1747  gridsize_from_cmdline[1] = ysize;
1748  i++;
1749  ret = 0;
1750  goto out;
1751  }
1752 
1753  if( strcmp(argv[i], "--zgridsize") == 0){
1754  if (i+1 >= argc){
1755  usage(argv);
1756  }
1757  int zsize = atoi(argv[i+1]);
1758  if (zsize <= 0 ){
1759  errorQuda("ERROR: invalid Z grid size");
1760  }
1761  gridsize_from_cmdline[2] = zsize;
1762  i++;
1763  ret = 0;
1764  goto out;
1765  }
1766 
1767  if( strcmp(argv[i], "--tgridsize") == 0){
1768  if (i+1 >= argc){
1769  usage(argv);
1770  }
1771  int tsize = atoi(argv[i+1]);
1772  if (tsize <= 0 ){
1773  errorQuda("ERROR: invalid T grid size");
1774  }
1775  gridsize_from_cmdline[3] = tsize;
1776  i++;
1777  ret = 0;
1778  goto out;
1779  }
1780 
1781  if( strcmp(argv[i], "--dslash_type") == 0){
1782  if (i+1 >= argc){
1783  usage(argv);
1784  }
1785  dslash_type = get_dslash_type(argv[i+1]);
1786  i++;
1787  ret = 0;
1788  goto out;
1789  }
1790 
1791  if( strcmp(argv[i], "--load-gauge") == 0){
1792  if (i+1 >= argc){
1793  usage(argv);
1794  }
1795  strcpy(latfile, argv[i+1]);
1796  i++;
1797  ret = 0;
1798  goto out;
1799  }
1800 
1801  if( strcmp(argv[i], "--test") == 0){
1802  if (i+1 >= argc){
1803  usage(argv);
1804  }
1805  test_type = atoi(argv[i+1]);
1806  i++;
1807  ret = 0;
1808  goto out;
1809  }
1810 
1811  if( strcmp(argv[i], "--niter") == 0){
1812  if (i+1 >= argc){
1813  usage(argv);
1814  }
1815  niter= atoi(argv[i+1]);
1816  if (niter < 1 || niter > 1e6){
1817  printf("ERROR: invalid number of iterations (%d)\n", niter);
1818  usage(argv);
1819  }
1820  i++;
1821  ret = 0;
1822  goto out;
1823  }
1824 
1825  if( strcmp(argv[i], "--version") == 0){
1826  printf("This program is linked with QUDA library, version %s,",
1827  get_quda_ver_str());
1828  printf(" %s GPU build\n", msg);
1829  exit(0);
1830  }
1831 
1832  out:
1833  *idx = i;
1834  return ret ;
1835 
1836 }
1837 
1838 
1839 static struct timeval startTime;
1840 
1842  gettimeofday(&startTime, NULL);
1843 }
1844 
1846  struct timeval endTime;
1847  gettimeofday(&endTime, NULL);
1848 
1849  long ds = endTime.tv_sec - startTime.tv_sec;
1850  long dus = endTime.tv_usec - startTime.tv_usec;
1851  return ds + 0.000001*dus;
1852 }
1853 
1854