QUDA  0.9.0
test_util.cpp
Go to the documentation of this file.
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 #if defined(QMP_COMMS)
8 #include <qmp.h>
9 #elif defined(MPI_COMMS)
10 #include <mpi.h>
11 #endif
12 
14 #include <test_util.h>
15 
16 #include <dslash_quda.h>
17 #include "misc.h"
18 
19 using namespace std;
20 
21 #define XUP 0
22 #define YUP 1
23 #define ZUP 2
24 #define TUP 3
25 
26 
27 int Z[4];
28 int V;
29 int Vh;
30 int Vs_x, Vs_y, Vs_z, Vs_t;
32 int faceVolume[4];
33 
34 //extended volume, +4
35 int E1, E1h, E2, E3, E4;
36 int E[4];
37 int V_ex, Vh_ex;
38 
39 int Ls;
40 int V5;
41 int V5h;
42 
44 
45 extern float fat_link_max;
46 
50 int gridsize_from_cmdline[4] = {1,1,1,1};
51 
52 static int lex_rank_from_coords_t(const int *coords, void *fdata)
53 {
54  int rank = coords[0];
55  for (int i = 1; i < 4; i++) {
56  rank = gridsize_from_cmdline[i] * rank + coords[i];
57  }
58  return rank;
59 }
60 
61 static int lex_rank_from_coords_x(const int *coords, void *fdata)
62 {
63  int rank = coords[3];
64  for (int i = 2; i >= 0; i--) {
65  rank = gridsize_from_cmdline[i] * rank + coords[i];
66  }
67  return rank;
68 }
69 
70 static int rank_order = 0;
71 
72 void initComms(int argc, char **argv, const int *commDims)
73 {
74 #if defined(QMP_COMMS)
75  QMP_thread_level_t tl;
76  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
77 
78  // FIXME? - tests crash without this
79  QMP_declare_logical_topology(commDims, 4);
80 
81 #elif defined(MPI_COMMS)
82 #ifdef PTHREADS
83  int provided;
84  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
85 #else
86  MPI_Init(&argc, &argv);
87 #endif
88 
89 #endif
91 
92  initCommsGridQuda(4, commDims, func, NULL);
93  initRand();
94 
95  printfQuda("Rank order is %s major (%s running fastest)\n",
96  rank_order == 0 ? "column" : "row", rank_order == 0 ? "t" : "x");
97 
98 #ifdef HAVE_QIO
99  int partitioned = 0;
100  for (int i=0; i<4; i++) if (comm_dim(i) > 1) partitioned++;
101  if (rank_order == 0 && partitioned > 1)
102  errorQuda("Use of QIO is not supported with column-major process ordering, use row-major instead (--rank-order row)");
103 #endif
104 }
105 
106 
108 {
109 #if defined(QMP_COMMS)
110  QMP_finalize_msg_passing();
111 #elif defined(MPI_COMMS)
112  MPI_Finalize();
113 #endif
114 }
115 
116 
117 void initRand()
118 {
119  int rank = 0;
120 
121 #if defined(QMP_COMMS)
122  rank = QMP_get_node_number();
123 #elif defined(MPI_COMMS)
124  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
125 #endif
126 
127  srand(17*rank + 137);
128 }
129 
130 void setDims(int *X) {
131  V = 1;
132  for (int d=0; d< 4; d++) {
133  V *= X[d];
134  Z[d] = X[d];
135 
136  faceVolume[d] = 1;
137  for (int i=0; i<4; i++) {
138  if (i==d) continue;
139  faceVolume[d] *= X[i];
140  }
141  }
142  Vh = V/2;
143 
144  Vs_x = X[1]*X[2]*X[3];
145  Vs_y = X[0]*X[2]*X[3];
146  Vs_z = X[0]*X[1]*X[3];
147  Vs_t = X[0]*X[1]*X[2];
148 
149  Vsh_x = Vs_x/2;
150  Vsh_y = Vs_y/2;
151  Vsh_z = Vs_z/2;
152  Vsh_t = Vs_t/2;
153 
154 
155  E1=X[0]+4; E2=X[1]+4; E3=X[2]+4; E4=X[3]+4;
156  E1h=E1/2;
157  E[0] = E1;
158  E[1] = E2;
159  E[2] = E3;
160  E[3] = E4;
161  V_ex = E1*E2*E3*E4;
162  Vh_ex = V_ex/2;
163 
164 }
165 
166 
167 void dw_setDims(int *X, const int L5)
168 {
169  V = 1;
170  for (int d=0; d< 4; d++)
171  {
172  V *= X[d];
173  Z[d] = X[d];
174 
175  faceVolume[d] = 1;
176  for (int i=0; i<4; i++) {
177  if (i==d) continue;
178  faceVolume[d] *= X[i];
179  }
180  }
181  Vh = V/2;
182 
183  Ls = L5;
184  V5 = V*Ls;
185  V5h = Vh*Ls;
186 
187  Vs_t = Z[0]*Z[1]*Z[2]*Ls;//?
188  Vsh_t = Vs_t/2; //?
189 }
190 
191 
193 {
195 }
196 
197 
198 template <typename Float>
199 static void printVector(Float *v) {
200  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
201 }
202 
203 // X indexes the lattice site
204 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
205  if (precision == QUDA_DOUBLE_PRECISION)
206  for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
207  else
208  for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
209 }
210 
211 // X indexes the full lattice
212 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
213  if (getOddBit(X) == 0) {
214  if (precision == QUDA_DOUBLE_PRECISION)
215  for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
216  else
217  for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
218 
219  } else {
220  if (precision == QUDA_DOUBLE_PRECISION)
221  for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
222  else
223  for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
224  }
225 }
226 
227 // returns 0 or 1 if the full lattice index X is even or odd
228 int getOddBit(int Y) {
229  int x4 = Y/(Z[2]*Z[1]*Z[0]);
230  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
231  int x2 = (Y/Z[0]) % Z[1];
232  int x1 = Y % Z[0];
233  return (x4+x3+x2+x1) % 2;
234 }
235 
236 // a+=b
237 template <typename Float>
238 inline void complexAddTo(Float *a, Float *b) {
239  a[0] += b[0];
240  a[1] += b[1];
241 }
242 
243 // a = b*c
244 template <typename Float>
245 inline void complexProduct(Float *a, Float *b, Float *c) {
246  a[0] = b[0]*c[0] - b[1]*c[1];
247  a[1] = b[0]*c[1] + b[1]*c[0];
248 }
249 
250 // a = conj(b)*conj(c)
251 template <typename Float>
252 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
253  a[0] = b[0]*c[0] - b[1]*c[1];
254  a[1] = -b[0]*c[1] - b[1]*c[0];
255 }
256 
257 // a = conj(b)*c
258 template <typename Float>
259 inline void complexDotProduct(Float *a, Float *b, Float *c) {
260  a[0] = b[0]*c[0] + b[1]*c[1];
261  a[1] = b[0]*c[1] - b[1]*c[0];
262 }
263 
264 // a += b*c
265 template <typename Float>
266 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
267  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
268  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
269 }
270 
271 // a += conj(b)*c)
272 template <typename Float>
273 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
274  a[0] += b[0]*c[0] + b[1]*c[1];
275  a[1] += b[0]*c[1] - b[1]*c[0];
276 }
277 
278 template <typename Float>
279 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
280  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
281  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
282 }
283 
284 template <typename Float>
285 inline void su3Construct12(Float *mat) {
286  Float *w = mat+12;
287  w[0] = 0.0;
288  w[1] = 0.0;
289  w[2] = 0.0;
290  w[3] = 0.0;
291  w[4] = 0.0;
292  w[5] = 0.0;
293 }
294 
295 // Stabilized Bunk and Sommer
296 template <typename Float>
297 inline void su3Construct8(Float *mat) {
298  mat[0] = atan2(mat[1], mat[0]);
299  mat[1] = atan2(mat[13], mat[12]);
300  for (int i=8; i<18; i++) mat[i] = 0.0;
301 }
302 
303 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
304  if (reconstruct == QUDA_RECONSTRUCT_12) {
305  if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
306  else su3Construct12((float*)mat);
307  } else {
308  if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
309  else su3Construct8((float*)mat);
310  }
311 }
312 
313 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
314 // as the cross product of the conjugate vectors: w = u* x v*
315 //
316 // 48 flops
317 template <typename Float>
318 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
319  Float *u = &mat[0*(3*2)];
320  Float *v = &mat[1*(3*2)];
321  Float *w = &mat[2*(3*2)];
322  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
323  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
324  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
325  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
326  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
327  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
328  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
329  Float u0 = (dir < 3 ? param->anisotropy :
330  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
331  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
332 }
333 
334 template <typename Float>
335 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
336  // First reconstruct first row
337  Float row_sum = 0.0;
338  row_sum += mat[2]*mat[2];
339  row_sum += mat[3]*mat[3];
340  row_sum += mat[4]*mat[4];
341  row_sum += mat[5]*mat[5];
342  Float u0 = (dir < 3 ? param->anisotropy :
343  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
344  Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
345 
346  mat[14] = mat[0];
347  mat[15] = mat[1];
348 
349  mat[0] = U00_mag * cos(mat[14]);
350  mat[1] = U00_mag * sin(mat[14]);
351 
352  Float column_sum = 0.0;
353  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
354  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
355  Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
356 
357  mat[12] = U20_mag * cos(mat[15]);
358  mat[13] = U20_mag * sin(mat[15]);
359 
360  // First column now restored
361 
362  // finally reconstruct last elements from SU(2) rotation
363  Float r_inv2 = 1.0/(u0*row_sum);
364 
365  // U11
366  Float A[2];
367  complexDotProduct(A, mat+0, mat+6);
369  accumulateComplexProduct(mat+8, A, mat+2, u0);
370  mat[8] *= -r_inv2;
371  mat[9] *= -r_inv2;
372 
373  // U12
374  complexConjugateProduct(mat+10, mat+12, mat+2);
375  accumulateComplexProduct(mat+10, A, mat+4, -u0);
376  mat[10] *= r_inv2;
377  mat[11] *= r_inv2;
378 
379  // U21
380  complexDotProduct(A, mat+0, mat+12);
382  accumulateComplexProduct(mat+14, A, mat+2, -u0);
383  mat[14] *= r_inv2;
384  mat[15] *= r_inv2;
385 
386  // U12
388  accumulateComplexProduct(mat+16, A, mat+4, u0);
389  mat[16] *= -r_inv2;
390  mat[17] *= -r_inv2;
391 }
392 
393 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
394  if (reconstruct == QUDA_RECONSTRUCT_12) {
395  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
396  else su3Reconstruct12((float*)mat, dir, ga_idx, param);
397  } else {
398  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
399  else su3Reconstruct8((float*)mat, dir, ga_idx, param);
400  }
401 }
402 
403 /*
404  void su3_construct_8_half(float *mat, short *mat_half) {
405  su3Construct8(mat);
406 
407  mat_half[0] = floatToShort(mat[0] / M_PI);
408  mat_half[1] = floatToShort(mat[1] / M_PI);
409  for (int i=2; i<18; i++) {
410  mat_half[i] = floatToShort(mat[i]);
411  }
412  }
413 
414  void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
415 
416  for (int i=0; i<18; i++) {
417  mat[i] = shortToFloat(mat_half[i]);
418  }
419  mat[0] *= M_PI;
420  mat[1] *= M_PI;
421 
422  su3Reconstruct8(mat, dir, ga_idx, param);
423  }*/
424 
425 template <typename Float>
426 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
427  for (int i = 0; i < len; i++) {
428  double diff = fabs(a[i] - b[i]);
429  if (diff > epsilon) {
430  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
431  return 0;
432  }
433  }
434  return 1;
435 }
436 
437 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
438  if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
439  else return compareFloats((float*)a, (float*)b, len, epsilon);
440 }
441 
442 int fullLatticeIndex(int dim[4], int index, int oddBit){
443 
444  int za = index/(dim[0]>>1);
445  int zb = za/dim[1];
446  int x2 = za - zb*dim[1];
447  int x4 = zb/dim[2];
448  int x3 = zb - x4*dim[2];
449 
450  return 2*index + ((x2 + x3 + x4 + oddBit) & 1);
451 }
452 
453 // given a "half index" i into either an even or odd half lattice (corresponding
454 // to oddBit = {0, 1}), returns the corresponding full lattice index.
455 int fullLatticeIndex(int i, int oddBit) {
456  /*
457  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
458  return 2*i + (boundaryCrossings + oddBit) % 2;
459  */
460 
461  int X1 = Z[0];
462  int X2 = Z[1];
463  int X3 = Z[2];
464  //int X4 = Z[3];
465  int X1h =X1/2;
466 
467  int sid =i;
468  int za = sid/X1h;
469  //int x1h = sid - za*X1h;
470  int zb = za/X2;
471  int x2 = za - zb*X2;
472  int x4 = zb/X3;
473  int x3 = zb - x4*X3;
474  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
475  //int x1 = 2*x1h + x1odd;
476  int X = 2*sid + x1odd;
477 
478  return X;
479 }
480 
481 
482 // i represents a "half index" into an even or odd "half lattice".
483 // when oddBit={0,1} the half lattice is {even,odd}.
484 //
485 // the displacements, such as dx, refer to the full lattice coordinates.
486 //
487 // neighborIndex() takes a "half index", displaces it, and returns the
488 // new "half index", which can be an index into either the even or odd lattices.
489 // displacements of magnitude one always interchange odd and even lattices.
490 //
491 
492 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
493  int Y = fullLatticeIndex(i, oddBit);
494  int x4 = Y/(Z[2]*Z[1]*Z[0]);
495  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
496  int x2 = (Y/Z[0]) % Z[1];
497  int x1 = Y % Z[0];
498 
499  // assert (oddBit == (x+y+z+t)%2);
500 
501  x4 = (x4+dx4+Z[3]) % Z[3];
502  x3 = (x3+dx3+Z[2]) % Z[2];
503  x2 = (x2+dx2+Z[1]) % Z[1];
504  x1 = (x1+dx1+Z[0]) % Z[0];
505 
506  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
507 }
508 
509 
510 int neighborIndex(int dim[4], int index, int oddBit, int dx[4]){
511 
512  const int fullIndex = fullLatticeIndex(dim, index, oddBit);
513 
514  int x[4];
515  x[3] = fullIndex/(dim[2]*dim[1]*dim[0]);
516  x[2] = (fullIndex/(dim[1]*dim[0])) % dim[2];
517  x[1] = (fullIndex/dim[0]) % dim[1];
518  x[0] = fullIndex % dim[0];
519 
520  for(int dir=0; dir<4; ++dir)
521  x[dir] = (x[dir]+dx[dir]+dim[dir]) % dim[dir];
522 
523  return (((x[3]*dim[2] + x[2])*dim[1] + x[1])*dim[0] + x[0])/2;
524 }
525 
526 int
527 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
528 {
529  int ret;
530 
531  int Y = fullLatticeIndex(i, oddBit);
532  int x4 = Y/(Z[2]*Z[1]*Z[0]);
533  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
534  int x2 = (Y/Z[0]) % Z[1];
535  int x1 = Y % Z[0];
536 
537  int ghost_x4 = x4+ dx4;
538 
539  // assert (oddBit == (x+y+z+t)%2);
540 
541  x4 = (x4+dx4+Z[3]) % Z[3];
542  x3 = (x3+dx3+Z[2]) % Z[2];
543  x2 = (x2+dx2+Z[1]) % Z[1];
544  x1 = (x1+dx1+Z[0]) % Z[0];
545 
546  if ( (ghost_x4 >= 0 && ghost_x4 < Z[3]) || !comm_dim_partitioned(3)){
547  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
548  }else{
549  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
550  }
551 
552 
553  return ret;
554 }
555 
556 
557 /*
558  * This is a computation of neighbor using the full index and the displacement in each direction
559  *
560  */
561 
562 int
563 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
564 {
565  int oddBit = 0;
566  int half_idx = i;
567  if (i >= Vh){
568  oddBit =1;
569  half_idx = i - Vh;
570  }
571 
572  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
573  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
574  if (oddBitChanged){
575  oddBit = 1 - oddBit;
576  }
577  int ret = nbr_half_idx;
578  if (oddBit){
579  ret = Vh + nbr_half_idx;
580  }
581 
582  return ret;
583 }
584 
585 int
586 neighborIndexFullLattice(int dim[4], int index, int dx[4])
587 {
588  const int volume = dim[0]*dim[1]*dim[2]*dim[3];
589  const int halfVolume = volume/2;
590  int oddBit = 0;
591  int halfIndex = index;
592 
593  if(index >= halfVolume){
594  oddBit = 1;
595  halfIndex = index - halfVolume;
596  }
597 
598  int neighborHalfIndex = neighborIndex(dim, halfIndex, oddBit, dx);
599 
600  int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
601  if(oddBitChanged){
602  oddBit = 1 - oddBit;
603  }
604 
605  return neighborHalfIndex + oddBit*halfVolume;
606 }
607 
608 
609 
610 int
611 neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
612 {
613  int ret;
614  int oddBit = 0;
615  int half_idx = i;
616  if (i >= Vh){
617  oddBit =1;
618  half_idx = i - Vh;
619  }
620 
621  int Y = fullLatticeIndex(half_idx, oddBit);
622  int x4 = Y/(Z[2]*Z[1]*Z[0]);
623  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
624  int x2 = (Y/Z[0]) % Z[1];
625  int x1 = Y % Z[0];
626  int ghost_x4 = x4+ dx4;
627 
628  x4 = (x4+dx4+Z[3]) % Z[3];
629  x3 = (x3+dx3+Z[2]) % Z[2];
630  x2 = (x2+dx2+Z[1]) % Z[1];
631  x1 = (x1+dx1+Z[0]) % Z[0];
632 
633  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
634  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
635  }else{
636  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
637  return ret;
638  }
639 
640  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
641  if (oddBitChanged){
642  oddBit = 1 - oddBit;
643  }
644 
645  if (oddBit){
646  ret += Vh;
647  }
648 
649  return ret;
650 }
651 
652 
653 // 4d checkerboard.
654 // given a "half index" i into either an even or odd half lattice (corresponding
655 // to oddBit = {0, 1}), returns the corresponding full lattice index.
656 // Cf. GPGPU code in dslash_core_ante.h.
657 // There, i is the thread index.
658 int fullLatticeIndex_4d(int i, int oddBit) {
659  if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);}
660  /*
661  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
662  return 2*i + (boundaryCrossings + oddBit) % 2;
663  */
664 
665  int X1 = Z[0];
666  int X2 = Z[1];
667  int X3 = Z[2];
668  //int X4 = Z[3];
669  int X1h =X1/2;
670 
671  int sid =i;
672  int za = sid/X1h;
673  //int x1h = sid - za*X1h;
674  int zb = za/X2;
675  int x2 = za - zb*X2;
676  int x4 = zb/X3;
677  int x3 = zb - x4*X3;
678  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
679  //int x1 = 2*x1h + x1odd;
680  int X = 2*sid + x1odd;
681 
682  return X;
683 }
684 
685 // 5d checkerboard.
686 // given a "half index" i into either an even or odd half lattice (corresponding
687 // to oddBit = {0, 1}), returns the corresponding full lattice index.
688 // Cf. GPGPU code in dslash_core_ante.h.
689 // There, i is the thread index sid.
690 // This function is used by neighborIndex_5d in dslash_reference.cpp.
691 //ok
692 int fullLatticeIndex_5d(int i, int oddBit) {
693  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
694  return 2*i + (boundaryCrossings + oddBit) % 2;
695 }
696 
697 int fullLatticeIndex_5d_4dpc(int i, int oddBit) {
698  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
699  return 2*i + (boundaryCrossings + oddBit) % 2;
700 }
701 
702 int
704 {
705  int oddBit = 0;
706  int half_idx = i;
707  if (i >= Vh){
708  oddBit =1;
709  half_idx = i - Vh;
710  }
711 
712  int Y = fullLatticeIndex(half_idx, oddBit);
713  int x4 = Y/(Z[2]*Z[1]*Z[0]);
714 
715  return x4;
716 }
717 
718 template <typename Float>
719 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
720  // Apply spatial scaling factor (u0) to spatial links
721  for (int d = 0; d < 3; d++) {
722  for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
723  gauge[d][i] /= param->anisotropy;
724  }
725  }
726 
727  // only apply T-boundary at edge nodes
728 #ifdef MULTI_GPU
729  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
730 #else
731  bool last_node_in_t = true;
732 #endif
733 
734  // Apply boundary conditions to temporal links
735  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t) {
736  for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
737  for (int i = 0; i < gaugeSiteSize; i++) {
738  gauge[3][j*gaugeSiteSize+i] *= -1.0;
739  gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
740  }
741  }
742  }
743 
744  if (param->gauge_fix) {
745  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
746  // to simulate fixing to the temporal gauge.
747  int iMax = ( last_node_in_t ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
748  int dir = 3; // time direction only
749  Float *even = gauge[dir];
750  Float *odd = gauge[dir]+Vh*gaugeSiteSize;
751  for (int i = 0; i< iMax; i++) {
752  for (int m = 0; m < 3; m++) {
753  for (int n = 0; n < 3; n++) {
754  even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
755  even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
756  odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
757  odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
758  }
759  }
760  }
761  }
762 }
763 
764 template <typename Float>
766 {
767 
768  int X1h=param->X[0]/2;
769  int X1 =param->X[0];
770  int X2 =param->X[1];
771  int X3 =param->X[2];
772  int X4 =param->X[3];
773 
774  // rescale long links by the appropriate coefficient
776  for(int d=0; d<4; d++){
777  for(int i=0; i < V*gaugeSiteSize; i++){
778  gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
779  }
780  }
781  }
782 
783  // apply the staggered phases
784  for (int d = 0; d < 3; d++) {
785 
786  //even
787  for (int i = 0; i < Vh; i++) {
788 
789  int index = fullLatticeIndex(i, 0);
790  int i4 = index /(X3*X2*X1);
791  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
792  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
793  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
794  int sign = 1;
795 
796  if (d == 0) {
797  if (i4 % 2 == 1){
798  sign= -1;
799  }
800  }
801 
802  if (d == 1){
803  if ((i4+i1) % 2 == 1){
804  sign= -1;
805  }
806  }
807  if (d == 2){
808  if ( (i4+i1+i2) % 2 == 1){
809  sign= -1;
810  }
811  }
812 
813  for (int j=0; j < 18; j++) {
814  gauge[d][i*gaugeSiteSize + j] *= sign;
815  }
816  }
817  //odd
818  for (int i = 0; i < Vh; i++) {
819  int index = fullLatticeIndex(i, 1);
820  int i4 = index /(X3*X2*X1);
821  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
822  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
823  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
824  int sign = 1;
825 
826  if (d == 0) {
827  if (i4 % 2 == 1){
828  sign = -1;
829  }
830  }
831 
832  if (d == 1){
833  if ((i4+i1) % 2 == 1){
834  sign = -1;
835  }
836  }
837  if (d == 2){
838  if ( (i4+i1+i2) % 2 == 1){
839  sign = -1;
840  }
841  }
842 
843  for (int j=0; j<18; j++){
844  gauge[d][(Vh+i)*gaugeSiteSize + j] *= sign;
845  }
846  }
847 
848  }
849 
850  // Apply boundary conditions to temporal links
852  for (int j = 0; j < Vh; j++) {
853  int sign =1;
855  if (j >= (X4-3)*X1h*X2*X3 ){
856  sign = -1;
857  }
858  } else {
859  if (j >= (X4-1)*X1h*X2*X3 ){
860  sign = -1;
861  }
862  }
863 
864  for (int i=0; i<18; i++) {
865  gauge[3][j*gaugeSiteSize + i] *= sign;
866  gauge[3][(Vh+j)*gaugeSiteSize + i] *= sign;
867  }
868  }
869  }
870 
871 }
872 
873 
874 
875 template <typename Float>
876 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
877  Float *resOdd[4], *resEven[4];
878  for (int dir = 0; dir < 4; dir++) {
879  resEven[dir] = res[dir];
880  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
881  }
882 
883  for (int dir = 0; dir < 4; dir++) {
884  for (int i = 0; i < Vh; i++) {
885  for (int m = 0; m < 3; m++) {
886  for (int n = 0; n < 3; n++) {
887  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
888  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
889  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
890  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
891  }
892  }
893  }
894  }
895 
897 }
898 
899 // normalize the vector a
900 template <typename Float>
901 static void normalize(complex<Float> *a, int len) {
902  double sum = 0.0;
903  for (int i=0; i<len; i++) sum += norm(a[i]);
904  for (int i=0; i<len; i++) a[i] /= sqrt(sum);
905 }
906 
907 // orthogonalize vector b to vector a
908 template <typename Float>
909 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
910  complex<double> dot = 0.0;
911  for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
912  for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
913 }
914 
915 template <typename Float>
917  Float *resOdd[4], *resEven[4];
918  for (int dir = 0; dir < 4; dir++) {
919  resEven[dir] = res[dir];
920  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
921  }
922 
923  for (int dir = 0; dir < 4; dir++) {
924  for (int i = 0; i < Vh; i++) {
925  for (int m = 1; m < 3; m++) { // last 2 rows
926  for (int n = 0; n < 3; n++) { // 3 columns
927  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
928  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
929  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
930  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
931  }
932  }
933  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
934  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
935  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
936 
937  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
938  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
939  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
940 
941  {
942  Float *w = resEven[dir]+(i*3+0)*3*2;
943  Float *u = resEven[dir]+(i*3+1)*3*2;
944  Float *v = resEven[dir]+(i*3+2)*3*2;
945 
946  for (int n = 0; n < 6; n++) w[n] = 0.0;
947  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
948  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
949  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
950  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
951  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
952  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
953  }
954 
955  {
956  Float *w = resOdd[dir]+(i*3+0)*3*2;
957  Float *u = resOdd[dir]+(i*3+1)*3*2;
958  Float *v = resOdd[dir]+(i*3+2)*3*2;
959 
960  for (int n = 0; n < 6; n++) w[n] = 0.0;
961  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
962  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
963  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
964  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
965  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
966  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
967  }
968 
969  }
970  }
971 
972  if (param->type == QUDA_WILSON_LINKS){
974  } else if (param->type == QUDA_ASQTAD_LONG_LINKS){
976  } else if (param->type == QUDA_ASQTAD_FAT_LINKS){
977  for (int dir = 0; dir < 4; dir++){
978  for (int i = 0; i < Vh; i++) {
979  for (int m = 0; m < 3; m++) { // last 2 rows
980  for (int n = 0; n < 3; n++) { // 3 columns
981  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
982  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 2.0* rand() / (Float)RAND_MAX;
983  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
984  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
985  }
986  }
987  }
988  }
989 
990  }
991 
992 }
993 
994 template <typename Float>
995 void constructUnitaryGaugeField(Float **res)
996 {
997  Float *resOdd[4], *resEven[4];
998  for (int dir = 0; dir < 4; dir++) {
999  resEven[dir] = res[dir];
1000  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
1001  }
1002 
1003  for (int dir = 0; dir < 4; dir++) {
1004  for (int i = 0; i < Vh; i++) {
1005  for (int m = 1; m < 3; m++) { // last 2 rows
1006  for (int n = 0; n < 3; n++) { // 3 columns
1007  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
1008  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
1009  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
1010  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
1011  }
1012  }
1013  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
1014  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
1015  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
1016 
1017  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
1018  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
1019  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
1020 
1021  {
1022  Float *w = resEven[dir]+(i*3+0)*3*2;
1023  Float *u = resEven[dir]+(i*3+1)*3*2;
1024  Float *v = resEven[dir]+(i*3+2)*3*2;
1025 
1026  for (int n = 0; n < 6; n++) w[n] = 0.0;
1027  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
1028  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
1029  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
1030  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
1031  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
1032  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
1033  }
1034 
1035  {
1036  Float *w = resOdd[dir]+(i*3+0)*3*2;
1037  Float *u = resOdd[dir]+(i*3+1)*3*2;
1038  Float *v = resOdd[dir]+(i*3+2)*3*2;
1039 
1040  for (int n = 0; n < 6; n++) w[n] = 0.0;
1041  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
1042  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
1043  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
1044  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
1045  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
1046  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
1047  }
1048 
1049  }
1050  }
1051 }
1052 
1053 
1054 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
1055  if (type == 0) {
1056  if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
1057  else constructUnitGaugeField((float**)gauge, param);
1058  } else if (type == 1) {
1059  if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
1060  else constructGaugeField((float**)gauge, param);
1061  } else {
1062  if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
1063  else applyGaugeFieldScaling((float**)gauge, Vh, param);
1064  }
1065 
1066 }
1067 
1068 void
1070  QudaPrecision precision, QudaGaugeParam* param,
1072 {
1073  if (type == 0) {
1074  if (precision == QUDA_DOUBLE_PRECISION) {
1077  }else {
1080  }
1081  } else {
1082  if (precision == QUDA_DOUBLE_PRECISION) {
1083  // if doing naive staggered then set to long links so that the staggered phase is applied
1088  }else {
1093  }
1094  }
1095 
1097  // incorporate non-trivial phase into long links
1098 
1099  const double phase = (M_PI * rand())/RAND_MAX;
1100  const complex<double> z = polar(1.0, phase);
1101  for (int dir=0; dir<4; ++dir) {
1102  for (int i=0; i<V; ++i) {
1103  for (int j=0; j<gaugeSiteSize; j+=2) {
1104  if (precision == QUDA_DOUBLE_PRECISION) {
1105  complex<double> *l = (complex<double>*)( &(((double*)longlink[dir])[i*gaugeSiteSize + j]) );
1106  *l *= z;
1107  } else {
1108  complex<float> *l = (complex<float>*)( &(((float*)longlink[dir])[i*gaugeSiteSize + j]) );
1109  *l *= z;
1110  }
1111  }
1112  }
1113  }
1114  }
1115 
1116  // set all links to zero to emulate the 1-link operator (needed for host comparison)
1118  for(int dir=0; dir<4; ++dir){
1119  for(int i=0; i<V; ++i){
1120  for(int j=0; j<gaugeSiteSize; j+=2){
1121  if(precision == QUDA_DOUBLE_PRECISION){
1122  ((double*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1123  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1124  }else{
1125  ((float*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1126  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1127  }
1128  }
1129  }
1130  }
1131  }
1132 
1133 }
1134 
1135 
1136 template <typename Float>
1137 static void constructCloverField(Float *res, double norm, double diag) {
1138 
1139  Float c = 2.0 * norm / RAND_MAX;
1140 
1141  for(int i = 0; i < V; i++) {
1142  for (int j = 0; j < 72; j++) {
1143  res[i*72 + j] = c*rand() - norm;
1144  }
1145 
1146  //impose clover symmetry on each chiral block
1147  for (int ch=0; ch<2; ch++) {
1148  res[i*72 + 3 + 36*ch] = -res[i*72 + 0 + 36*ch];
1149  res[i*72 + 4 + 36*ch] = -res[i*72 + 1 + 36*ch];
1150  res[i*72 + 5 + 36*ch] = -res[i*72 + 2 + 36*ch];
1151  res[i*72 + 30 + 36*ch] = -res[i*72 + 6 + 36*ch];
1152  res[i*72 + 31 + 36*ch] = -res[i*72 + 7 + 36*ch];
1153  res[i*72 + 32 + 36*ch] = -res[i*72 + 8 + 36*ch];
1154  res[i*72 + 33 + 36*ch] = -res[i*72 + 9 + 36*ch];
1155  res[i*72 + 34 + 36*ch] = -res[i*72 + 16 + 36*ch];
1156  res[i*72 + 35 + 36*ch] = -res[i*72 + 17 + 36*ch];
1157  }
1158 
1159  for (int j = 0; j<6; j++) {
1160  res[i*72 + j] += diag;
1161  res[i*72 + j+36] += diag;
1162  }
1163  }
1164 }
1165 
1166 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
1167 
1168  if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
1169  else constructCloverField((float *)clover, norm, diag);
1170 }
1171 
1172 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
1173  printf("Reference:\n");
1174  printSpinorElement(spinorRef, 0, prec); printf("...\n");
1175  printSpinorElement(spinorRef, len-1, prec); printf("\n");
1176 
1177  printf("\nCUDA:\n");
1178  printSpinorElement(spinorGPU, 0, prec); printf("...\n");
1179  printSpinorElement(spinorGPU, len-1, prec); printf("\n");
1180 
1181  compare_spinor(spinorRef, spinorGPU, len, prec);
1182  }*/
1183 
1184 template <typename Float>
1185 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
1186 
1187  const int fail_check = 17;
1188  int fail[4][fail_check];
1189  int iter[4][18];
1190  for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
1191  for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
1192 
1193  for (int d=0; d<4; d++) {
1194  for (int eo=0; eo<2; eo++) {
1195  for (int i=0; i<Vh; i++) {
1196  int ga_idx = (eo*Vh+i);
1197  for (int j=0; j<18; j++) {
1198  double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1199 
1200  for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1201  if (diff > epsilon) iter[d][j]++;
1202  }
1203  }
1204  }
1205  }
1206 
1207  printf("Component fails (X, Y, Z, T)\n");
1208  for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1209 
1210  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1211  for (int f=0; f<fail_check; f++) {
1212  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n", pow(10.0,-(f+1)),
1213  fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1214  fail[0][f]/(double)(V*18), fail[1][f]/(double)(V*18), fail[2][f]/(double)(V*18), fail[3][f]/(double)(V*18));
1215  }
1216 
1217 }
1218 
1219 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
1220  if (precision == QUDA_DOUBLE_PRECISION)
1221  checkGauge((double**)oldG, (double**)newG, epsilon);
1222  else
1223  checkGauge((float**)oldG, (float**)newG, epsilon);
1224 }
1225 
1226 
1227 
1228 void
1229 createSiteLinkCPU(void** link, QudaPrecision precision, int phase)
1230 {
1231 
1232  if (precision == QUDA_DOUBLE_PRECISION) {
1233  constructUnitaryGaugeField((double**)link);
1234  }else {
1235  constructUnitaryGaugeField((float**)link);
1236  }
1237 
1238  // only apply temporal boundary condition if I'm the last node in T
1239 #ifdef MULTI_GPU
1240  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
1241 #else
1242  bool last_node_in_t = true;
1243 #endif
1244 
1245  if(phase){
1246 
1247  for(int i=0;i < V;i++){
1248  for(int dir =XUP; dir <= TUP; dir++){
1249  int idx = i;
1250  int oddBit =0;
1251  if (i >= Vh) {
1252  idx = i - Vh;
1253  oddBit = 1;
1254  }
1255 
1256  int X1 = Z[0];
1257  int X2 = Z[1];
1258  int X3 = Z[2];
1259  int X4 = Z[3];
1260 
1261  int full_idx = fullLatticeIndex(idx, oddBit);
1262  int i4 = full_idx /(X3*X2*X1);
1263  int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
1264  int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1265  int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
1266 
1267  double coeff= 1.0;
1268  switch(dir){
1269  case XUP:
1270  if ( (i4 & 1) != 0){
1271  coeff *= -1;
1272  }
1273  break;
1274 
1275  case YUP:
1276  if ( ((i4+i1) & 1) != 0){
1277  coeff *= -1;
1278  }
1279  break;
1280 
1281  case ZUP:
1282  if ( ((i4+i1+i2) & 1) != 0){
1283  coeff *= -1;
1284  }
1285  break;
1286 
1287  case TUP:
1288  if (last_node_in_t && i4 == (X4-1)){
1289  coeff *= -1;
1290  }
1291  break;
1292 
1293  default:
1294  printf("ERROR: wrong dir(%d)\n", dir);
1295  exit(1);
1296  }
1297 
1298  if (precision == QUDA_DOUBLE_PRECISION){
1299  //double* mylink = (double*)link;
1300  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1301  double* mylink = (double*)link[dir];
1302  mylink = mylink + i*gaugeSiteSize;
1303 
1304  mylink[12] *= coeff;
1305  mylink[13] *= coeff;
1306  mylink[14] *= coeff;
1307  mylink[15] *= coeff;
1308  mylink[16] *= coeff;
1309  mylink[17] *= coeff;
1310 
1311  }else{
1312  //float* mylink = (float*)link;
1313  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1314  float* mylink = (float*)link[dir];
1315  mylink = mylink + i*gaugeSiteSize;
1316 
1317  mylink[12] *= coeff;
1318  mylink[13] *= coeff;
1319  mylink[14] *= coeff;
1320  mylink[15] *= coeff;
1321  mylink[16] *= coeff;
1322  mylink[17] *= coeff;
1323 
1324  }
1325  }
1326  }
1327  }
1328 
1329 
1330 #if 1
1331  for(int dir= 0;dir < 4;dir++){
1332  for(int i=0;i< V*gaugeSiteSize;i++){
1333  if (precision ==QUDA_SINGLE_PRECISION){
1334  float* f = (float*)link[dir];
1335  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1336  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1337  exit(1);
1338  }
1339  }else{
1340  double* f = (double*)link[dir];
1341  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1342  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1343  exit(1);
1344  }
1345 
1346  }
1347 
1348  }
1349  }
1350 #endif
1351 
1352  return;
1353 }
1354 
1355 
1356 
1357 
1358 template <typename Float>
1359 int compareLink(Float **linkA, Float **linkB, int len) {
1360  const int fail_check = 16;
1361  int fail[fail_check];
1362  for (int f=0; f<fail_check; f++) fail[f] = 0;
1363 
1364  int iter[18];
1365  for (int i=0; i<18; i++) iter[i] = 0;
1366 
1367  for(int dir=0;dir < 4; dir++){
1368  for (int i=0; i<len; i++) {
1369  for (int j=0; j<18; j++) {
1370  int is = i*18+j;
1371  double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1372  for (int f=0; f<fail_check; f++)
1373  if (diff > pow(10.0,-(f+1))) fail[f]++;
1374  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1375  if (diff > 1e-3) iter[j]++;
1376  }
1377  }
1378  }
1379 
1380  for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1381 
1382  int accuracy_level = 0;
1383  for(int f =0; f < fail_check; f++){
1384  if(fail[f] == 0){
1385  accuracy_level =f;
1386  }
1387  }
1388 
1389  for (int f=0; f<fail_check; f++) {
1390  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (double)(4*len*18));
1391  }
1392 
1393  return accuracy_level;
1394 }
1395 
1396 static int
1397 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1398 {
1399  int ret;
1400 
1401  if (precision == QUDA_DOUBLE_PRECISION){
1402  ret = compareLink((double**)linkA, (double**)linkB, len);
1403  }else {
1404  ret = compareLink((float**)linkA, (float**)linkB, len);
1405  }
1406 
1407  return ret;
1408 }
1409 
1410 
1411 // X indexes the lattice site
1412 static void
1413 printLinkElement(void *link, int X, QudaPrecision precision)
1414 {
1415  if (precision == QUDA_DOUBLE_PRECISION){
1416  for(int i=0; i < 3;i++){
1417  printVector((double*)link+ X*gaugeSiteSize + i*6);
1418  }
1419 
1420  }
1421  else{
1422  for(int i=0;i < 3;i++){
1423  printVector((float*)link+X*gaugeSiteSize + i*6);
1424  }
1425  }
1426 }
1427 
1428 int strong_check_link(void** linkA, const char* msgA,
1429  void **linkB, const char* msgB,
1430  int len, QudaPrecision prec)
1431 {
1432  printfQuda("%s\n", msgA);
1433  printLinkElement(linkA[0], 0, prec);
1434  printfQuda("\n");
1435  printLinkElement(linkA[0], 1, prec);
1436  printfQuda("...\n");
1437  printLinkElement(linkA[3], len-1, prec);
1438  printfQuda("\n");
1439 
1440  printfQuda("\n%s\n", msgB);
1441  printLinkElement(linkB[0], 0, prec);
1442  printfQuda("\n");
1443  printLinkElement(linkB[0], 1, prec);
1444  printfQuda("...\n");
1445  printLinkElement(linkB[3], len-1, prec);
1446  printfQuda("\n");
1447 
1448  int ret = compare_link(linkA, linkB, len, prec);
1449  return ret;
1450 }
1451 
1452 
1453 void
1454 createMomCPU(void* mom, QudaPrecision precision)
1455 {
1456  void* temp;
1457 
1458  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1459  temp = malloc(4*V*gaugeSiteSize*gSize);
1460  if (temp == NULL){
1461  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1462  exit(1);
1463  }
1464 
1465 
1466 
1467  for(int i=0;i < V;i++){
1468  if (precision == QUDA_DOUBLE_PRECISION){
1469  for(int dir=0;dir < 4;dir++){
1470  double* thismom = (double*)mom;
1471  for(int k=0; k < momSiteSize; k++){
1472  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1473  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1474  }
1475  }
1476  }else{
1477  for(int dir=0;dir < 4;dir++){
1478  float* thismom=(float*)mom;
1479  for(int k=0; k < momSiteSize; k++){
1480  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1481  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1482  }
1483  }
1484  }
1485  }
1486 
1487  free(temp);
1488  return;
1489 }
1490 
1491 void
1492 createHwCPU(void* hw, QudaPrecision precision)
1493 {
1494  for(int i=0;i < V;i++){
1495  if (precision == QUDA_DOUBLE_PRECISION){
1496  for(int dir=0;dir < 4;dir++){
1497  double* thishw = (double*)hw;
1498  for(int k=0; k < hwSiteSize; k++){
1499  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1500  }
1501  }
1502  }else{
1503  for(int dir=0;dir < 4;dir++){
1504  float* thishw=(float*)hw;
1505  for(int k=0; k < hwSiteSize; k++){
1506  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1507  }
1508  }
1509  }
1510  }
1511 
1512  return;
1513 }
1514 
1515 
1516 template <typename Float>
1517 int compare_mom(Float *momA, Float *momB, int len) {
1518  const int fail_check = 16;
1519  int fail[fail_check];
1520  for (int f=0; f<fail_check; f++) fail[f] = 0;
1521 
1522  int iter[momSiteSize];
1523  for (int i=0; i<momSiteSize; i++) iter[i] = 0;
1524 
1525  for (int i=0; i<len; i++) {
1526  for (int j=0; j<momSiteSize-1; j++) {
1527  int is = i*momSiteSize+j;
1528  double diff = fabs(momA[is]-momB[is]);
1529  for (int f=0; f<fail_check; f++)
1530  if (diff > pow(10.0,-(f+1))) fail[f]++;
1531  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1532  if (diff > 1e-3) iter[j]++;
1533  }
1534  }
1535 
1536  int accuracy_level = 0;
1537  for(int f =0; f < fail_check; f++){
1538  if(fail[f] == 0){
1539  accuracy_level =f+1;
1540  }
1541  }
1542 
1543  for (int i=0; i<momSiteSize; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1544 
1545  for (int f=0; f<fail_check; f++) {
1546  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*9, fail[f]/(double)(len*9));
1547  }
1548 
1549  return accuracy_level;
1550 }
1551 
1552 static void
1553 printMomElement(void *mom, int X, QudaPrecision precision)
1554 {
1555  if (precision == QUDA_DOUBLE_PRECISION){
1556  double* thismom = ((double*)mom)+ X*momSiteSize;
1557  printVector(thismom);
1558  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1559  }else{
1560  float* thismom = ((float*)mom)+ X*momSiteSize;
1561  printVector(thismom);
1562  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1563  }
1564 }
1565 int strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec)
1566 {
1567  printfQuda("mom:\n");
1568  printMomElement(momA, 0, prec);
1569  printfQuda("\n");
1570  printMomElement(momA, 1, prec);
1571  printfQuda("\n");
1572  printMomElement(momA, 2, prec);
1573  printfQuda("\n");
1574  printMomElement(momA, 3, prec);
1575  printfQuda("...\n");
1576 
1577  printfQuda("\nreference mom:\n");
1578  printMomElement(momB, 0, prec);
1579  printfQuda("\n");
1580  printMomElement(momB, 1, prec);
1581  printfQuda("\n");
1582  printMomElement(momB, 2, prec);
1583  printfQuda("\n");
1584  printMomElement(momB, 3, prec);
1585  printfQuda("\n");
1586 
1587  int ret;
1588  if (prec == QUDA_DOUBLE_PRECISION){
1589  ret = compare_mom((double*)momA, (double*)momB, len);
1590  }else{
1591  ret = compare_mom((float*)momA, (float*)momB, len);
1592  }
1593 
1594  return ret;
1595 }
1596 
1597 
1598 /************
1599  * return value
1600  *
1601  * 0: command line option matched and processed sucessfully
1602  * non-zero: command line option does not match
1603  *
1604  */
1605 
1606 #ifdef MULTI_GPU
1607 int device = -1;
1608 #else
1609 int device = 0;
1610 #endif
1611 
1619 
1620 int xdim = 24;
1621 int ydim = 24;
1622 int zdim = 24;
1623 int tdim = 24;
1624 int Lsdim = 16;
1627 char latfile[256] = "";
1628 int Nsrc = 1;
1629 int Msrc = 1;
1630 int niter = 100;
1631 int gcrNkrylov = 10;
1632 int pipeline = 0;
1634 int test_type = 0;
1636 char vec_infile[256] = "";
1637 char vec_outfile[256] = "";
1640 int multishift = 0;
1641 bool verify_results = true;
1642 double mass = 0.1;
1643 double mu = 0.1;
1644 double anisotropy = 1.0;
1645 double clover_coeff = 0.1;
1646 bool compute_clover = false;
1647 double tol = 1e-7;
1648 double tol_hq = 0.;
1650 bool kernel_pack_t = false;
1654 
1655 int mg_levels = 2;
1656 
1657 int nu_pre = 2;
1658 int nu_post = 2;
1662 double setup_tol = 5e-6;
1663 double omega = 0.85;
1667 
1669 int nev = 8;
1672 double tol_restart = 5e+3*tol;
1673 
1676 double inc_tol = 1e-2;
1677 double eigenval_tol = 1e-1;
1678 
1683 
1684 static int dim_partitioned[4] = {0,0,0,0};
1685 
1687 {
1688  return ((gridsize_from_cmdline[dim] > 1) || dim_partitioned[dim]);
1689 }
1690 
1691 void __attribute__((weak)) usage_extra(char** argv){};
1692 
1693 void usage(char** argv )
1694 {
1695  printf("Usage: %s [options]\n", argv[0]);
1696  printf("Common options: \n");
1697 #ifndef MULTI_GPU
1698  printf(" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1699 #endif
1700  printf(" --prec <double/single/half> # Precision in GPU\n");
1701  printf(" --prec-sloppy <double/single/half> # Sloppy precision in GPU\n");
1702  printf(" --prec-precondition <double/single/half> # Preconditioner precision in GPU\n");
1703  printf(" --prec-ritz <double/single/half> # Eigenvector precision in GPU\n");
1704  printf(" --recon <8/9/12/13/18> # Link reconstruction type\n");
1705  printf(" --recon-sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1706  printf(" --recon-precondition <8/9/12/13/18> # Preconditioner link reconstruction type\n");
1707  printf(" --dagger # Set the dagger to 1 (default 0)\n");
1708  printf(" --dim <n> # Set space-time dimension (X Y Z T)\n");
1709  printf(" --sdim <n> # Set space dimension(X/Y/Z) size\n");
1710  printf(" --xdim <n> # Set X dimension size(default 24)\n");
1711  printf(" --ydim <n> # Set X dimension size(default 24)\n");
1712  printf(" --zdim <n> # Set X dimension size(default 24)\n");
1713  printf(" --tdim <n> # Set T dimension size(default 24)\n");
1714  printf(" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1715  printf(" --gridsize <x y z t> # Set the grid size in all four dimension (default 1 1 1 1)\n");
1716  printf(" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1717  printf(" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1718  printf(" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1719  printf(" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1720  printf(" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1721  printf(" --rank-order <col/row> # Set the [t][z][y][x] rank order as either column major (t fastest, default) or row major (x fastest)\n");
1722  printf(" --kernel-pack-t # Set T dimension kernel packing to be true (default false)\n");
1723  printf(" --dslash-type <type> # Set the dslash type, the following values are valid\n"
1724  " wilson/clover/twisted-mass/twisted-clover/staggered\n"
1725  " /asqtad/domain-wall/domain-wall-4d/mobius/laplace\n");
1726  printf(" --flavor <type> # Set the twisted mass flavor type (singlet (default), deg-doublet, nondeg-doublet)\n");
1727  printf(" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1728  printf(" --niter <n> # The number of iterations to perform (default 10)\n");
1729  printf(" --ngcrkrylov <n> # The number of inner iterations to use for GCR, BiCGstab-l (default 10)\n");
1730  printf(" --pipeline <n> # The pipeline length for fused operations in GCR, BiCGstab-l (default 0, no pipelining)\n");
1731  printf(" --solution-pipeline <n> # The pipeline length for fused solution accumulation (default 0, no pipelining)\n");
1732  printf(" --inv-type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1733  printf(" --precon-type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified)).\n"
1734  " For multigrid this sets the smoother type.\n");
1735  printf(" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1736  printf(" --mass # Mass of Dirac operator (default 0.1)\n");
1737  printf(" --mu # Twisted-Mass of Dirac operator (default 0.1)\n");
1738  printf(" --compute-clover # Compute the clover field or use random numbers (default false)\n");
1739  printf(" --clover-coeff # Clover coefficient (default 1.0)\n");
1740  printf(" --anisotropy # Temporal anisotropy factor (default 1.0)\n");
1741  printf(" --mass-normalization # Mass normalization (kappa (default) / mass / asym-mass)\n");
1742  printf(" --matpc # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n");
1743  printf(" --solve-type # The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc) \n");
1744  printf(" --tol <resid_tol> # Set L2 residual tolerance\n");
1745  printf(" --tolhq <resid_hq_tol> # Set heavy-quark residual tolerance\n");
1746  printf(" --test # Test method (different for each test)\n");
1747  printf(" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1748  printf(" --mg-nvec <level nvec> # Number of null-space vectors to define the multigrid transfer operator on a given level\n");
1749  printf(" --mg-gpu-prolongate <true/false> # Whether to do the multigrid transfer operators on the GPU (default false)\n");
1750  printf(" --mg-levels <2+> # The number of multigrid levels to do (default 2)\n");
1751  printf(" --mg-nu-pre <1-20> # The number of pre-smoother applications to do at each multigrid level (default 2)\n");
1752  printf(" --mg-nu-post <1-20> # The number of post-smoother applications to do at each multigrid level (default 2)\n");
1753  printf(" --mg-setup-inv <level inv> # The inverter to use for the setup of multigrid (default bicgstab)\n");
1754  printf(" --mg-setup-tol # The tolerance to use for the setup of multigrid (default 5e-6)\n");
1755  printf(" --mg-omega # The over/under relaxation factor for the smoother of multigrid (default 0.85)\n");
1756  printf(" --mg-smoother # The smoother to use for multigrid (default mr)\n");
1757  printf(" --mg-block-size <level x y z t> # Set the geometric block size for the each multigrid level's transfer operator (default 4 4 4 4)\n");
1758  printf(" --mg-mu-factor <level factor> # Set the multiplicative factor for the twisted mass mu parameter on each level (default 1)\n");
1759  printf(" --mg-generate-nullspace <true/false> # Generate the null-space vector dynamically (default true)\n");
1760  printf(" --mg-generate-all-levels <true/talse> # true=generate nul space on all levels, false=generate on level 0 and create other levels from that (default true)\n");
1761  printf(" --mg-load-vec file # Load the vectors \"file\" for the multigrid_test (requires QIO)\n");
1762  printf(" --mg-save-vec file # Save the generated null-space vectors \"file\" from the multigrid_test (requires QIO)\n");
1763  printf(" --mg-vebosity <level verb> # The verbosity to use on each level of the multigrid (default silent)\n");
1764  printf(" --df-nev <nev> # Set number of eigenvectors computed within a single solve cycle (default 8)\n");
1765  printf(" --df-max-search-dim <dim> # Set the size of eigenvector search space (default 64)\n");
1766  printf(" --df-deflation-grid <n> # Set maximum number of cycles needed to compute eigenvectors(default 1)\n");
1767  printf(" --df-eigcg-max-restarts <n> # Set how many iterative refinement cycles will be solved with eigCG within a single physical right hand site solve (default 4)\n");
1768  printf(" --df-tol-restart <tol> # Set tolerance for the first restart in the initCG solver(default 5e-5)\n");
1769  printf(" --df-tol-inc <tol> # Set tolerance for the subsequent restarts in the initCG solver (default 1e-2)\n");
1770  printf(" --df-max-restart-num <n> # Set maximum number of the initCG restarts in the deflation stage (default 3)\n");
1771  printf(" --df-tol-eigenval <tol> # Set maximum eigenvalue residual norm (default 1e-1)\n");
1772 
1773 
1774  printf(" --solver-ext-lib-type <eigen/magma> # Set external library for the solvers (default Eigen library)\n");
1775  printf(" --df-ext-lib-type <eigen/magma> # Set external library for the deflation methods (default Eigen library)\n");
1776  printf(" --df-location-ritz <host/cuda> # Set memory location for the ritz vectors (default cuda memory loction)\n");
1777  printf(" --df-mem-type-ritz <device/pinned/mapped> # Set memory type for the ritz vectors (default device memory type)\n");
1778 
1779  printf(" --nsrc <n> # How many spinors to apply the dslash to simultaneusly (experimental for staggered only)\n");
1780 
1781  printf(" --msrc <n> # Used for testing non-square block blas routines where nsrc defines the other dimension\n");
1782  printf(" --help # Print out this message\n");
1783 
1784  usage_extra(argv);
1785 #ifdef MULTI_GPU
1786  char msg[]="multi";
1787 #else
1788  char msg[]="single";
1789 #endif
1790  printf("Note: this program is %s GPU build\n", msg);
1791  exit(1);
1792  return ;
1793 }
1794 
1795 int process_command_line_option(int argc, char** argv, int* idx)
1796 {
1797 #ifdef MULTI_GPU
1798  char msg[]="multi";
1799 #else
1800  char msg[]="single";
1801 #endif
1802 
1803  int ret = -1;
1804 
1805  int i = *idx;
1806 
1807  if( strcmp(argv[i], "--help")== 0){
1808  usage(argv);
1809  }
1810 
1811  if( strcmp(argv[i], "--verify") == 0){
1812  if (i+1 >= argc){
1813  usage(argv);
1814  }
1815 
1816  if (strcmp(argv[i+1], "true") == 0){
1817  verify_results = true;
1818  }else if (strcmp(argv[i+1], "false") == 0){
1819  verify_results = false;
1820  }else{
1821  fprintf(stderr, "ERROR: invalid verify type\n");
1822  exit(1);
1823  }
1824 
1825  i++;
1826  ret = 0;
1827  goto out;
1828  }
1829 
1830  if( strcmp(argv[i], "--device") == 0){
1831  if (i+1 >= argc){
1832  usage(argv);
1833  }
1834  device = atoi(argv[i+1]);
1835  if (device < 0 || device > 16){
1836  printf("ERROR: Invalid CUDA device number (%d)\n", device);
1837  usage(argv);
1838  }
1839  i++;
1840  ret = 0;
1841  goto out;
1842  }
1843 
1844  if( strcmp(argv[i], "--prec") == 0){
1845  if (i+1 >= argc){
1846  usage(argv);
1847  }
1848  prec = get_prec(argv[i+1]);
1849  i++;
1850  ret = 0;
1851  goto out;
1852  }
1853 
1854  if( strcmp(argv[i], "--prec-sloppy") == 0){
1855  if (i+1 >= argc){
1856  usage(argv);
1857  }
1858  prec_sloppy = get_prec(argv[i+1]);
1859  i++;
1860  ret = 0;
1861  goto out;
1862  }
1863 
1864  if( strcmp(argv[i], "--prec-precondition") == 0){
1865  if (i+1 >= argc){
1866  usage(argv);
1867  }
1868  prec_precondition = get_prec(argv[i+1]);
1869  i++;
1870  ret = 0;
1871  goto out;
1872  }
1873 
1874  if( strcmp(argv[i], "--prec-ritz") == 0){
1875  if (i+1 >= argc){
1876  usage(argv);
1877  }
1878  prec_ritz = get_prec(argv[i+1]);
1879  i++;
1880  ret = 0;
1881  goto out;
1882  }
1883 
1884  if( strcmp(argv[i], "--recon") == 0){
1885  if (i+1 >= argc){
1886  usage(argv);
1887  }
1888  link_recon = get_recon(argv[i+1]);
1889  i++;
1890  ret = 0;
1891  goto out;
1892  }
1893 
1894  if( strcmp(argv[i], "--recon-sloppy") == 0){
1895  if (i+1 >= argc){
1896  usage(argv);
1897  }
1898  link_recon_sloppy = get_recon(argv[i+1]);
1899  i++;
1900  ret = 0;
1901  goto out;
1902  }
1903 
1904  if( strcmp(argv[i], "--recon-precondition") == 0){
1905  if (i+1 >= argc){
1906  usage(argv);
1907  }
1908  link_recon_precondition = get_recon(argv[i+1]);
1909  i++;
1910  ret = 0;
1911  goto out;
1912  }
1913 
1914  if( strcmp(argv[i], "--dim") == 0){
1915  if (i+1 >= argc){
1916  usage(argv);
1917  }
1918  xdim= atoi(argv[i+1]);
1919  if (xdim < 0 || xdim > 512){
1920  printf("ERROR: invalid X dimension (%d)\n", xdim);
1921  usage(argv);
1922  }
1923  i++;
1924 
1925  if (i+1 >= argc){
1926  usage(argv);
1927  }
1928  ydim= atoi(argv[i+1]);
1929  if (ydim < 0 || ydim > 512){
1930  printf("ERROR: invalid Y dimension (%d)\n", ydim);
1931  usage(argv);
1932  }
1933  i++;
1934 
1935  if (i+1 >= argc){
1936  usage(argv);
1937  }
1938  zdim= atoi(argv[i+1]);
1939  if (zdim < 0 || zdim > 512){
1940  printf("ERROR: invalid Z dimension (%d)\n", zdim);
1941  usage(argv);
1942  }
1943  i++;
1944 
1945  if (i+1 >= argc){
1946  usage(argv);
1947  }
1948  tdim= atoi(argv[i+1]);
1949  if (tdim < 0 || tdim > 512){
1950  printf("ERROR: invalid T dimension (%d)\n", tdim);
1951  usage(argv);
1952  }
1953  i++;
1954 
1955  ret = 0;
1956  goto out;
1957  }
1958 
1959  if( strcmp(argv[i], "--xdim") == 0){
1960  if (i+1 >= argc){
1961  usage(argv);
1962  }
1963  xdim= atoi(argv[i+1]);
1964  if (xdim < 0 || xdim > 512){
1965  printf("ERROR: invalid X dimension (%d)\n", xdim);
1966  usage(argv);
1967  }
1968  i++;
1969  ret = 0;
1970  goto out;
1971  }
1972 
1973  if( strcmp(argv[i], "--ydim") == 0){
1974  if (i+1 >= argc){
1975  usage(argv);
1976  }
1977  ydim= atoi(argv[i+1]);
1978  if (ydim < 0 || ydim > 512){
1979  printf("ERROR: invalid T dimension (%d)\n", ydim);
1980  usage(argv);
1981  }
1982  i++;
1983  ret = 0;
1984  goto out;
1985  }
1986 
1987 
1988  if( strcmp(argv[i], "--zdim") == 0){
1989  if (i+1 >= argc){
1990  usage(argv);
1991  }
1992  zdim= atoi(argv[i+1]);
1993  if (zdim < 0 || zdim > 512){
1994  printf("ERROR: invalid T dimension (%d)\n", zdim);
1995  usage(argv);
1996  }
1997  i++;
1998  ret = 0;
1999  goto out;
2000  }
2001 
2002  if( strcmp(argv[i], "--tdim") == 0){
2003  if (i+1 >= argc){
2004  usage(argv);
2005  }
2006  tdim = atoi(argv[i+1]);
2007  if (tdim < 0 || tdim > 512){
2008  printf("Error: invalid t dimension");
2009  usage(argv);
2010  }
2011  i++;
2012  ret = 0;
2013  goto out;
2014  }
2015 
2016  if( strcmp(argv[i], "--sdim") == 0){
2017  if (i+1 >= argc){
2018  usage(argv);
2019  }
2020  int sdim = atoi(argv[i+1]);
2021  if (sdim < 0 || sdim > 512){
2022  printf("ERROR: invalid S dimension\n");
2023  usage(argv);
2024  }
2025  xdim=ydim=zdim=sdim;
2026  i++;
2027  ret = 0;
2028  goto out;
2029  }
2030 
2031  if( strcmp(argv[i], "--Lsdim") == 0){
2032  if (i+1 >= argc){
2033  usage(argv);
2034  }
2035  int Ls = atoi(argv[i+1]);
2036  if (Ls < 0 || Ls > 128){
2037  printf("ERROR: invalid Ls dimension\n");
2038  usage(argv);
2039  }
2040  Lsdim=Ls;
2041  i++;
2042  ret = 0;
2043  goto out;
2044  }
2045 
2046  if( strcmp(argv[i], "--dagger") == 0){
2047  dagger = QUDA_DAG_YES;
2048  ret = 0;
2049  goto out;
2050  }
2051 
2052  if( strcmp(argv[i], "--partition") == 0){
2053  if (i+1 >= argc){
2054  usage(argv);
2055  }
2056 #ifdef MULTI_GPU
2057  int value = atoi(argv[i+1]);
2058  for(int j=0; j < 4;j++){
2059  if (value & (1 << j)){
2061  dim_partitioned[j] = 1;
2062  }
2063  }
2064 #else
2065  printfQuda("WARNING: Ignoring --partition option since this is a single-GPU build.\n");
2066 #endif
2067  i++;
2068  ret = 0;
2069  goto out;
2070  }
2071 
2072  if( strcmp(argv[i], "--kernel-pack-t") == 0){
2073  kernel_pack_t = true;
2074  ret= 0;
2075  goto out;
2076  }
2077 
2078 
2079  if( strcmp(argv[i], "--multishift") == 0){
2080  if (i+1 >= argc){
2081  usage(argv);
2082  }
2083 
2084  if (strcmp(argv[i+1], "true") == 0){
2085  multishift = true;
2086  }else if (strcmp(argv[i+1], "false") == 0){
2087  multishift = false;
2088  }else{
2089  fprintf(stderr, "ERROR: invalid multishift boolean\n");
2090  exit(1);
2091  }
2092 
2093  i++;
2094  ret = 0;
2095  goto out;
2096  }
2097 
2098  if( strcmp(argv[i], "--gridsize") == 0){
2099  if (i+1 >= argc){
2100  usage(argv);
2101  }
2102  int xsize = atoi(argv[i+1]);
2103  if (xsize <= 0 ){
2104  printf("ERROR: invalid X grid size");
2105  usage(argv);
2106  }
2108  i++;
2109 
2110  int ysize = atoi(argv[i+1]);
2111  if (ysize <= 0 ){
2112  printf("ERROR: invalid Y grid size");
2113  usage(argv);
2114  }
2116  i++;
2117 
2118  int zsize = atoi(argv[i+1]);
2119  if (zsize <= 0 ){
2120  printf("ERROR: invalid Z grid size");
2121  usage(argv);
2122  }
2123  gridsize_from_cmdline[2] = zsize;
2124  i++;
2125 
2126  int tsize = atoi(argv[i+1]);
2127  if (tsize <= 0 ){
2128  printf("ERROR: invalid T grid size");
2129  usage(argv);
2130  }
2131  gridsize_from_cmdline[3] = tsize;
2132  i++;
2133 
2134  ret = 0;
2135  goto out;
2136  }
2137 
2138  if( strcmp(argv[i], "--xgridsize") == 0){
2139  if (i+1 >= argc){
2140  usage(argv);
2141  }
2142  int xsize = atoi(argv[i+1]);
2143  if (xsize <= 0 ){
2144  printf("ERROR: invalid X grid size");
2145  usage(argv);
2146  }
2148  i++;
2149  ret = 0;
2150  goto out;
2151  }
2152 
2153  if( strcmp(argv[i], "--ygridsize") == 0){
2154  if (i+1 >= argc){
2155  usage(argv);
2156  }
2157  int ysize = atoi(argv[i+1]);
2158  if (ysize <= 0 ){
2159  printf("ERROR: invalid Y grid size");
2160  usage(argv);
2161  }
2163  i++;
2164  ret = 0;
2165  goto out;
2166  }
2167 
2168  if( strcmp(argv[i], "--zgridsize") == 0){
2169  if (i+1 >= argc){
2170  usage(argv);
2171  }
2172  int zsize = atoi(argv[i+1]);
2173  if (zsize <= 0 ){
2174  printf("ERROR: invalid Z grid size");
2175  usage(argv);
2176  }
2177  gridsize_from_cmdline[2] = zsize;
2178  i++;
2179  ret = 0;
2180  goto out;
2181  }
2182 
2183  if( strcmp(argv[i], "--tgridsize") == 0){
2184  if (i+1 >= argc){
2185  usage(argv);
2186  }
2187  int tsize = atoi(argv[i+1]);
2188  if (tsize <= 0 ){
2189  printf("ERROR: invalid T grid size");
2190  usage(argv);
2191  }
2192  gridsize_from_cmdline[3] = tsize;
2193  i++;
2194  ret = 0;
2195  goto out;
2196  }
2197 
2198  if( strcmp(argv[i], "--rank-order") == 0){
2199  if (i+1 >= argc){
2200  usage(argv);
2201  }
2202  rank_order = get_rank_order(argv[i+1]);
2203  i++;
2204  ret = 0;
2205  goto out;
2206  }
2207 
2208  if( strcmp(argv[i], "--dslash-type") == 0){
2209  if (i+1 >= argc){
2210  usage(argv);
2211  }
2212  dslash_type = get_dslash_type(argv[i+1]);
2213  i++;
2214  ret = 0;
2215  goto out;
2216  }
2217 
2218  if( strcmp(argv[i], "--flavor") == 0){
2219  if (i+1 >= argc){
2220  usage(argv);
2221  }
2222  twist_flavor = get_flavor_type(argv[i+1]);
2223  i++;
2224  ret = 0;
2225  goto out;
2226  }
2227 
2228  if( strcmp(argv[i], "--inv-type") == 0){
2229  if (i+1 >= argc){
2230  usage(argv);
2231  }
2232  inv_type = get_solver_type(argv[i+1]);
2233  i++;
2234  ret = 0;
2235  goto out;
2236  }
2237 
2238  if( strcmp(argv[i], "--precon-type") == 0){
2239  if (i+1 >= argc){
2240  usage(argv);
2241  }
2242  precon_type = get_solver_type(argv[i+1]);
2243  i++;
2244  ret = 0;
2245  goto out;
2246  }
2247 
2248  if( strcmp(argv[i], "--mass") == 0){
2249  if (i+1 >= argc){
2250  usage(argv);
2251  }
2252  mass = atof(argv[i+1]);
2253  i++;
2254  ret = 0;
2255  goto out;
2256  }
2257 
2258  if( strcmp(argv[i], "--compute-clover") == 0){
2259  if (i+1 >= argc){
2260  usage(argv);
2261  }
2262  if (strcmp(argv[i+1], "true") == 0){
2263  compute_clover = true;
2264  }else if (strcmp(argv[i+1], "false") == 0){
2265  compute_clover = false;
2266  }else{
2267  fprintf(stderr, "ERROR: invalid compute_clover type\n");
2268  exit(1);
2269  }
2270 
2271  i++;
2272  ret = 0;
2273  goto out;
2274  }
2275 
2276  if( strcmp(argv[i], "--clover-coeff") == 0){
2277  if (i+1 >= argc){
2278  usage(argv);
2279  }
2280  clover_coeff = atof(argv[i+1]);
2281  i++;
2282  ret = 0;
2283  goto out;
2284  }
2285 
2286  if( strcmp(argv[i], "--mu") == 0){
2287  if (i+1 >= argc){
2288  usage(argv);
2289  }
2290  mu = atof(argv[i+1]);
2291  i++;
2292  ret = 0;
2293  goto out;
2294  }
2295 
2296  if( strcmp(argv[i], "--anisotropy") == 0){
2297  if (i+1 >= argc){
2298  usage(argv);
2299  }
2300  anisotropy = atof(argv[i+1]);
2301  i++;
2302  ret = 0;
2303  goto out;
2304  }
2305 
2306  if( strcmp(argv[i], "--tol") == 0){
2307  if (i+1 >= argc){
2308  usage(argv);
2309  }
2310  tol= atof(argv[i+1]);
2311  i++;
2312  ret = 0;
2313  goto out;
2314  }
2315 
2316  if( strcmp(argv[i], "--tolhq") == 0){
2317  if (i+1 >= argc){
2318  usage(argv);
2319  }
2320  tol_hq= atof(argv[i+1]);
2321  i++;
2322  ret = 0;
2323  goto out;
2324  }
2325 
2326  if( strcmp(argv[i], "--mass-normalization") == 0){
2327  if (i+1 >= argc){
2328  usage(argv);
2329  }
2331  i++;
2332  ret = 0;
2333  goto out;
2334  }
2335 
2336  if( strcmp(argv[i], "--matpc") == 0){
2337  if (i+1 >= argc){
2338  usage(argv);
2339  }
2340  matpc_type = get_matpc_type(argv[i+1]);
2341  i++;
2342  ret = 0;
2343  goto out;
2344  }
2345 
2346  if( strcmp(argv[i], "--solve-type") == 0){
2347  if (i+1 >= argc){
2348  usage(argv);
2349  }
2350  solve_type = get_solve_type(argv[i+1]);
2351  i++;
2352  ret = 0;
2353  goto out;
2354  }
2355 
2356  if( strcmp(argv[i], "--load-gauge") == 0){
2357  if (i+1 >= argc){
2358  usage(argv);
2359  }
2360  strcpy(latfile, argv[i+1]);
2361  i++;
2362  ret = 0;
2363  goto out;
2364  }
2365 
2366  if( strcmp(argv[i], "--nsrc") == 0){
2367  if (i+1 >= argc){
2368  usage(argv);
2369  }
2370  Nsrc = atoi(argv[i+1]);
2371  if (Nsrc < 1 || Nsrc > 128){
2372  printf("ERROR: invalid number of sources (Nsrc=%d)\n", Nsrc);
2373  usage(argv);
2374  }
2375  i++;
2376  ret = 0;
2377  goto out;
2378  }
2379 
2380  if( strcmp(argv[i], "--msrc") == 0){
2381  if (i+1 >= argc){
2382  usage(argv);
2383  }
2384  Msrc = atoi(argv[i+1]);
2385  if (Msrc < 1 || Msrc > 128){
2386  printf("ERROR: invalid number of sources (Msrc=%d)\n", Msrc);
2387  usage(argv);
2388  }
2389  i++;
2390  ret = 0;
2391  goto out;
2392  }
2393 
2394  if( strcmp(argv[i], "--test") == 0){
2395  if (i+1 >= argc){
2396  usage(argv);
2397  }
2398  test_type = atoi(argv[i+1]);
2399  i++;
2400  ret = 0;
2401  goto out;
2402  }
2403 
2404  if( strcmp(argv[i], "--mg-nvec") == 0){
2405  if (i+1 >= argc){
2406  usage(argv);
2407  }
2408  int level = atoi(argv[i+1]);
2409  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2410  printf("ERROR: invalid multigrid level %d", level);
2411  usage(argv);
2412  }
2413  i++;
2414 
2415  nvec[level] = atoi(argv[i+1]);
2416  if (nvec[level] < 0 || nvec[level] > 128){
2417  printf("ERROR: invalid number of vectors (%d)\n", nvec[level]);
2418  usage(argv);
2419  }
2420  i++;
2421  ret = 0;
2422  goto out;
2423  }
2424 
2425  if( strcmp(argv[i], "--mg-levels") == 0){
2426  if (i+1 >= argc){
2427  usage(argv);
2428  }
2429  mg_levels= atoi(argv[i+1]);
2430  if (mg_levels < 2 || mg_levels > QUDA_MAX_MG_LEVEL){
2431  printf("ERROR: invalid number of multigrid levels (%d)\n", mg_levels);
2432  usage(argv);
2433  }
2434  i++;
2435  ret = 0;
2436  goto out;
2437  }
2438 
2439  if( strcmp(argv[i], "--mg-nu-pre") == 0){
2440  if (i+1 >= argc){
2441  usage(argv);
2442  }
2443  nu_pre= atoi(argv[i+1]);
2444  if (nu_pre < 0 || nu_pre > 20){
2445  printf("ERROR: invalid pre-smoother applications value (nu_pre=%d)\n", nu_pre);
2446  usage(argv);
2447  }
2448  i++;
2449  ret = 0;
2450  goto out;
2451  }
2452 
2453  if( strcmp(argv[i], "--mg-nu-post") == 0){
2454  if (i+1 >= argc){
2455  usage(argv);
2456  }
2457  nu_post= atoi(argv[i+1]);
2458  if (nu_post < 0 || nu_post > 20){
2459  printf("ERROR: invalid pre-smoother applications value (nu_pist=%d)\n", nu_post);
2460  usage(argv);
2461  }
2462  i++;
2463  ret = 0;
2464  goto out;
2465  }
2466 
2467  if( strcmp(argv[i], "--mg-setup-inv") == 0){
2468  if (i+1 >= argc){
2469  usage(argv);
2470  }
2471  int level = atoi(argv[i+1]);
2472  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2473  printf("ERROR: invalid multigrid level %d", level);
2474  usage(argv);
2475  }
2476  i++;
2477 
2478  setup_inv[level] = get_solver_type(argv[i+1]);
2479  i++;
2480  ret = 0;
2481  goto out;
2482  }
2483 
2484  if( strcmp(argv[i], "--mg-setup-tol") == 0){
2485  if (i+1 >= argc){
2486  usage(argv);
2487  }
2488 
2489  setup_tol = atof(argv[i+1]);
2490  i++;
2491  ret = 0;
2492  goto out;
2493  }
2494 
2495  if( strcmp(argv[i], "--mg-omega") == 0){
2496  if (i+1 >= argc){
2497  usage(argv);
2498  }
2499 
2500  omega = atof(argv[i+1]);
2501  i++;
2502  ret = 0;
2503  goto out;
2504  }
2505 
2506  if( strcmp(argv[i], "--mg-verbosity") == 0){
2507  if (i+1 >= argc){
2508  usage(argv);
2509  }
2510  int level = atoi(argv[i+1]);
2511  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2512  printf("ERROR: invalid multigrid level %d", level);
2513  usage(argv);
2514  }
2515  i++;
2516 
2517  mg_verbosity[level] = get_verbosity_type(argv[i+1]);
2518  i++;
2519  ret = 0;
2520  goto out;
2521  }
2522 
2523  if( strcmp(argv[i], "--mg-smoother") == 0){
2524  if (i+1 >= argc){
2525  usage(argv);
2526  }
2527  smoother_type = get_solver_type(argv[i+1]);
2528  i++;
2529  ret = 0;
2530  goto out;
2531  }
2532 
2533  if( strcmp(argv[i], "--mg-block-size") == 0){
2534  if (i+1 >= argc){
2535  usage(argv);
2536  }
2537  int level = atoi(argv[i+1]);
2538  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2539  printf("ERROR: invalid multigrid level %d", level);
2540  usage(argv);
2541  }
2542  i++;
2543 
2544  int xsize = atoi(argv[i+1]);
2545  if (xsize <= 0 ){
2546  printf("ERROR: invalid X block size");
2547  usage(argv);
2548  }
2549  geo_block_size[level][0] = xsize;
2550  i++;
2551 
2552  int ysize = atoi(argv[i+1]);
2553  if (ysize <= 0 ){
2554  printf("ERROR: invalid Y block size");
2555  usage(argv);
2556  }
2557  geo_block_size[level][1] = ysize;
2558  i++;
2559 
2560  int zsize = atoi(argv[i+1]);
2561  if (zsize <= 0 ){
2562  printf("ERROR: invalid Z block size");
2563  usage(argv);
2564  }
2565  geo_block_size[level][2] = zsize;
2566  i++;
2567 
2568  int tsize = atoi(argv[i+1]);
2569  if (tsize <= 0 ){
2570  printf("ERROR: invalid T block size");
2571  usage(argv);
2572  }
2573  geo_block_size[level][3] = tsize;
2574  i++;
2575 
2576  ret = 0;
2577  goto out;
2578  }
2579 
2580  if( strcmp(argv[i], "--mass") == 0){
2581  if (i+1 >= argc){
2582  usage(argv);
2583  }
2584  mass= atof(argv[i+1]);
2585  i++;
2586  ret = 0;
2587  goto out;
2588  }
2589 
2590  if( strcmp(argv[i], "--mg-mu-factor") == 0){
2591  if (i+1 >= argc){
2592  usage(argv);
2593  }
2594  int level = atoi(argv[i+1]);
2595  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2596  printf("ERROR: invalid multigrid level %d", level);
2597  usage(argv);
2598  }
2599  i++;
2600 
2601  double factor = atof(argv[i+1]);
2602  mu_factor[level] = factor;
2603  i++;
2604  ret=0;
2605  goto out;
2606  }
2607 
2608  if( strcmp(argv[i], "--mg-generate-nullspace") == 0){
2609  if (i+1 >= argc){
2610  usage(argv);
2611  }
2612 
2613  if (strcmp(argv[i+1], "true") == 0){
2614  generate_nullspace = true;
2615  }else if (strcmp(argv[i+1], "false") == 0){
2616  generate_nullspace = false;
2617  }else{
2618  fprintf(stderr, "ERROR: invalid generate nullspace type\n");
2619  exit(1);
2620  }
2621 
2622  i++;
2623  ret = 0;
2624  goto out;
2625  }
2626 
2627  if( strcmp(argv[i], "--mg-generate-all-levels") == 0){
2628  if (i+1 >= argc){
2629  usage(argv);
2630  }
2631 
2632  if (strcmp(argv[i+1], "true") == 0){
2633  generate_all_levels = true;
2634  }else if (strcmp(argv[i+1], "false") == 0){
2635  generate_all_levels = false;
2636  }else{
2637  fprintf(stderr, "ERROR: invalid value for generate_all_levels type\n");
2638  exit(1);
2639  }
2640 
2641  i++;
2642  ret = 0;
2643  goto out;
2644  }
2645 
2646  if( strcmp(argv[i], "--mg-load-vec") == 0){
2647  if (i+1 >= argc){
2648  usage(argv);
2649  }
2650  strcpy(vec_infile, argv[i+1]);
2651  i++;
2652  ret = 0;
2653  goto out;
2654  }
2655 
2656  if( strcmp(argv[i], "--mg-save-vec") == 0){
2657  if (i+1 >= argc){
2658  usage(argv);
2659  }
2660  strcpy(vec_outfile, argv[i+1]);
2661  i++;
2662  ret = 0;
2663  goto out;
2664  }
2665 
2666  if( strcmp(argv[i], "--df-nev") == 0){
2667  if (i+1 >= argc){
2668  usage(argv);
2669  }
2670 
2671  nev = atoi(argv[i+1]);
2672  i++;
2673  ret = 0;
2674  goto out;
2675  }
2676 
2677  if( strcmp(argv[i], "--df-max-search-dim") == 0){
2678  if (i+1 >= argc){
2679  usage(argv);
2680  }
2681 
2682  max_search_dim = atoi(argv[i+1]);
2683  i++;
2684  ret = 0;
2685  goto out;
2686  }
2687 
2688  if( strcmp(argv[i], "--df-deflation-grid") == 0){
2689  if (i+1 >= argc){
2690  usage(argv);
2691  }
2692 
2693  deflation_grid = atoi(argv[i+1]);
2694  i++;
2695  ret = 0;
2696  goto out;
2697  }
2698 
2699 
2700  if( strcmp(argv[i], "--df-eigcg-max-restarts") == 0){
2701  if (i+1 >= argc){
2702  usage(argv);
2703  }
2704 
2705  eigcg_max_restarts = atoi(argv[i+1]);
2706  i++;
2707  ret = 0;
2708  goto out;
2709  }
2710 
2711  if( strcmp(argv[i], "--df-max-restart-num") == 0){
2712  if (i+1 >= argc){
2713  usage(argv);
2714  }
2715 
2716  max_restart_num = atoi(argv[i+1]);
2717  i++;
2718  ret = 0;
2719  goto out;
2720  }
2721 
2722 
2723  if( strcmp(argv[i], "--df-tol-restart") == 0){
2724  if (i+1 >= argc){
2725  usage(argv);
2726  }
2727 
2728  tol_restart = atof(argv[i+1]);
2729  i++;
2730  ret = 0;
2731  goto out;
2732  }
2733 
2734 
2735  if( strcmp(argv[i], "--df-tol-eigenval") == 0){
2736  if (i+1 >= argc){
2737  usage(argv);
2738  }
2739 
2740  eigenval_tol = atof(argv[i+1]);
2741  i++;
2742  ret = 0;
2743  goto out;
2744  }
2745 
2746  if( strcmp(argv[i], "--df-tol-inc") == 0){
2747  if (i+1 >= argc){
2748  usage(argv);
2749  }
2750 
2751  inc_tol = atof(argv[i+1]);
2752  i++;
2753  ret = 0;
2754  goto out;
2755  }
2756 
2757  if( strcmp(argv[i], "--solver-ext-lib-type") == 0){
2758  if (i+1 >= argc){
2759  usage(argv);
2760  }
2762  i++;
2763  ret = 0;
2764  goto out;
2765  }
2766 
2767  if( strcmp(argv[i], "--df-ext-lib-type") == 0){
2768  if (i+1 >= argc){
2769  usage(argv);
2770  }
2772  i++;
2773  ret = 0;
2774  goto out;
2775  }
2776 
2777  if( strcmp(argv[i], "--df-location-ritz") == 0){
2778  if (i+1 >= argc){
2779  usage(argv);
2780  }
2782  i++;
2783  ret = 0;
2784  goto out;
2785  }
2786 
2787  if( strcmp(argv[i], "--df-mem-type-ritz") == 0){
2788  if (i+1 >= argc){
2789  usage(argv);
2790  }
2792  i++;
2793  ret = 0;
2794  goto out;
2795  }
2796 
2797  if( strcmp(argv[i], "--niter") == 0){
2798  if (i+1 >= argc){
2799  usage(argv);
2800  }
2801  niter= atoi(argv[i+1]);
2802  if (niter < 1 || niter > 1e6){
2803  printf("ERROR: invalid number of iterations (%d)\n", niter);
2804  usage(argv);
2805  }
2806  i++;
2807  ret = 0;
2808  goto out;
2809  }
2810 
2811  if( strcmp(argv[i], "--ngcrkrylov") == 0){
2812  if (i+1 >= argc){
2813  usage(argv);
2814  }
2815  gcrNkrylov = atoi(argv[i+1]);
2816  if (gcrNkrylov < 1 || gcrNkrylov > 1e6){
2817  printf("ERROR: invalid number of gcrkrylov iterations (%d)\n", gcrNkrylov);
2818  usage(argv);
2819  }
2820  i++;
2821  ret = 0;
2822  goto out;
2823  }
2824 
2825  if( strcmp(argv[i], "--pipeline") == 0){
2826  if (i+1 >= argc){
2827  usage(argv);
2828  }
2829  pipeline = atoi(argv[i+1]);
2830  if (pipeline < 0 || pipeline > 8){
2831  printf("ERROR: invalid pipeline length (%d)\n", pipeline);
2832  usage(argv);
2833  }
2834  i++;
2835  ret = 0;
2836  goto out;
2837  }
2838 
2839  if( strcmp(argv[i], "--solution-pipeline") == 0){
2840  if (i+1 >= argc){
2841  usage(argv);
2842  }
2844  if (solution_accumulator_pipeline < 0 || solution_accumulator_pipeline > 16){
2845  printf("ERROR: invalid solution pipeline length (%d)\n", solution_accumulator_pipeline);
2846  usage(argv);
2847  }
2848  i++;
2849  ret = 0;
2850  goto out;
2851  }
2852 
2853  if( strcmp(argv[i], "--version") == 0){
2854  printf("This program is linked with QUDA library, version %s,",
2855  get_quda_ver_str());
2856  printf(" %s GPU build\n", msg);
2857  exit(0);
2858  }
2859 
2860  out:
2861  *idx = i;
2862  return ret ;
2863 
2864 }
2865 
2866 
2867 static struct timeval startTime;
2868 
2870  gettimeofday(&startTime, NULL);
2871 }
2872 
2874  struct timeval endTime;
2875  gettimeofday(&endTime, NULL);
2876 
2877  long ds = endTime.tv_sec - startTime.tv_sec;
2878  long dus = endTime.tv_usec - startTime.tv_usec;
2879  return ds + 0.000001*dus;
2880 }
2881 
2882 
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
Definition: test_util.cpp:212
QudaPrecision prec_precondition
Definition: test_util.cpp:1617
static void orthogonalize(complex< Float > *a, complex< Float > *b, int len)
Definition: test_util.cpp:909
int dimPartitioned(int dim)
Definition: test_util.cpp:1686
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:765
enum QudaMassNormalization_s QudaMassNormalization
double anisotropy
Definition: quda.h:31
bool generate_all_levels
Definition: test_util.cpp:1666
int Vh_ex
Definition: test_util.cpp:37
void free(void *)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1054
static void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
Definition: test_util.cpp:876
int compareLink(Float **linkA, Float **linkB, int len)
Definition: test_util.cpp:1359
enum QudaPrecision_s QudaPrecision
_EXTERN_C_ int MPI_Init(int *argc, char ***argv)
Definition: nvtx_pmpi.c:34
QudaTwistFlavorType get_flavor_type(char *s)
Definition: misc.cpp:1070
int max_restart_num
Definition: test_util.cpp:1675
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:167
static void printVector(Float *v)
Definition: test_util.cpp:199
float fabsf(float)
int getOddBit(int Y)
Definition: test_util.cpp:228
int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]
Definition: test_util.cpp:1668
QudaGaugeFixed gauge_fix
Definition: quda.h:51
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
int Vs_z
Definition: test_util.cpp:30
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
Definition: test_util.cpp:279
int zdim
Definition: test_util.cpp:1622
__darwin_time_t tv_sec
#define TUP
Definition: test_util.cpp:24
const void * func
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
QudaLinkType type
Definition: quda.h:35
int niter
Definition: test_util.cpp:1630
int mySpinorSiteSize
Definition: test_util.cpp:43
#define errorQuda(...)
Definition: util_quda.h:90
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
Definition: test_util.cpp:204
#define ZUP
Definition: test_util.cpp:23
void createHwCPU(void *hw, QudaPrecision precision)
Definition: test_util.cpp:1492
double eigenval_tol
Definition: test_util.cpp:1677
int multishift
Definition: test_util.cpp:1640
int nvec[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1635
void createMomCPU(void *mom, QudaPrecision precision)
Definition: test_util.cpp:1454
int device
Definition: test_util.cpp:1609
enum QudaSolveType_s QudaSolveType
void commDimPartitionedSet(int dir)
int commCoords(int)
int V_ex
Definition: test_util.cpp:37
int Vs_y
Definition: test_util.cpp:30
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1795
int mg_levels
Definition: test_util.cpp:1655
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static int rank
Definition: comm_mpi.cpp:42
void complexDotProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:259
QudaFieldLocation location_ritz
Definition: test_util.cpp:1681
double cos(double)
double anisotropy
Definition: test_util.cpp:1644
cudaMipmappedArray_const_t unsigned int level
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
Definition: test_util.cpp:697
static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
Definition: test_util.cpp:719
char * strcpy(char *__dst, const char *__src)
bool kernel_pack_t
Definition: test_util.cpp:1650
int pipeline
Definition: test_util.cpp:1632
int Msrc
Definition: test_util.cpp:1629
STL namespace.
void finalizeComms()
Definition: test_util.cpp:107
double pow(double, double)
int tdim
Definition: test_util.cpp:1623
double atan2(double, double)
static struct timeval startTime
Definition: test_util.cpp:2867
void su3Construct8(Float *mat)
Definition: test_util.cpp:297
void stopwatchStart()
Definition: test_util.cpp:2869
void usage_extra(char **argv)
void constructUnitaryGaugeField(Float **res)
Definition: test_util.cpp:995
int gridsize_from_cmdline[4]
Definition: test_util.cpp:50
int E1
Definition: test_util.cpp:35
int ydim
Definition: test_util.cpp:1621
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:437
int get_rank_order(char *s)
Definition: misc.cpp:828
double mu_factor[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1659
static int compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
Definition: test_util.cpp:1397
QudaVerbosity mg_verbosity[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1660
QudaReconstructType get_recon(char *s)
Definition: misc.cpp:660
QudaInverterType precon_type
Definition: test_util.cpp:1639
int nu_pre
Definition: test_util.cpp:1657
int Nsrc
Definition: test_util.cpp:1628
static void printMomElement(void *mom, int X, QudaPrecision precision)
Definition: test_util.cpp:1553
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1229
int Vsh_y
Definition: test_util.cpp:31
void exit(int) __attribute__((noreturn))
int gcrNkrylov
Definition: test_util.cpp:1631
QudaFieldLocation get_df_location_ritz(char *s)
Definition: misc.cpp:1261
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
__darwin_suseconds_t tv_usec
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: test_util.cpp:442
static size_t gSize
Definition: llfat_test.cpp:36
int Ls
Definition: test_util.cpp:39
QudaSolveType get_solve_type(char *s)
Definition: misc.cpp:1013
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
QudaReconstructType link_recon_precondition
Definition: test_util.cpp:1614
else return(__swbuf(_c, _p))
int Vsh_t
Definition: test_util.cpp:31
QudaExtLibType get_solve_ext_lib_type(char *s)
Definition: misc.cpp:1244
int strcmp(const char *__s1, const char *__s2)
int compare_mom(Float *momA, Float *momB, int len)
Definition: test_util.cpp:1517
void * longlink[4]
double stopwatchReadSeconds()
Definition: test_util.cpp:2873
int xdim
Definition: test_util.cpp:1620
int Vs_x
Definition: test_util.cpp:30
double tol
Definition: test_util.cpp:1647
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
QudaInverterType get_solver_type(char *s)
Definition: misc.cpp:1117
int printf(const char *,...) __attribute__((__format__(__printf__
double mass
Definition: test_util.cpp:1642
void srand(unsigned) __attribute__((__availability__(swift
static void * hw
static int dim_partitioned[4]
Definition: test_util.cpp:1684
QudaInverterType setup_inv[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1661
void setSpinorSiteSize(int n)
Definition: test_util.cpp:192
__host__ __device__ void sum(double &a, double &b)
int V5h
Definition: test_util.cpp:41
bool generate_nullspace
Definition: test_util.cpp:1665
QudaDagType dagger
Definition: test_util.cpp:1625
QudaDslashType get_dslash_type(char *s)
Definition: misc.cpp:845
void complexAddTo(Float *a, Float *b)
Definition: test_util.cpp:238
int int int w
enum QudaMatPCType_s QudaMatPCType
#define gaugeSiteSize
Definition: test_util.h:6
int commDim(int)
static int compareFloats(Float *a, Float *b, int len, double epsilon)
Definition: test_util.cpp:426
double tol_restart
Definition: test_util.cpp:1672
double sqrt(double)
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
Definition: test_util.cpp:266
int test_type
Definition: test_util.cpp:1634
double clover_coeff
Definition: test_util.cpp:1645
QudaReconstructType link_recon_sloppy
Definition: test_util.cpp:1613
int int int enum cudaChannelFormatKind f
static void constructGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type=QUDA_WILSON_DSLASH)
Definition: test_util.cpp:916
QudaMatPCType get_matpc_type(char *s)
Definition: misc.cpp:966
#define YUP
Definition: test_util.cpp:22
QudaPrecision prec_ritz
Definition: test_util.cpp:1618
double setup_tol
Definition: test_util.cpp:1662
int max_search_dim
Definition: test_util.cpp:1670
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
enum QudaDagType_s QudaDagType
void su3Construct12(Float *mat)
Definition: test_util.cpp:285
static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
Definition: test_util.cpp:335
QudaReconstructType reconstruct
Definition: quda.h:43
void __attribute__((weak)) usage_extra(char **argv)
Definition: test_util.cpp:1691
int X[4]
Definition: quda.h:29
void complexProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:245
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:492
static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
Definition: test_util.cpp:318
int Z[4]
Definition: test_util.cpp:27
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
Definition: test_util.cpp:1565
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:1219
static void printLinkElement(void *link, int X, QudaPrecision precision)
Definition: test_util.cpp:1413
QudaTwistFlavorType twist_flavor
Definition: test_util.cpp:1649
#define hwSiteSize
Definition: test_util.h:10
int rand(void) __attribute__((__availability__(swift
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
Definition: test_util.cpp:1428
QudaMemoryType get_df_mem_type_ritz(char *s)
Definition: misc.cpp:1279
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:527
char vec_outfile[256]
Definition: test_util.cpp:1637
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1069
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
Definition: test_util.cpp:1166
double tol_hq
Definition: test_util.cpp:1648
QudaDslashType dslash_type
Definition: test_util.cpp:1626
QudaExtLibType solver_ext_lib
Definition: test_util.cpp:1679
int nu_post
Definition: test_util.cpp:1658
double inc_tol
Definition: test_util.cpp:1676
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:611
enum QudaFieldLocation_s QudaFieldLocation
int eigcg_max_restarts
Definition: test_util.cpp:1674
bool compute_clover
Definition: test_util.cpp:1646
int fullLatticeIndex_5d(int i, int oddBit)
Definition: test_util.cpp:692
double tadpole_coeff
Definition: quda.h:32
cpuColorSpinorField * out
double atof(const char *)
int solution_accumulator_pipeline
Definition: test_util.cpp:1633
void setDims(int *X)
Definition: test_util.cpp:130
int E[4]
Definition: test_util.cpp:36
void * fatlink[4]
QudaExtLibType deflation_ext_lib
Definition: test_util.cpp:1680
enum QudaReconstructType_s QudaReconstructType
void initRand()
Definition: test_util.cpp:117
int E2
Definition: test_util.cpp:35
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
int fullLatticeIndex_4d(int i, int oddBit)
Definition: test_util.cpp:658
void usage(char **argv)
Definition: test_util.cpp:1693
#define printfQuda(...)
Definition: util_quda.h:84
QudaTboundary t_boundary
Definition: quda.h:38
int x4_from_full_index(int i)
Definition: test_util.cpp:703
QudaMassNormalization get_mass_normalization_type(char *s)
Definition: misc.cpp:924
static void normalize(complex< Float > *a, int len)
Definition: test_util.cpp:901
int atoi(const char *)
double fabs(double)
int full_idx
QudaInverterType inv_type
Definition: test_util.cpp:1638
bool verify_results
Definition: test_util.cpp:1641
QudaMassNormalization normalization
Definition: test_util.cpp:1651
int V5
Definition: test_util.cpp:40
enum QudaDslashType_s QudaDslashType
int Lsdim
Definition: test_util.cpp:1624
QudaPrecision prec
Definition: test_util.cpp:1615
int faceVolume[4]
Definition: test_util.cpp:32
QudaSolveType solve_type
Definition: test_util.cpp:1653
int Vsh_z
Definition: test_util.cpp:31
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:273
static int lex_rank_from_coords_t(const int *coords, void *fdata)
Definition: test_util.cpp:52
const void * c
__host__ __device__ complex< ValueType > polar(const ValueType &m, const ValueType &theta=0)
Returns the complex with magnitude m and angle theta in radians.
Definition: complex_quda.h:902
int Vs_t
Definition: test_util.cpp:30
enum QudaVerbosity_s QudaVerbosity
QudaInverterType smoother_type
Definition: test_util.cpp:1664
QudaMatPCType matpc_type
Definition: test_util.cpp:1652
char vec_infile[256]
Definition: test_util.cpp:1636
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaMemoryType mem_type_ritz
Definition: test_util.cpp:1682
int E4
Definition: test_util.cpp:35
static void constructCloverField(Float *res, double norm, double diag)
Definition: test_util.cpp:1137
QudaVerbosity get_verbosity_type(char *s)
Definition: misc.cpp:613
static int lex_rank_from_coords_x(const int *coords, void *fdata)
Definition: test_util.cpp:61
char latfile[256]
Definition: test_util.cpp:1627
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
double omega
Definition: test_util.cpp:1663
int E3
Definition: test_util.cpp:35
double sin(double)
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
Definition: test_util.cpp:303
const char * get_quda_ver_str()
Definition: misc.cpp:1229
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:393
QudaPrecision prec_sloppy
Definition: test_util.cpp:1616
#define a
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:72
double mu
Definition: test_util.cpp:1643
float fat_link_max
int Vh
Definition: test_util.cpp:29
static int rank_order
Definition: test_util.cpp:70
enum QudaInverterType_s QudaInverterType
QudaPrecision get_prec(QIO_Reader *infile)
Definition: qio_field.cpp:72
enum QudaMemoryType_s QudaMemoryType
cpuColorSpinorField * spinor
Definition: covdev_test.cpp:41
static void checkGauge(Float **oldG, Float **newG, double epsilon)
Definition: test_util.cpp:1185
int V
Definition: test_util.cpp:28
int comm_dim_partitioned(int dim)
enum QudaExtLibType_s QudaExtLibType
int Vsh_x
Definition: test_util.cpp:31
#define XUP
Definition: test_util.cpp:21
void complexConjugateProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:252
int E1h
Definition: test_util.cpp:35
int nev
Definition: test_util.cpp:1669
int deflation_grid
Definition: test_util.cpp:1671
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:563
#define momSiteSize
Definition: test_util.h:9
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56
QudaReconstructType link_recon
Definition: test_util.cpp:1612
enum QudaTwistFlavorType_s QudaTwistFlavorType