QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
test_util.cpp
Go to the documentation of this file.
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 #include <comm_quda.h>
8 
9 // This contains the appropriate ifdef guards already
10 #include <mpi_comm_handle.h>
11 
13 #include <test_util.h>
14 
15 #include <dslash_quda.h>
16 #include "misc.h"
17 
18 using namespace std;
19 
20 #define XUP 0
21 #define YUP 1
22 #define ZUP 2
23 #define TUP 3
24 
25 
26 int Z[4];
27 int V;
28 int Vh;
29 int Vs_x, Vs_y, Vs_z, Vs_t;
31 int faceVolume[4];
32 
33 //extended volume, +4
34 int E1, E1h, E2, E3, E4;
35 int E[4];
36 int V_ex, Vh_ex;
37 
38 int Ls;
39 int V5;
40 int V5h;
41 
43 
44 extern float fat_link_max;
45 
49 int gridsize_from_cmdline[4] = {1,1,1,1};
50 
51 void get_gridsize_from_env(int *const dims)
52 {
53  char *grid_size_env = getenv("QUDA_TEST_GRID_SIZE");
54  if (grid_size_env) {
55  std::stringstream grid_list(grid_size_env);
56 
57  int dim;
58  int i = 0;
59  while (grid_list >> dim) {
60  if (i >= 4) errorQuda("Unexpected grid size array length");
61  dims[i] = dim;
62  if (grid_list.peek() == ',') grid_list.ignore();
63  i++;
64  }
65  }
66 }
67 
68 static int lex_rank_from_coords_t(const int *coords, void *fdata)
69 {
70  int rank = coords[0];
71  for (int i = 1; i < 4; i++) {
72  rank = gridsize_from_cmdline[i] * rank + coords[i];
73  }
74  return rank;
75 }
76 
77 static int lex_rank_from_coords_x(const int *coords, void *fdata)
78 {
79  int rank = coords[3];
80  for (int i = 2; i >= 0; i--) {
81  rank = gridsize_from_cmdline[i] * rank + coords[i];
82  }
83  return rank;
84 }
85 
86 static int rank_order = 0;
87 
88 void initComms(int argc, char **argv, int *const commDims)
89 {
90  if (getenv("QUDA_TEST_GRID_SIZE")) get_gridsize_from_env(commDims);
91 
92 #if defined(QMP_COMMS)
93  QMP_thread_level_t tl;
94  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
95 
96  // make sure the QMP logical ordering matches QUDA's
97  if (rank_order == 0) {
98  int map[] = {3, 2, 1, 0};
99  QMP_declare_logical_topology_map(commDims, 4, map, 4);
100  } else {
101  int map[] = { 0, 1, 2, 3 };
102  QMP_declare_logical_topology_map(commDims, 4, map, 4);
103  }
104 #elif defined(MPI_COMMS)
105  MPI_Init(&argc, &argv);
106 #endif
107 
109 
110  initCommsGridQuda(4, commDims, func, NULL);
111  initRand();
112 
113  printfQuda("Rank order is %s major (%s running fastest)\n",
114  rank_order == 0 ? "column" : "row", rank_order == 0 ? "t" : "x");
115 
116 }
117 
119 {
120  // only apply T-boundary at edge nodes
121 #ifdef MULTI_GPU
122  return commCoords(3) == commDim(3)-1;
123 #else
124  return true;
125 #endif
126 }
127 
129 {
130 #if defined(QMP_COMMS)
131  QMP_finalize_msg_passing();
132 #elif defined(MPI_COMMS)
133  MPI_Finalize();
134 #endif
135 }
136 
137 
138 void initRand()
139 {
140  int rank = 0;
141 
142 #if defined(QMP_COMMS)
143  rank = QMP_get_node_number();
144 #elif defined(MPI_COMMS)
145  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
146 #endif
147 
148  srand(17*rank + 137);
149 }
150 
151 void setDims(int *X) {
152  V = 1;
153  for (int d=0; d< 4; d++) {
154  V *= X[d];
155  Z[d] = X[d];
156 
157  faceVolume[d] = 1;
158  for (int i=0; i<4; i++) {
159  if (i==d) continue;
160  faceVolume[d] *= X[i];
161  }
162  }
163  Vh = V/2;
164 
165  Vs_x = X[1]*X[2]*X[3];
166  Vs_y = X[0]*X[2]*X[3];
167  Vs_z = X[0]*X[1]*X[3];
168  Vs_t = X[0]*X[1]*X[2];
169 
170  Vsh_x = Vs_x/2;
171  Vsh_y = Vs_y/2;
172  Vsh_z = Vs_z/2;
173  Vsh_t = Vs_t/2;
174 
175 
176  E1=X[0]+4; E2=X[1]+4; E3=X[2]+4; E4=X[3]+4;
177  E1h=E1/2;
178  E[0] = E1;
179  E[1] = E2;
180  E[2] = E3;
181  E[3] = E4;
182  V_ex = E1*E2*E3*E4;
183  Vh_ex = V_ex/2;
184 
185 }
186 
187 void dw_setDims(int *X, const int L5)
188 {
189  V = 1;
190  for (int d = 0; d < 4; d++) {
191  V *= X[d];
192  Z[d] = X[d];
193 
194  faceVolume[d] = 1;
195  for (int i=0; i<4; i++) {
196  if (i==d) continue;
197  faceVolume[d] *= X[i];
198  }
199  }
200  Vh = V/2;
201 
202  Ls = L5;
203  V5 = V*Ls;
204  V5h = Vh*Ls;
205 
206  Vs_t = Z[0]*Z[1]*Z[2]*Ls;//?
207  Vsh_t = Vs_t/2; //?
208 }
209 
210 
211 void setSpinorSiteSize(int n)
212 {
213  mySpinorSiteSize = n;
214 }
215 
216 
217 template <typename Float>
218 static void printVector(Float *v) {
219  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
220 }
221 
222 // X indexes the lattice site
223 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
224  if (precision == QUDA_DOUBLE_PRECISION)
225  for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
226  else
227  for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
228 }
229 
230 // X indexes the full lattice
231 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
232  if (getOddBit(X) == 0) {
233  if (precision == QUDA_DOUBLE_PRECISION)
234  for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
235  else
236  for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
237 
238  } else {
239  if (precision == QUDA_DOUBLE_PRECISION)
240  for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
241  else
242  for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
243  }
244 }
245 
246 // returns 0 or 1 if the full lattice index X is even or odd
247 int getOddBit(int Y) {
248  int x4 = Y/(Z[2]*Z[1]*Z[0]);
249  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
250  int x2 = (Y/Z[0]) % Z[1];
251  int x1 = Y % Z[0];
252  return (x4+x3+x2+x1) % 2;
253 }
254 
255 // a+=b
256 template <typename Float>
257 inline void complexAddTo(Float *a, Float *b) {
258  a[0] += b[0];
259  a[1] += b[1];
260 }
261 
262 // a = b*c
263 template <typename Float>
264 inline void complexProduct(Float *a, Float *b, Float *c) {
265  a[0] = b[0]*c[0] - b[1]*c[1];
266  a[1] = b[0]*c[1] + b[1]*c[0];
267 }
268 
269 // a = conj(b)*conj(c)
270 template <typename Float>
271 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
272  a[0] = b[0]*c[0] - b[1]*c[1];
273  a[1] = -b[0]*c[1] - b[1]*c[0];
274 }
275 
276 // a = conj(b)*c
277 template <typename Float>
278 inline void complexDotProduct(Float *a, Float *b, Float *c) {
279  a[0] = b[0]*c[0] + b[1]*c[1];
280  a[1] = b[0]*c[1] - b[1]*c[0];
281 }
282 
283 // a += b*c
284 template <typename Float>
285 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
286  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
287  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
288 }
289 
290 // a += conj(b)*c)
291 template <typename Float>
292 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
293  a[0] += b[0]*c[0] + b[1]*c[1];
294  a[1] += b[0]*c[1] - b[1]*c[0];
295 }
296 
297 template <typename Float>
298 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
299  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
300  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
301 }
302 
303 template <typename Float>
304 inline void su3Construct12(Float *mat) {
305  Float *w = mat+12;
306  w[0] = 0.0;
307  w[1] = 0.0;
308  w[2] = 0.0;
309  w[3] = 0.0;
310  w[4] = 0.0;
311  w[5] = 0.0;
312 }
313 
314 // Stabilized Bunk and Sommer
315 template <typename Float>
316 inline void su3Construct8(Float *mat) {
317  mat[0] = atan2(mat[1], mat[0]);
318  mat[1] = atan2(mat[13], mat[12]);
319  for (int i=8; i<18; i++) mat[i] = 0.0;
320 }
321 
322 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
323  if (reconstruct == QUDA_RECONSTRUCT_12) {
324  if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
325  else su3Construct12((float*)mat);
326  } else {
327  if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
328  else su3Construct8((float*)mat);
329  }
330 }
331 
332 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
333 // as the cross product of the conjugate vectors: w = u* x v*
334 //
335 // 48 flops
336 template <typename Float>
337 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
338  Float *u = &mat[0*(3*2)];
339  Float *v = &mat[1*(3*2)];
340  Float *w = &mat[2*(3*2)];
341  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
342  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
343  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
344  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
345  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
346  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
347  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
348  Float u0 = (dir < 3 ? param->anisotropy :
349  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
350  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
351 }
352 
353 template <typename Float>
354 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
355  // First reconstruct first row
356  Float row_sum = 0.0;
357  row_sum += mat[2]*mat[2];
358  row_sum += mat[3]*mat[3];
359  row_sum += mat[4]*mat[4];
360  row_sum += mat[5]*mat[5];
361  Float u0 = (dir < 3 ? param->anisotropy :
362  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
363  Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
364 
365  mat[14] = mat[0];
366  mat[15] = mat[1];
367 
368  mat[0] = U00_mag * cos(mat[14]);
369  mat[1] = U00_mag * sin(mat[14]);
370 
371  Float column_sum = 0.0;
372  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
373  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
374  Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
375 
376  mat[12] = U20_mag * cos(mat[15]);
377  mat[13] = U20_mag * sin(mat[15]);
378 
379  // First column now restored
380 
381  // finally reconstruct last elements from SU(2) rotation
382  Float r_inv2 = 1.0/(u0*row_sum);
383 
384  // U11
385  Float A[2];
386  complexDotProduct(A, mat+0, mat+6);
387  complexConjugateProduct(mat+8, mat+12, mat+4);
388  accumulateComplexProduct(mat+8, A, mat+2, u0);
389  mat[8] *= -r_inv2;
390  mat[9] *= -r_inv2;
391 
392  // U12
393  complexConjugateProduct(mat+10, mat+12, mat+2);
394  accumulateComplexProduct(mat+10, A, mat+4, -u0);
395  mat[10] *= r_inv2;
396  mat[11] *= r_inv2;
397 
398  // U21
399  complexDotProduct(A, mat+0, mat+12);
400  complexConjugateProduct(mat+14, mat+6, mat+4);
401  accumulateComplexProduct(mat+14, A, mat+2, -u0);
402  mat[14] *= r_inv2;
403  mat[15] *= r_inv2;
404 
405  // U12
406  complexConjugateProduct(mat+16, mat+6, mat+2);
407  accumulateComplexProduct(mat+16, A, mat+4, u0);
408  mat[16] *= -r_inv2;
409  mat[17] *= -r_inv2;
410 }
411 
412 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
413  if (reconstruct == QUDA_RECONSTRUCT_12) {
414  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
415  else su3Reconstruct12((float*)mat, dir, ga_idx, param);
416  } else {
417  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
418  else su3Reconstruct8((float*)mat, dir, ga_idx, param);
419  }
420 }
421 
422 template <typename Float>
423 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
424  for (int i = 0; i < len; i++) {
425  double diff = fabs(a[i] - b[i]);
426  if (diff > epsilon) {
427  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
428  return 0;
429  }
430  }
431  return 1;
432 }
433 
434 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
435  if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
436  else return compareFloats((float*)a, (float*)b, len, epsilon);
437 }
438 
439 int fullLatticeIndex(int dim[4], int index, int oddBit){
440 
441  int za = index/(dim[0]>>1);
442  int zb = za/dim[1];
443  int x2 = za - zb*dim[1];
444  int x4 = zb/dim[2];
445  int x3 = zb - x4*dim[2];
446 
447  return 2*index + ((x2 + x3 + x4 + oddBit) & 1);
448 }
449 
450 // given a "half index" i into either an even or odd half lattice (corresponding
451 // to oddBit = {0, 1}), returns the corresponding full lattice index.
452 int fullLatticeIndex(int i, int oddBit) {
453  /*
454  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
455  return 2*i + (boundaryCrossings + oddBit) % 2;
456  */
457 
458  int X1 = Z[0];
459  int X2 = Z[1];
460  int X3 = Z[2];
461  //int X4 = Z[3];
462  int X1h =X1/2;
463 
464  int sid =i;
465  int za = sid/X1h;
466  //int x1h = sid - za*X1h;
467  int zb = za/X2;
468  int x2 = za - zb*X2;
469  int x4 = zb/X3;
470  int x3 = zb - x4*X3;
471  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
472  //int x1 = 2*x1h + x1odd;
473  int X = 2 * sid + x1odd;
474 
475  return X;
476 }
477 
478 // i represents a "half index" into an even or odd "half lattice".
479 // when oddBit={0,1} the half lattice is {even,odd}.
480 //
481 // the displacements, such as dx, refer to the full lattice coordinates.
482 //
483 // neighborIndex() takes a "half index", displaces it, and returns the
484 // new "half index", which can be an index into either the even or odd lattices.
485 // displacements of magnitude one always interchange odd and even lattices.
486 //
487 
488 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
489  int Y = fullLatticeIndex(i, oddBit);
490  int x4 = Y/(Z[2]*Z[1]*Z[0]);
491  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
492  int x2 = (Y/Z[0]) % Z[1];
493  int x1 = Y % Z[0];
494 
495  // assert (oddBit == (x+y+z+t)%2);
496 
497  x4 = (x4+dx4+Z[3]) % Z[3];
498  x3 = (x3+dx3+Z[2]) % Z[2];
499  x2 = (x2+dx2+Z[1]) % Z[1];
500  x1 = (x1+dx1+Z[0]) % Z[0];
501 
502  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
503 }
504 
505 
506 int neighborIndex(int dim[4], int index, int oddBit, int dx[4]){
507 
508  const int fullIndex = fullLatticeIndex(dim, index, oddBit);
509 
510  int x[4];
511  x[3] = fullIndex/(dim[2]*dim[1]*dim[0]);
512  x[2] = (fullIndex/(dim[1]*dim[0])) % dim[2];
513  x[1] = (fullIndex/dim[0]) % dim[1];
514  x[0] = fullIndex % dim[0];
515 
516  for(int dir=0; dir<4; ++dir)
517  x[dir] = (x[dir]+dx[dir]+dim[dir]) % dim[dir];
518 
519  return (((x[3]*dim[2] + x[2])*dim[1] + x[1])*dim[0] + x[0])/2;
520 }
521 
522 int
523 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
524 {
525  int ret;
526 
527  int Y = fullLatticeIndex(i, oddBit);
528  int x4 = Y/(Z[2]*Z[1]*Z[0]);
529  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
530  int x2 = (Y/Z[0]) % Z[1];
531  int x1 = Y % Z[0];
532 
533  int ghost_x4 = x4+ dx4;
534 
535  // assert (oddBit == (x+y+z+t)%2);
536 
537  x4 = (x4+dx4+Z[3]) % Z[3];
538  x3 = (x3+dx3+Z[2]) % Z[2];
539  x2 = (x2+dx2+Z[1]) % Z[1];
540  x1 = (x1+dx1+Z[0]) % Z[0];
541 
542  if ( (ghost_x4 >= 0 && ghost_x4 < Z[3]) || !comm_dim_partitioned(3)){
543  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
544  }else{
545  ret = (x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
546  }
547 
548  return ret;
549 }
550 
551 /*
552  * This is a computation of neighbor using the full index and the displacement in each direction
553  *
554  */
555 
556 int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
557 {
558  int oddBit = 0;
559  int half_idx = i;
560  if (i >= Vh){
561  oddBit =1;
562  half_idx = i - Vh;
563  }
564 
565  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
566  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
567  if (oddBitChanged){
568  oddBit = 1 - oddBit;
569  }
570  int ret = nbr_half_idx;
571  if (oddBit){
572  ret = Vh + nbr_half_idx;
573  }
574 
575  return ret;
576 }
577 
578 int
579 neighborIndexFullLattice(int dim[4], int index, int dx[4])
580 {
581  const int volume = dim[0]*dim[1]*dim[2]*dim[3];
582  const int halfVolume = volume/2;
583  int oddBit = 0;
584  int halfIndex = index;
585 
586  if(index >= halfVolume){
587  oddBit = 1;
588  halfIndex = index - halfVolume;
589  }
590 
591  int neighborHalfIndex = neighborIndex(dim, halfIndex, oddBit, dx);
592 
593  int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
594  if(oddBitChanged){
595  oddBit = 1 - oddBit;
596  }
597 
598  return neighborHalfIndex + oddBit*halfVolume;
599 }
600 
601 int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
602 {
603  int ret;
604  int oddBit = 0;
605  int half_idx = i;
606  if (i >= Vh){
607  oddBit =1;
608  half_idx = i - Vh;
609  }
610 
611  int Y = fullLatticeIndex(half_idx, oddBit);
612  int x4 = Y/(Z[2]*Z[1]*Z[0]);
613  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
614  int x2 = (Y/Z[0]) % Z[1];
615  int x1 = Y % Z[0];
616  int ghost_x4 = x4+ dx4;
617 
618  x4 = (x4+dx4+Z[3]) % Z[3];
619  x3 = (x3+dx3+Z[2]) % Z[2];
620  x2 = (x2+dx2+Z[1]) % Z[1];
621  x1 = (x1+dx1+Z[0]) % Z[0];
622 
623  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
624  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
625  }else{
626  ret = (x3 * (Z[1] * Z[0]) + x2 * (Z[0]) + x1) / 2;
627  return ret;
628  }
629 
630  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
631  if (oddBitChanged){
632  oddBit = 1 - oddBit;
633  }
634 
635  if (oddBit){
636  ret += Vh;
637  }
638 
639  return ret;
640 }
641 
642 
643 // 4d checkerboard.
644 // given a "half index" i into either an even or odd half lattice (corresponding
645 // to oddBit = {0, 1}), returns the corresponding full lattice index.
646 // Cf. GPGPU code in dslash_core_ante.h.
647 // There, i is the thread index.
648 int fullLatticeIndex_4d(int i, int oddBit) {
649  if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);}
650  /*
651  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
652  return 2*i + (boundaryCrossings + oddBit) % 2;
653  */
654 
655  int X1 = Z[0];
656  int X2 = Z[1];
657  int X3 = Z[2];
658  //int X4 = Z[3];
659  int X1h =X1/2;
660 
661  int sid =i;
662  int za = sid/X1h;
663  //int x1h = sid - za*X1h;
664  int zb = za/X2;
665  int x2 = za - zb*X2;
666  int x4 = zb/X3;
667  int x3 = zb - x4*X3;
668  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
669  //int x1 = 2*x1h + x1odd;
670  int X = 2 * sid + x1odd;
671 
672  return X;
673 }
674 
675 // 5d checkerboard.
676 // given a "half index" i into either an even or odd half lattice (corresponding
677 // to oddBit = {0, 1}), returns the corresponding full lattice index.
678 // Cf. GPGPU code in dslash_core_ante.h.
679 // There, i is the thread index sid.
680 // This function is used by neighborIndex_5d in dslash_reference.cpp.
681 //ok
682 int fullLatticeIndex_5d(int i, int oddBit) {
683  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
684  return 2*i + (boundaryCrossings + oddBit) % 2;
685 }
686 
687 int fullLatticeIndex_5d_4dpc(int i, int oddBit) {
688  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
689  return 2*i + (boundaryCrossings + oddBit) % 2;
690 }
691 
693 {
694  int oddBit = 0;
695  int half_idx = i;
696  if (i >= Vh){
697  oddBit =1;
698  half_idx = i - Vh;
699  }
700 
701  int Y = fullLatticeIndex(half_idx, oddBit);
702  int x4 = Y/(Z[2]*Z[1]*Z[0]);
703 
704  return x4;
705 }
706 
707 template <typename Float>
708 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
709  // Apply spatial scaling factor (u0) to spatial links
710  for (int d = 0; d < 3; d++) {
711  for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
712  gauge[d][i] /= param->anisotropy;
713  }
714  }
715 
716  // Apply boundary conditions to temporal links
717  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) {
718  for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
719  for (int i = 0; i < gaugeSiteSize; i++) {
720  gauge[3][j*gaugeSiteSize+i] *= -1.0;
721  gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
722  }
723  }
724  }
725 
726  if (param->gauge_fix) {
727  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
728  // to simulate fixing to the temporal gauge.
729  int iMax = ( last_node_in_t() ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
730  int dir = 3; // time direction only
731  Float *even = gauge[dir];
732  Float *odd = gauge[dir]+Vh*gaugeSiteSize;
733  for (int i = 0; i< iMax; i++) {
734  for (int m = 0; m < 3; m++) {
735  for (int n = 0; n < 3; n++) {
736  even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
737  even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
738  odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
739  odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
740  }
741  }
742  }
743  }
744 }
745 
746 template <typename Float>
748 {
749  int X1h=param->X[0]/2;
750  int X1 =param->X[0];
751  int X2 =param->X[1];
752  int X3 =param->X[2];
753  int X4 =param->X[3];
754 
755  // rescale long links by the appropriate coefficient
756  if (dslash_type == QUDA_ASQTAD_DSLASH) {
757  for(int d=0; d<4; d++){
758  for(int i=0; i < V*gaugeSiteSize; i++){
759  gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
760  }
761  }
762  }
763 
764  // apply the staggered phases
765  for (int d = 0; d < 3; d++) {
766 
767  //even
768  for (int i = 0; i < Vh; i++) {
769 
770  int index = fullLatticeIndex(i, 0);
771  int i4 = index /(X3*X2*X1);
772  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
773  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
774  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
775  int sign = 1;
776 
777  if (d == 0) {
778  if (i4 % 2 == 1){
779  sign= -1;
780  }
781  }
782 
783  if (d == 1){
784  if ((i4+i1) % 2 == 1){
785  sign= -1;
786  }
787  }
788  if (d == 2){
789  if ( (i4+i1+i2) % 2 == 1){
790  sign= -1;
791  }
792  }
793 
794  for (int j=0; j < 18; j++) {
795  gauge[d][i*gaugeSiteSize + j] *= sign;
796  }
797  }
798  //odd
799  for (int i = 0; i < Vh; i++) {
800  int index = fullLatticeIndex(i, 1);
801  int i4 = index /(X3*X2*X1);
802  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
803  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
804  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
805  int sign = 1;
806 
807  if (d == 0) {
808  if (i4 % 2 == 1){
809  sign = -1;
810  }
811  }
812 
813  if (d == 1){
814  if ((i4+i1) % 2 == 1){
815  sign = -1;
816  }
817  }
818  if (d == 2){
819  if ( (i4+i1+i2) % 2 == 1){
820  sign = -1;
821  }
822  }
823 
824  for (int j=0; j<18; j++){
825  gauge[d][(Vh+i)*gaugeSiteSize + j] *= sign;
826  }
827  }
828 
829  }
830 
831  // Apply boundary conditions to temporal links
832  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) {
833  for (int j = 0; j < Vh; j++) {
834  int sign = 1;
835  if (dslash_type == QUDA_ASQTAD_DSLASH) {
836  if (j >= (X4-3)*X1h*X2*X3 ){
837  sign = -1;
838  }
839  } else {
840  if (j >= (X4-1)*X1h*X2*X3 ){
841  sign = -1;
842  }
843  }
844 
845  for (int i=0; i<18; i++) {
846  gauge[3][j*gaugeSiteSize + i] *= sign;
847  gauge[3][(Vh+j)*gaugeSiteSize + i] *= sign;
848  }
849  }
850  }
851 
852 }
853 
855  if (local_prec == QUDA_DOUBLE_PRECISION) {
856  applyGaugeFieldScaling_long((double**)gauge, Vh, param, dslash_type);
857  } else if (local_prec == QUDA_SINGLE_PRECISION) {
858  applyGaugeFieldScaling_long((float**)gauge, Vh, param, dslash_type);
859  } else {
860  errorQuda("Invalid type %d for applyGaugeFieldScaling_long\n", local_prec);
861  }
862 }
863 
864 template <typename Float>
865 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
866  Float *resOdd[4], *resEven[4];
867  for (int dir = 0; dir < 4; dir++) {
868  resEven[dir] = res[dir];
869  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
870  }
871 
872  for (int dir = 0; dir < 4; dir++) {
873  for (int i = 0; i < Vh; i++) {
874  for (int m = 0; m < 3; m++) {
875  for (int n = 0; n < 3; n++) {
876  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
877  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
878  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
879  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
880  }
881  }
882  }
883  }
884 
885  applyGaugeFieldScaling(res, Vh, param);
886 }
887 
888 // normalize the vector a
889 template <typename Float>
890 static void normalize(complex<Float> *a, int len) {
891  double sum = 0.0;
892  for (int i=0; i<len; i++) sum += norm(a[i]);
893  for (int i=0; i<len; i++) a[i] /= sqrt(sum);
894 }
895 
896 // orthogonalize vector b to vector a
897 template <typename Float>
898 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
899  complex<double> dot = 0.0;
900  for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
901  for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
902 }
903 
904 template <typename Float>
906 {
907  Float *resOdd[4], *resEven[4];
908  for (int dir = 0; dir < 4; dir++) {
909  resEven[dir] = res[dir];
910  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
911  }
912 
913  for (int dir = 0; dir < 4; dir++) {
914  for (int i = 0; i < Vh; i++) {
915  for (int m = 1; m < 3; m++) { // last 2 rows
916  for (int n = 0; n < 3; n++) { // 3 columns
917  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
918  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
919  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
920  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
921  }
922  }
923  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
924  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
925  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
926 
927  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
928  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
929  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
930 
931  {
932  Float *w = resEven[dir]+(i*3+0)*3*2;
933  Float *u = resEven[dir]+(i*3+1)*3*2;
934  Float *v = resEven[dir]+(i*3+2)*3*2;
935 
936  for (int n = 0; n < 6; n++) w[n] = 0.0;
937  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
938  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
939  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
940  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
941  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
942  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
943  }
944 
945  {
946  Float *w = resOdd[dir]+(i*3+0)*3*2;
947  Float *u = resOdd[dir]+(i*3+1)*3*2;
948  Float *v = resOdd[dir]+(i*3+2)*3*2;
949 
950  for (int n = 0; n < 6; n++) w[n] = 0.0;
951  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
952  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
953  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
954  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
955  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
956  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
957  }
958 
959  }
960  }
961 
962  if (param->type == QUDA_WILSON_LINKS) {
963  applyGaugeFieldScaling(res, Vh, param);
964  } else if (param->type == QUDA_ASQTAD_LONG_LINKS) {
966  } else if (param->type == QUDA_ASQTAD_FAT_LINKS) {
967  for (int dir = 0; dir < 4; dir++) {
968  for (int i = 0; i < Vh; i++) {
969  for (int m = 0; m < 3; m++) { // last 2 rows
970  for (int n = 0; n < 3; n++) { // 3 columns
971  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
972  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 2.0* rand() / (Float)RAND_MAX;
973  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
974  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
975  }
976  }
977  }
978  }
979  }
980 }
981 
982 template <typename Float> void constructUnitaryGaugeField(Float **res)
983 {
984  Float *resOdd[4], *resEven[4];
985  for (int dir = 0; dir < 4; dir++) {
986  resEven[dir] = res[dir];
987  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
988  }
989 
990  for (int dir = 0; dir < 4; dir++) {
991  for (int i = 0; i < Vh; i++) {
992  for (int m = 1; m < 3; m++) { // last 2 rows
993  for (int n = 0; n < 3; n++) { // 3 columns
994  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
995  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
996  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
997  resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (Float)RAND_MAX;
998  }
999  }
1000  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
1001  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
1002  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
1003 
1004  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
1005  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
1006  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
1007 
1008  {
1009  Float *w = resEven[dir]+(i*3+0)*3*2;
1010  Float *u = resEven[dir]+(i*3+1)*3*2;
1011  Float *v = resEven[dir]+(i*3+2)*3*2;
1012 
1013  for (int n = 0; n < 6; n++) w[n] = 0.0;
1014  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
1015  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
1016  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
1017  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
1018  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
1019  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
1020  }
1021 
1022  {
1023  Float *w = resOdd[dir]+(i*3+0)*3*2;
1024  Float *u = resOdd[dir]+(i*3+1)*3*2;
1025  Float *v = resOdd[dir]+(i*3+2)*3*2;
1026 
1027  for (int n = 0; n < 6; n++) w[n] = 0.0;
1028  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
1029  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
1030  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
1031  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
1032  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
1033  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
1034  }
1035  }
1036  }
1037 }
1038 
1039 template <typename Float> static void applyStaggeredScaling(Float **res, QudaGaugeParam *param, int type)
1040 {
1041 
1042  if(type == 3) applyGaugeFieldScaling_long((Float**)res, Vh, param, QUDA_STAGGERED_DSLASH);
1043 
1044  return;
1045 }
1046 
1047 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
1048  if (type == 0) {
1049  if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
1050  else constructUnitGaugeField((float**)gauge, param);
1051  } else if (type == 1) {
1052  if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
1053  else constructGaugeField((float**)gauge, param);
1054  } else {
1055  if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
1056  else
1057  applyGaugeFieldScaling((float **)gauge, Vh, param);
1058  }
1059 
1060 }
1061 
1062 void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision,
1064 {
1065  if (type == 0) {
1066  if (precision == QUDA_DOUBLE_PRECISION) {
1067  constructUnitGaugeField((double**)fatlink, param);
1068  constructUnitGaugeField((double**)longlink, param);
1069  }else {
1070  constructUnitGaugeField((float**)fatlink, param);
1071  constructUnitGaugeField((float**)longlink, param);
1072  }
1073  } else {
1074  if (precision == QUDA_DOUBLE_PRECISION) {
1075  // if doing naive staggered then set to long links so that the staggered phase is applied
1077  if(type != 3) constructGaugeField((double**)fatlink, param, dslash_type);
1078  else applyStaggeredScaling((double**)fatlink, param, type);
1079  param->type = QUDA_ASQTAD_LONG_LINKS;
1080  if (dslash_type == QUDA_ASQTAD_DSLASH)
1081  {
1082  if(type != 3) constructGaugeField((double**)longlink, param, dslash_type);
1083  else applyStaggeredScaling((double**)longlink, param, type);
1084  }
1085  }else {
1087  if(type != 3) constructGaugeField((float**)fatlink, param, dslash_type);
1088  else applyStaggeredScaling((float**)fatlink, param, type);
1089 
1090  param->type = QUDA_ASQTAD_LONG_LINKS;
1091  if (dslash_type == QUDA_ASQTAD_DSLASH) {
1092  if(type != 3) constructGaugeField((float**)longlink, param, dslash_type);
1093  else applyStaggeredScaling((float**)longlink, param, type);
1094  }
1095  }
1096 
1097  if (dslash_type == QUDA_ASQTAD_DSLASH) {
1098  // incorporate non-trivial phase into long links
1099  const double phase = (M_PI * rand()) / RAND_MAX;
1100  const complex<double> z = polar(1.0, phase);
1101  for (int dir = 0; dir < 4; ++dir) {
1102  for (int i = 0; i < V; ++i) {
1103  for (int j = 0; j < gaugeSiteSize; j += 2) {
1104  if (precision == QUDA_DOUBLE_PRECISION) {
1105  complex<double> *l = (complex<double> *)(&(((double *)longlink[dir])[i * gaugeSiteSize + j]));
1106  *l *= z;
1107  } else {
1108  complex<float> *l = (complex<float> *)(&(((float *)longlink[dir])[i * gaugeSiteSize + j]));
1109  *l *= z;
1110  }
1111  }
1112  }
1113  }
1114  }
1115 
1116  if (type == 3) return;
1117 
1118  // set all links to zero to emulate the 1-link operator (needed for host comparison)
1119  if (dslash_type == QUDA_STAGGERED_DSLASH) {
1120  for (int dir = 0; dir < 4; ++dir) {
1121  for (int i = 0; i < V; ++i) {
1122  for (int j = 0; j < gaugeSiteSize; j += 2) {
1123  if (precision == QUDA_DOUBLE_PRECISION) {
1124  ((double *)longlink[dir])[i * gaugeSiteSize + j] = 0.0;
1125  ((double *)longlink[dir])[i * gaugeSiteSize + j + 1] = 0.0;
1126  } else {
1127  ((float *)longlink[dir])[i * gaugeSiteSize + j] = 0.0;
1128  ((float *)longlink[dir])[i * gaugeSiteSize + j + 1] = 0.0;
1129  }
1130  }
1131  }
1132  }
1133  }
1134  }
1135 }
1136 
1137 template <typename Float>
1138 static void constructCloverField(Float *res, double norm, double diag) {
1139 
1140  Float c = 2.0 * norm / RAND_MAX;
1141 
1142  for(int i = 0; i < V; i++) {
1143  for (int j = 0; j < 72; j++) {
1144  res[i*72 + j] = c*rand() - norm;
1145  }
1146 
1147  //impose clover symmetry on each chiral block
1148  for (int ch=0; ch<2; ch++) {
1149  res[i*72 + 3 + 36*ch] = -res[i*72 + 0 + 36*ch];
1150  res[i*72 + 4 + 36*ch] = -res[i*72 + 1 + 36*ch];
1151  res[i*72 + 5 + 36*ch] = -res[i*72 + 2 + 36*ch];
1152  res[i*72 + 30 + 36*ch] = -res[i*72 + 6 + 36*ch];
1153  res[i*72 + 31 + 36*ch] = -res[i*72 + 7 + 36*ch];
1154  res[i*72 + 32 + 36*ch] = -res[i*72 + 8 + 36*ch];
1155  res[i*72 + 33 + 36*ch] = -res[i*72 + 9 + 36*ch];
1156  res[i*72 + 34 + 36*ch] = -res[i*72 + 16 + 36*ch];
1157  res[i*72 + 35 + 36*ch] = -res[i*72 + 17 + 36*ch];
1158  }
1159 
1160  for (int j = 0; j<6; j++) {
1161  res[i*72 + j] += diag;
1162  res[i*72 + j+36] += diag;
1163  }
1164  }
1165 }
1166 
1167 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
1168 
1169  if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
1170  else constructCloverField((float *)clover, norm, diag);
1171 }
1172 
1173 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
1174  printf("Reference:\n");
1175  printSpinorElement(spinorRef, 0, prec); printf("...\n");
1176  printSpinorElement(spinorRef, len-1, prec); printf("\n");
1177 
1178  printf("\nCUDA:\n");
1179  printSpinorElement(spinorGPU, 0, prec); printf("...\n");
1180  printSpinorElement(spinorGPU, len-1, prec); printf("\n");
1181 
1182  compare_spinor(spinorRef, spinorGPU, len, prec);
1183  }*/
1184 
1185 template <typename Float>
1186 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
1187 
1188  const int fail_check = 17;
1189  int fail[4][fail_check];
1190  int iter[4][18];
1191  for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
1192  for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
1193 
1194  for (int d=0; d<4; d++) {
1195  for (int eo=0; eo<2; eo++) {
1196  for (int i=0; i<Vh; i++) {
1197  int ga_idx = (eo*Vh+i);
1198  for (int j=0; j<18; j++) {
1199  double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1200 
1201  for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1202  if (diff > epsilon) iter[d][j]++;
1203  }
1204  }
1205  }
1206  }
1207 
1208  printf("Component fails (X, Y, Z, T)\n");
1209  for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1210 
1211  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1212  for (int f=0; f<fail_check; f++) {
1213  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n", pow(10.0, -(f + 1)), fail[0][f],
1214  fail[1][f], fail[2][f], fail[3][f], fail[0][f] / (double)(V * 18), fail[1][f] / (double)(V * 18),
1215  fail[2][f] / (double)(V * 18), fail[3][f] / (double)(V * 18));
1216  }
1217 
1218 }
1219 
1220 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
1221  if (precision == QUDA_DOUBLE_PRECISION)
1222  checkGauge((double**)oldG, (double**)newG, epsilon);
1223  else
1224  checkGauge((float**)oldG, (float**)newG, epsilon);
1225 }
1226 
1227 void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
1228 {
1229 
1230  if (precision == QUDA_DOUBLE_PRECISION) {
1231  constructUnitaryGaugeField((double**)link);
1232  }else {
1233  constructUnitaryGaugeField((float**)link);
1234  }
1235 
1236  if(phase){
1237 
1238  for(int i=0;i < V;i++){
1239  for(int dir =XUP; dir <= TUP; dir++){
1240  int idx = i;
1241  int oddBit =0;
1242  if (i >= Vh) {
1243  idx = i - Vh;
1244  oddBit = 1;
1245  }
1246 
1247  int X1 = Z[0];
1248  int X2 = Z[1];
1249  int X3 = Z[2];
1250  int X4 = Z[3];
1251 
1252  int full_idx = fullLatticeIndex(idx, oddBit);
1253  int i4 = full_idx /(X3*X2*X1);
1254  int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
1255  int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1256  int i1 = full_idx - i4 * (X3 * X2 * X1) - i3 * (X2 * X1) - i2 * X1;
1257 
1258  double coeff= 1.0;
1259  switch(dir){
1260  case XUP:
1261  if ( (i4 & 1) != 0){
1262  coeff *= -1;
1263  }
1264  break;
1265 
1266  case YUP:
1267  if ( ((i4+i1) & 1) != 0){
1268  coeff *= -1;
1269  }
1270  break;
1271 
1272  case ZUP:
1273  if ( ((i4+i1+i2) & 1) != 0){
1274  coeff *= -1;
1275  }
1276  break;
1277 
1278  case TUP:
1279  if (last_node_in_t() && i4 == (X4-1)){
1280  coeff *= -1;
1281  }
1282  break;
1283 
1284  default:
1285  printf("ERROR: wrong dir(%d)\n", dir);
1286  exit(1);
1287  }
1288 
1289  if (precision == QUDA_DOUBLE_PRECISION){
1290  //double* mylink = (double*)link;
1291  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1292  double* mylink = (double*)link[dir];
1293  mylink = mylink + i*gaugeSiteSize;
1294 
1295  mylink[12] *= coeff;
1296  mylink[13] *= coeff;
1297  mylink[14] *= coeff;
1298  mylink[15] *= coeff;
1299  mylink[16] *= coeff;
1300  mylink[17] *= coeff;
1301 
1302  }else{
1303  //float* mylink = (float*)link;
1304  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1305  float* mylink = (float*)link[dir];
1306  mylink = mylink + i*gaugeSiteSize;
1307 
1308  mylink[12] *= coeff;
1309  mylink[13] *= coeff;
1310  mylink[14] *= coeff;
1311  mylink[15] *= coeff;
1312  mylink[16] *= coeff;
1313  mylink[17] *= coeff;
1314  }
1315  }
1316  }
1317  }
1318 
1319 #if 1
1320  for(int dir= 0;dir < 4;dir++){
1321  for(int i=0;i< V*gaugeSiteSize;i++){
1322  if (precision ==QUDA_SINGLE_PRECISION){
1323  float* f = (float*)link[dir];
1324  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1325  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1326  exit(1);
1327  }
1328  }else{
1329  double* f = (double*)link[dir];
1330  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1331  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1332  exit(1);
1333  }
1334  }
1335  }
1336  }
1337 #endif
1338 
1339  return;
1340 }
1341 
1342 void construct_spinor_source(void *v, int nSpin, int nColor, QudaPrecision precision, const int *const x, quda::RNG &rng)
1343 {
1345  param.v = v;
1346  param.nColor = nColor;
1347  param.nSpin = nSpin;
1348  param.setPrecision(precision);
1351  param.nDim = 4;
1354  for (int d = 0; d < 4; d++) param.x[d] = x[d];
1355  quda::cpuColorSpinorField spinor_in(param);
1356 
1357  quda::spinorNoise(spinor_in, rng, QUDA_NOISE_UNIFORM);
1358 }
1359 
1360 template <typename Float>
1361 int compareLink(Float **linkA, Float **linkB, int len) {
1362  const int fail_check = 16;
1363  int fail[fail_check];
1364  for (int f=0; f<fail_check; f++) fail[f] = 0;
1365 
1366  int iter[18];
1367  for (int i=0; i<18; i++) iter[i] = 0;
1368 
1369  for(int dir=0;dir < 4; dir++){
1370  for (int i=0; i<len; i++) {
1371  for (int j=0; j<18; j++) {
1372  int is = i*18+j;
1373  double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1374  for (int f=0; f<fail_check; f++)
1375  if (diff > pow(10.0,-(f+1))) fail[f]++;
1376  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1377  if (diff > 1e-3) iter[j]++;
1378  }
1379  }
1380  }
1381 
1382  for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1383 
1384  int accuracy_level = 0;
1385  for(int f =0; f < fail_check; f++){
1386  if(fail[f] == 0){
1387  accuracy_level =f;
1388  }
1389  }
1390 
1391  for (int f=0; f<fail_check; f++) {
1392  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (double)(4*len*18));
1393  }
1394 
1395  return accuracy_level;
1396 }
1397 
1398 static int
1399 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1400 {
1401  int ret;
1402 
1403  if (precision == QUDA_DOUBLE_PRECISION) {
1404  ret = compareLink((double**)linkA, (double**)linkB, len);
1405  } else {
1406  ret = compareLink((float**)linkA, (float**)linkB, len);
1407  }
1408 
1409  return ret;
1410 }
1411 
1412 
1413 // X indexes the lattice site
1414 static void printLinkElement(void *link, int X, QudaPrecision precision)
1415 {
1416  if (precision == QUDA_DOUBLE_PRECISION){
1417  for(int i=0; i < 3;i++){
1418  printVector((double*)link+ X*gaugeSiteSize + i*6);
1419  }
1420 
1421  }
1422  else{
1423  for(int i=0;i < 3;i++){
1424  printVector((float*)link+X*gaugeSiteSize + i*6);
1425  }
1426  }
1427 }
1428 
1429 int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
1430 {
1431  printfQuda("%s\n", msgA);
1432  printLinkElement(linkA[0], 0, prec);
1433  printfQuda("\n");
1434  printLinkElement(linkA[0], 1, prec);
1435  printfQuda("...\n");
1436  printLinkElement(linkA[3], len - 1, prec);
1437  printfQuda("\n");
1438 
1439  printfQuda("\n%s\n", msgB);
1440  printLinkElement(linkB[0], 0, prec);
1441  printfQuda("\n");
1442  printLinkElement(linkB[0], 1, prec);
1443  printfQuda("...\n");
1444  printLinkElement(linkB[3], len - 1, prec);
1445  printfQuda("\n");
1446 
1447  int ret = compare_link(linkA, linkB, len, prec);
1448  return ret;
1449 }
1450 
1451 void createMomCPU(void *mom, QudaPrecision precision)
1452 {
1453  void* temp;
1454 
1455  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1456  temp = malloc(4*V*gaugeSiteSize*gSize);
1457  if (temp == NULL){
1458  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1459  exit(1);
1460  }
1461 
1462  for(int i=0;i < V;i++){
1463  if (precision == QUDA_DOUBLE_PRECISION){
1464  for(int dir=0;dir < 4;dir++){
1465  double *thismom = (double *)mom;
1466  for(int k=0; k < momSiteSize; k++){
1467  thismom[(4 * i + dir) * momSiteSize + k] = 1.0 * rand() / RAND_MAX;
1468  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1469  }
1470  }
1471  }else{
1472  for(int dir=0;dir < 4;dir++){
1473  float* thismom=(float*)mom;
1474  for(int k=0; k < momSiteSize; k++){
1475  thismom[(4 * i + dir) * momSiteSize + k] = 1.0 * rand() / RAND_MAX;
1476  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1477  }
1478  }
1479  }
1480  }
1481 
1482  free(temp);
1483  return;
1484 }
1485 
1486 void
1487 createHwCPU(void* hw, QudaPrecision precision)
1488 {
1489  for(int i=0;i < V;i++){
1490  if (precision == QUDA_DOUBLE_PRECISION){
1491  for(int dir=0;dir < 4;dir++){
1492  double* thishw = (double*)hw;
1493  for(int k=0; k < hwSiteSize; k++){
1494  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1495  }
1496  }
1497  }else{
1498  for(int dir=0;dir < 4;dir++){
1499  float* thishw=(float*)hw;
1500  for(int k=0; k < hwSiteSize; k++){
1501  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1502  }
1503  }
1504  }
1505  }
1506 
1507  return;
1508 }
1509 
1510 
1511 template <typename Float>
1512 int compare_mom(Float *momA, Float *momB, int len) {
1513  const int fail_check = 16;
1514  int fail[fail_check];
1515  for (int f=0; f<fail_check; f++) fail[f] = 0;
1516 
1517  int iter[momSiteSize];
1518  for (int i=0; i<momSiteSize; i++) iter[i] = 0;
1519 
1520  for (int i=0; i<len; i++) {
1521  for (int j=0; j<momSiteSize-1; j++) {
1522  int is = i*momSiteSize+j;
1523  double diff = fabs(momA[is]-momB[is]);
1524  for (int f=0; f<fail_check; f++)
1525  if (diff > pow(10.0,-(f+1))) fail[f]++;
1526  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1527  if (diff > 1e-3) iter[j]++;
1528  }
1529  }
1530 
1531  int accuracy_level = 0;
1532  for(int f =0; f < fail_check; f++){
1533  if(fail[f] == 0){
1534  accuracy_level =f+1;
1535  }
1536  }
1537 
1538  for (int i=0; i<momSiteSize; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1539 
1540  for (int f=0; f<fail_check; f++) {
1541  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*9, fail[f]/(double)(len*9));
1542  }
1543 
1544  return accuracy_level;
1545 }
1546 
1547 static void printMomElement(void *mom, int X, QudaPrecision precision)
1548 {
1549  if (precision == QUDA_DOUBLE_PRECISION){
1550  double* thismom = ((double*)mom)+ X*momSiteSize;
1551  printVector(thismom);
1552  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1553  }else{
1554  float* thismom = ((float*)mom)+ X*momSiteSize;
1555  printVector(thismom);
1556  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1557  }
1558 }
1559 int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
1560 {
1561  printfQuda("mom:\n");
1562  printMomElement(momA, 0, prec);
1563  printfQuda("\n");
1564  printMomElement(momA, 1, prec);
1565  printfQuda("\n");
1566  printMomElement(momA, 2, prec);
1567  printfQuda("\n");
1568  printMomElement(momA, 3, prec);
1569  printfQuda("...\n");
1570 
1571  printfQuda("\nreference mom:\n");
1572  printMomElement(momB, 0, prec);
1573  printfQuda("\n");
1574  printMomElement(momB, 1, prec);
1575  printfQuda("\n");
1576  printMomElement(momB, 2, prec);
1577  printfQuda("\n");
1578  printMomElement(momB, 3, prec);
1579  printfQuda("\n");
1580 
1581  int ret;
1582  if (prec == QUDA_DOUBLE_PRECISION){
1583  ret = compare_mom((double*)momA, (double*)momB, len);
1584  }else{
1585  ret = compare_mom((float*)momA, (float*)momB, len);
1586  }
1587 
1588  return ret;
1589 }
1590 
1591 /************
1592  * return value
1593  *
1594  * 0: command line option matched and processed sucessfully
1595  * non-zero: command line option does not match
1596  *
1597  */
1598 
1599 #ifdef MULTI_GPU
1600 int device = -1;
1601 #else
1602 int device = 0;
1603 #endif
1604 
1615 int xdim = 24;
1616 int ydim = 24;
1617 int zdim = 24;
1618 int tdim = 24;
1619 int Lsdim = 16;
1622 int laplace3D = 4;
1623 char latfile[256] = "";
1624 bool unit_gauge = false;
1625 double gaussian_sigma = 0.2;
1626 char gauge_outfile[256] = "";
1627 int Nsrc = 1;
1628 int Msrc = 1;
1629 int niter = 100;
1630 int gcrNkrylov = 10;
1632 double ca_lambda_min = 0.0;
1633 double ca_lambda_max = -1.0;
1634 int pipeline = 0;
1636 int test_type = 0;
1642 int multishift = 0;
1643 bool verify_results = true;
1644 bool low_mode_check = false;
1645 bool oblique_proj_check = false;
1646 double mass = 0.1;
1647 double kappa = -1.0;
1648 double mu = 0.1;
1649 double epsilon = 0.01;
1650 double anisotropy = 1.0;
1651 double tadpole_factor = 1.0;
1652 double eps_naik = 0.0;
1653 double clover_coeff = 0.1;
1654 bool compute_clover = false;
1655 bool compute_fatlong = false;
1656 double tol = 1e-7;
1657 double tol_hq = 0.;
1658 double reliable_delta = 0.1;
1665 
1666 int mg_levels = 2;
1667 
1670 
1688 bool pre_orthonormalize = false;
1690 double omega = 0.85;
1705 
1707 int nev = 8;
1710 double tol_restart = 5e+3*tol;
1711 
1714 double inc_tol = 1e-2;
1715 double eigenval_tol = 1e-1;
1716 
1721 
1722 // Parameters for the stand alone eigensolver
1723 int eig_nEv = 16;
1724 int eig_nKr = 32;
1725 int eig_nConv = -1; // If unchanged, will be set to nEv
1728 int eig_max_restarts = 1000;
1729 double eig_tol = 1e-6;
1730 bool eig_use_poly_acc = true;
1731 int eig_poly_deg = 100;
1732 double eig_amin = 0.1;
1733 double eig_amax = 4.0;
1734 bool eig_use_normop = true;
1735 bool eig_use_dagger = false;
1736 bool eig_compute_svd = false;
1739 bool eig_arpack_check = false;
1740 char eig_arpack_logfile[256] = "arpack_logfile.log";
1741 char eig_QUDA_logfile[256] = "QUDA_logfile.log";
1742 char eig_vec_infile[256] = "";
1743 char eig_vec_outfile[256] = "";
1744 
1745 // Parameters for the MG eigensolver.
1746 // The coarsest grid params are for deflation,
1747 // all others are for PR vectors.
1763 bool mg_eig_coarse_guess = false;
1764 
1765 double heatbath_beta_value = 6.2;
1770 bool heatbath_coldstart = false;
1771 
1773 
1774 static int dim_partitioned[4] = {0,0,0,0};
1775 
1776 int dimPartitioned(int dim)
1777 {
1778  return ((gridsize_from_cmdline[dim] > 1) || dim_partitioned[dim]);
1779 }
1780 
1781 void __attribute__((weak)) usage_extra(char** argv){};
1782 
1783 void usage(char** argv )
1784 {
1785  printf("Usage: %s [options]\n", argv[0]);
1786  printf("Common options: \n");
1787 #ifndef MULTI_GPU
1788  printf(" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1789 #endif
1790 
1791  // Problem size and type parameters
1792  printf(" --verbosity <silent/summurize/verbose> # The the verbosity on the top level of QUDA( default summarize)\n");
1793  printf(" --prec <double/single/half> # Precision in GPU\n");
1794  printf(" --prec-sloppy <double/single/half> # Sloppy precision in GPU\n");
1795  printf(" --prec-refine <double/single/half> # Sloppy precision for refinement in GPU\n");
1796  printf(" --prec-precondition <double/single/half> # Preconditioner precision in GPU\n");
1797  printf(" --prec-ritz <double/single/half> # Eigenvector precision in GPU\n");
1798  printf(" --recon <8/9/12/13/18> # Link reconstruction type\n");
1799  printf(" --recon-sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1800  printf(" --recon-precondition <8/9/12/13/18> # Preconditioner link reconstruction type\n");
1801  printf(" --dagger # Set the dagger to 1 (default 0)\n");
1802  printf(" --dim <n> # Set space-time dimension (X Y Z T)\n");
1803  printf(" --sdim <n> # Set space dimension(X/Y/Z) size\n");
1804  printf(" --xdim <n> # Set X dimension size(default 24)\n");
1805  printf(" --ydim <n> # Set X dimension size(default 24)\n");
1806  printf(" --zdim <n> # Set X dimension size(default 24)\n");
1807  printf(" --tdim <n> # Set T dimension size(default 24)\n");
1808  printf(" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1809  printf(" --gridsize <x y z t> # Set the grid size in all four dimension (default 1 1 1 1)\n");
1810  printf(" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1811  printf(" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1812  printf(" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1813  printf(" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1814  printf(" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1815  printf(" --rank-order <col/row> # Set the [t][z][y][x] rank order as either column major (t fastest, default) or row major (x fastest)\n");
1816  printf(" --dslash-type <type> # Set the dslash type, the following values are valid\n"
1817  " wilson/clover/twisted-mass/twisted-clover/staggered\n"
1818  " /asqtad/domain-wall/domain-wall-4d/mobius/laplace\n");
1819  printf(" --laplace3D <n> # Restrict laplace operator to omit the t dimension (n=3), or include all dims (n=4) (default 4)\n");
1820  printf(" --flavor <type> # Set the twisted mass flavor type (singlet (default), deg-doublet, nondeg-doublet)\n");
1821  printf(" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1822  printf(" --save-gauge file # Save gauge field \"file\" for the test (requires QIO, "
1823  "heatbath test only)\n");
1824  printf(" --unit-gauge <true/false> # Generate a unit valued gauge field in the tests. If false, a "
1825  "random gauge is generated (default false)\n");
1826  printf(" --gaussian-sigma <sigma> # Width of the Gaussian noise used for random gauge field "
1827  "contruction (default 0.2)\n");
1828  printf(" --niter <n> # The number of iterations to perform (default 10)\n");
1829  printf(" --ngcrkrylov <n> # The number of inner iterations to use for GCR, BiCGstab-l, CA-CG (default 10)\n");
1830  printf(" --ca-basis-type <power/chebyshev> # The basis to use for CA-CG (default power)\n");
1831  printf(" --cheby-basis-eig-min # Conservative estimate of smallest eigenvalue for Chebyshev basis CA-CG (default 0)\n");
1832  printf(" --cheby-basis-eig-max # Conservative estimate of largest eigenvalue for Chebyshev basis CA-CG (default is to guess with power iterations)\n");
1833  printf(" --pipeline <n> # The pipeline length for fused operations in GCR, BiCGstab-l (default 0, no pipelining)\n");
1834  printf(" --solution-pipeline <n> # The pipeline length for fused solution accumulation (default 0, no pipelining)\n");
1835  printf(" --inv-type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1836  printf(" --precon-type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified)).\n");
1837  printf(" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1838  printf(" --mass # Mass of Dirac operator (default 0.1)\n");
1839  printf(" --kappa # Kappa of Dirac operator (default 0.12195122... [equiv to mass])\n");
1840  printf(" --mu # Twisted-Mass chiral twist of Dirac operator (default 0.1)\n");
1841  printf(
1842  " --epsilon # Twisted-Mass flavor twist of Dirac operator (default 0.01)\n");
1843  printf(" --tadpole-coeff # Tadpole coefficient for HISQ fermions (default 1.0, recommended [Plaq]^1/4)\n");
1844  printf(" --epsilon-naik # Epsilon factor on Naik term (default 0.0, suggested non-zero -0.1)\n");
1845  printf(" --compute-clover # Compute the clover field or use random numbers (default false)\n");
1846  printf(" --compute-fat-long # Compute the fat/long field or use random numbers (default false)\n");
1847  printf(" --clover-coeff # Clover coefficient (default 1.0)\n");
1848  printf(" --anisotropy # Temporal anisotropy factor (default 1.0)\n");
1849  printf(" --mass-normalization # Mass normalization (kappa (default) / mass / asym-mass)\n");
1850  printf(" --matpc # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n");
1851  printf(" --solve-type # The type of solve to do (direct, direct-pc, normop, "
1852  "normop-pc, normerr, normerr-pc) \n");
1853  printf(" --solution-type # The solution we desire (mat (default), mat-dag-mat, mat-pc, "
1854  "mat-pc-dag-mat-pc (default for multi-shift))\n");
1855  printf(" --tol <resid_tol> # Set L2 residual tolerance\n");
1856  printf(" --tolhq <resid_hq_tol> # Set heavy-quark residual tolerance\n");
1857  printf(" --reliable-delta <delta> # Set reliable update delta factor\n");
1858  printf(" --test # Test method (different for each test)\n");
1859  printf(" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1860 
1861  // Multigrid
1862  printf(" --mg-low-mode-check <true/false> # Measure how well the null vector subspace overlaps with the "
1863  "low eigenmode subspace (default false)\n");
1864  printf(" --mg-oblique-proj-check <true/false> # Measure how well the null vector subspace adjusts the low "
1865  "eigenmode subspace (default false)\n");
1866  printf(" --mg-nvec <level nvec> # Number of null-space vectors to define the multigrid "
1867  "transfer operator on a given level\n"
1868  " If using the eigensolver of the coarsest level then this "
1869  "dictates the size of the deflation space.\n");
1870  printf(" --mg-gpu-prolongate <true/false> # Whether to do the multigrid transfer operators on the GPU (default false)\n");
1871  printf(" --mg-levels <2+> # The number of multigrid levels to do (default 2)\n");
1872  printf(" --mg-nu-pre <level 1-20> # The number of pre-smoother applications to do at a given multigrid level (default 2)\n");
1873  printf(" --mg-nu-post <level 1-20> # The number of post-smoother applications to do at a given multigrid level (default 2)\n");
1874  printf(" --mg-coarse-solve-type <level solve> # The type of solve to do on each level (direct, direct-pc) (default = solve_type)\n");
1875  printf(" --mg-smoother-solve-type <level solve> # The type of solve to do in smoother (direct, direct-pc (default) )\n");
1876  printf(" --mg-solve-location <level cpu/cuda> # The location where the multigrid solver will run (default cuda)\n");
1877  printf(" --mg-setup-location <level cpu/cuda> # The location where the multigrid setup will be computed (default cuda)\n");
1878  printf(" --mg-setup-inv <level inv> # The inverter to use for the setup of multigrid (default bicgstab)\n");
1879  printf(" --mg-setup-maxiter <level iter> # The maximum number of solver iterations to use when relaxing on a null space vector (default 500)\n");
1880  printf(" --mg-setup-maxiter-refresh <level iter> # The maximum number of solver iterations to use when refreshing the pre-existing null space vectors (default 100)\n");
1881  printf(" --mg-setup-iters <level iter> # The number of setup iterations to use for the multigrid (default 1)\n");
1882  printf(" --mg-setup-tol <level tol> # The tolerance to use for the setup of multigrid (default 5e-6)\n");
1883  printf(" --mg-setup-ca-basis-type <level power/chebyshev> # The basis to use for CA-CG setup of multigrid(default power)\n");
1884  printf(" --mg-setup-ca-basis-size <level size> # The basis size to use for CA-CG setup of multigrid (default 4)\n");
1885  printf(" --mg-setup-cheby-basis-eig-min <level val> # Conservative estimate of smallest eigenvalue for Chebyshev basis CA-CG in setup of multigrid (default 0)\n");
1886  printf(" --mg-setup-cheby-basis-eig-max <level val> # Conservative estimate of largest eigenvalue for Chebyshev basis CA-CG in setup of multigrid (default is to guess with power iterations)\n");
1887  printf(" --mg-setup-type <null/test> # The type of setup to use for the multigrid (default null)\n");
1888  printf(" --mg-pre-orth <true/false> # If orthonormalize the vector before inverting in the setup of multigrid (default false)\n");
1889  printf(" --mg-post-orth <true/false> # If orthonormalize the vector after inverting in the setup of multigrid (default true)\n");
1890  printf(" --mg-n-block-ortho <level n> # The number of times to run Gram-Schmidt during block "
1891  "orthonormalization (default 1)\n");
1892  printf(" --mg-omega # The over/under relaxation factor for the smoother of multigrid (default 0.85)\n");
1893  printf(" --mg-coarse-solver <level gcr/etc.> # The solver to wrap the V cycle on each level (default gcr, only for levels 1+)\n");
1894  printf(" --mg-coarse-solver-tol <level gcr/etc.> # The coarse solver tolerance for each level (default 0.25, only for levels 1+)\n");
1895  printf(" --mg-coarse-solver-maxiter <level n> # The coarse solver maxiter for each level (default 100)\n");
1896  printf(" --mg-coarse-solver-ca-basis-type <level power/chebyshev> # The basis to use for CA-CG setup of multigrid(default power)\n");
1897  printf(" --mg-coarse-solver-ca-basis-size <level size> # The basis size to use for CA-CG setup of multigrid (default 4)\n");
1898  printf(" --mg-coarse-solver-cheby-basis-eig-min <level val> # Conservative estimate of smallest eigenvalue for Chebyshev basis CA-CG in setup of multigrid (default 0)\n");
1899  printf(" --mg-coarse-solver-cheby-basis-eig-max <level val> # Conservative estimate of largest eigenvalue for Chebyshev basis CA-CG in setup of multigrid (default is to guess with power iterations)\n");
1900  printf(" --mg-smoother <level mr/etc.> # The smoother to use for multigrid (default mr)\n");
1901  printf(" --mg-smoother-tol <level resid_tol> # The smoother tolerance to use for each multigrid (default 0.25)\n");
1902  printf(" --mg-smoother-halo-prec # The smoother halo precision (applies to all levels - defaults to null_precision)\n");
1903  printf(" --mg-schwarz-type <level false/add/mul> # Whether to use Schwarz preconditioning (requires MR smoother and GCR setup solver) (default false)\n");
1904  printf(" --mg-schwarz-cycle <level cycle> # The number of Schwarz cycles to apply per smoother application (default=1)\n");
1905  printf(" --mg-block-size <level x y z t> # Set the geometric block size for the each multigrid level's transfer operator (default 4 4 4 4)\n");
1906  printf(" --mg-mu-factor <level factor> # Set the multiplicative factor for the twisted mass mu parameter on each level (default 1)\n");
1907  printf(" --mg-generate-nullspace <true/false> # Generate the null-space vector dynamically (default true, if set false and mg-load-vec isn't set, creates free-field null vectors)\n");
1908  printf(" --mg-generate-all-levels <true/talse> # true=generate null-space on all levels, false=generate on level 0 and create other levels from that (default true)\n");
1909  printf(" --mg-load-vec <level file> # Load the vectors \"file\" for the multigrid_test (requires "
1910  "QIO)\n");
1911  printf(" --mg-save-vec <level file> # Save the generated null-space vectors \"file\" from the "
1912  "multigrid_test (requires QIO)\n");
1913  printf(" --mg-verbosity <level verb> # The verbosity to use on each level of the multigrid (default "
1914  "summarize)\n");
1915 
1916  // Deflated solvers
1917  printf(" --df-nev <nev> # Set number of eigenvectors computed within a single solve cycle (default 8)\n");
1918  printf(" --df-max-search-dim <dim> # Set the size of eigenvector search space (default 64)\n");
1919  printf(" --df-deflation-grid <n> # Set maximum number of cycles needed to compute eigenvectors(default 1)\n");
1920  printf(" --df-eigcg-max-restarts <n> # Set how many iterative refinement cycles will be solved with eigCG within a single physical right hand site solve (default 4)\n");
1921  printf(" --df-tol-restart <tol> # Set tolerance for the first restart in the initCG solver(default 5e-5)\n");
1922  printf(" --df-tol-inc <tol> # Set tolerance for the subsequent restarts in the initCG solver (default 1e-2)\n");
1923  printf(" --df-max-restart-num <n> # Set maximum number of the initCG restarts in the deflation stage (default 3)\n");
1924  printf(" --df-tol-eigenval <tol> # Set maximum eigenvalue residual norm (default 1e-1)\n");
1925 
1926  printf(" --solver-ext-lib-type <eigen/magma> # Set external library for the solvers (default Eigen library)\n");
1927  printf(" --df-ext-lib-type <eigen/magma> # Set external library for the deflation methods (default Eigen library)\n");
1928  printf(" --df-location-ritz <host/cuda> # Set memory location for the ritz vectors (default cuda memory location)\n");
1929  printf(" --df-mem-type-ritz <device/pinned/mapped> # Set memory type for the ritz vectors (default device memory type)\n");
1930 
1931  // Eigensolver
1932  printf(" --eig-nEv <n> # The size of eigenvector search space in the eigensolver\n");
1933  printf(" --eig-nKr <n> # The size of the Krylov subspace to use in the eigensolver\n");
1934  printf(" --eig-nConv <n> # The number of converged eigenvalues requested\n");
1935  printf(" --eig-require-convergence <true/false> # If true, the solver will error out if convergence is not "
1936  "attained. If false, a warning will be given (default true)\n");
1937  printf(" --eig-check-interval <n> # Perform a convergence check every nth restart/iteration in "
1938  "the IRAM,IRLM/lanczos,arnoldi eigensolver\n");
1939  printf(" --eig-max-restarts <n> # Perform n iterations of the restart in the eigensolver\n");
1940  printf(" --eig-tol <tol> # The tolerance to use in the eigensolver\n");
1941  printf(" --eig-use-poly-acc <true/false> # Use Chebyshev polynomial acceleration in the eigensolver\n");
1942  printf(" --eig-poly-deg <n> # Set the degree of the Chebyshev polynomial acceleration in "
1943  "the eigensolver\n");
1944  printf(" --eig-amin <Float> # The minimum in the polynomial acceleration\n");
1945  printf(" --eig-amax <Float> # The maximum in the polynomial acceleration\n");
1946  printf(" --eig-use-normop <true/false> # Solve the MdagM problem instead of M (MMdag if "
1947  "eig-use-dagger == true) (default false)\n");
1948  printf(" --eig-use-dagger <true/false> # Solve the Mdag problem instead of M (MMdag if "
1949  "eig-use-normop == true) (default false)\n");
1950  printf(" --eig-compute-svd <true/false> # Solve the MdagM problem, use to compute SVD of M (default "
1951  "false)\n");
1952  printf(" --eig-spectrum <SR/LR/SM/LM/SI/LI> # The spectrum part to be calulated. S=smallest L=largest "
1953  "R=real M=modulus I=imaginary\n");
1954  printf(" --eig-type <eigensolver> # The type of eigensolver to use (default trlm)\n");
1955  printf(" --eig-QUDA-logfile <file_name> # The filename storing the stdout from the QUDA eigensolver\n");
1956  printf(" --eig-arpack-check <true/false> # Cross check the device data against ARPACK (requires ARPACK, "
1957  "default false)\n");
1958  printf(" --eig-load-vec <file> # Load eigenvectors to <file> (requires QIO)\n");
1959  printf(" --eig-save-vec <file> # Save eigenvectors to <file> (requires QIO)\n");
1960 
1961  // Multigrid Eigensolver
1962  printf(" --mg-eig <level> <true/false> # Use the eigensolver on this level (default false)\n");
1963  printf(" --mg-eig-nEv <level> <n> # The size of eigenvector search space in the "
1964  "eigensolver\n");
1965  printf(" --mg-eig-nKr <level> <n> # The size of the Krylov subspace to use in the "
1966  "eigensolver\n");
1967  printf(" --mg-eig-require-convergence <level> <true/false> # If true, the solver will error out if convergence is "
1968  "not attained. If false, a warning will be given (default true)\n");
1969  printf(" --mg-eig-check-interval <level> <n> # Perform a convergence check every nth "
1970  "restart/iteration (only used in Implicit Restart types)\n");
1971  printf(" --mg-eig-max-restarts <level> <n> # Perform a maximun of n restarts in eigensolver "
1972  "(default 100)\n");
1973  printf(" --mg-eig-use-normop <level> <true/false> # Solve the MdagM problem instead of M (MMdag if "
1974  "eig-use-dagger == true) (default false)\n");
1975  printf(" --mg-eig-use-dagger <level> <true/false> # Solve the MMdag problem instead of M (MMdag if "
1976  "eig-use-normop == true) (default false)\n");
1977  printf(
1978  " --mg-eig-tol <level> <tol> # The tolerance to use in the eigensolver (default 1e-6)\n");
1979  printf(" --mg-eig-use-poly-acc <level> <true/false> # Use Chebyshev polynomial acceleration in the "
1980  "eigensolver (default true)\n");
1981  printf(" --mg-eig-poly-deg <level> <n> # Set the degree of the Chebyshev polynomial (default "
1982  "100)\n");
1983  printf(" --mg-eig-amin <level> <Float> # The minimum in the polynomial acceleration (default "
1984  "0.1)\n");
1985  printf(" --mg-eig-amax <level> <Float> # The maximum in the polynomial acceleration (default "
1986  "4.0)\n");
1987  printf(" --mg-eig-spectrum <level> <SR/LR/SM/LM/SI/LI> # The spectrum part to be calulated. S=smallest "
1988  "L=largest R=real M=modulus I=imaginary (default SR)\n");
1989  printf(" --mg-eig-coarse-guess <true/false> # If deflating on the coarse grid, optionaly use an "
1990  "initial guess (default = false)\n");
1991  printf(" --mg-eig-type <level> <eigensolver> # The type of eigensolver to use (default trlm)\n");
1992 
1993  // Miscellanea
1994  printf(" --nsrc <n> # How many spinors to apply the dslash to simultaneusly (experimental for staggered only)\n");
1995  printf(" --msrc <n> # Used for testing non-square block blas routines where nsrc defines the other dimension\n");
1996  printf(" --heatbath-beta <beta> # Beta value used in heatbath test (default 6.2)\n");
1997  printf(" --heatbath-warmup-steps <n> # Number of warmup steps in heatbath test (default 10)\n");
1998  printf(" --heatbath-num-steps <n> # Number of measurement steps in heatbath test (default 10)\n");
1999  printf(" --heatbath-num-hb-per-step <n> # Number of heatbath hits per heatbath step (default 5)\n");
2000  printf(" --heatbath-num-or-per-step <n> # Number of overrelaxation hits per heatbath step (default 5)\n");
2001  printf(" --heatbath-coldstart <true/false> # Whether to use a cold or hot start in heatbath test (default false)\n");
2002 
2003  printf(" --contraction-type <open/dr/dp> # Whether to leave spin elemental open, or use a gamma basis "
2004  "and contract on spin (default open)\n");
2005 
2006  printf(" --help # Print out this message\n");
2007 
2008  usage_extra(argv);
2009 #ifdef MULTI_GPU
2010  char msg[]="multi";
2011 #else
2012  char msg[]="single";
2013 #endif
2014  printf("Note: this program is %s GPU build\n", msg);
2015  exit(1);
2016  return ;
2017 }
2018 
2019 int process_command_line_option(int argc, char** argv, int* idx)
2020 {
2021 #ifdef MULTI_GPU
2022  char msg[]="multi";
2023 #else
2024  char msg[]="single";
2025 #endif
2026 
2027  int ret = -1;
2028 
2029  int i = *idx;
2030 
2031  if( strcmp(argv[i], "--help")== 0){
2032  usage(argv);
2033  }
2034 
2035  if( strcmp(argv[i], "--verify") == 0){
2036  if (i + 1 >= argc) { usage(argv); }
2037 
2038  if (strcmp(argv[i+1], "true") == 0){
2039  verify_results = true;
2040  }else if (strcmp(argv[i+1], "false") == 0){
2041  verify_results = false;
2042  }else{
2043  fprintf(stderr, "ERROR: invalid verify type\n");
2044  exit(1);
2045  }
2046 
2047  i++;
2048  ret = 0;
2049  goto out;
2050  }
2051 
2052  if (strcmp(argv[i], "--mg-low-mode-check") == 0) {
2053  if (i + 1 >= argc) { usage(argv); }
2054 
2055  if (strcmp(argv[i + 1], "true") == 0) {
2056  low_mode_check = true;
2057  } else if (strcmp(argv[i + 1], "false") == 0) {
2058  low_mode_check = false;
2059  } else {
2060  fprintf(stderr, "ERROR: invalid low_mode_check type (true/false)\n");
2061  exit(1);
2062  }
2063 
2064  i++;
2065  ret = 0;
2066  goto out;
2067  }
2068 
2069  if (strcmp(argv[i], "--mg-oblique-proj-check") == 0) {
2070  if (i + 1 >= argc) { usage(argv); }
2071 
2072  if (strcmp(argv[i + 1], "true") == 0) {
2073  oblique_proj_check = true;
2074  } else if (strcmp(argv[i + 1], "false") == 0) {
2075  oblique_proj_check = false;
2076  } else {
2077  fprintf(stderr, "ERROR: invalid oblique_proj_check type (true/false)\n");
2078  exit(1);
2079  }
2080 
2081  i++;
2082  ret = 0;
2083  goto out;
2084  }
2085 
2086  if( strcmp(argv[i], "--device") == 0){
2087  if (i+1 >= argc){
2088  usage(argv);
2089  }
2090  device = atoi(argv[i+1]);
2091  if (device < 0 || device > 16){
2092  printf("ERROR: Invalid CUDA device number (%d)\n", device);
2093  usage(argv);
2094  }
2095  i++;
2096  ret = 0;
2097  goto out;
2098  }
2099 
2100  if( strcmp(argv[i], "--verbosity") == 0){
2101  if (i+1 >= argc){
2102  usage(argv);
2103  }
2104  verbosity = get_verbosity_type(argv[i+1]);
2105  i++;
2106  ret = 0;
2107  goto out;
2108  }
2109 
2110  if( strcmp(argv[i], "--prec") == 0){
2111  if (i + 1 >= argc) { usage(argv); }
2112  prec = get_prec(argv[i+1]);
2113  i++;
2114  ret = 0;
2115  goto out;
2116  }
2117 
2118  if( strcmp(argv[i], "--prec-sloppy") == 0){
2119  if (i + 1 >= argc) { usage(argv); }
2120  prec_sloppy = get_prec(argv[i+1]);
2121  i++;
2122  ret = 0;
2123  goto out;
2124  }
2125 
2126  if( strcmp(argv[i], "--prec-refine") == 0){
2127  if (i + 1 >= argc) { usage(argv); }
2128  prec_refinement_sloppy = get_prec(argv[i+1]);
2129  i++;
2130  ret = 0;
2131  goto out;
2132  }
2133 
2134  if( strcmp(argv[i], "--prec-precondition") == 0){
2135  if (i+1 >= argc){
2136  usage(argv);
2137  }
2138  prec_precondition = get_prec(argv[i+1]);
2139  i++;
2140  ret = 0;
2141  goto out;
2142  }
2143 
2144  if( strcmp(argv[i], "--prec-null") == 0){
2145  if (i+1 >= argc){
2146  usage(argv);
2147  }
2148  prec_null = get_prec(argv[i+1]);
2149  i++;
2150  ret = 0;
2151  goto out;
2152  }
2153 
2154  if( strcmp(argv[i], "--prec-ritz") == 0){
2155  if (i+1 >= argc){
2156  usage(argv);
2157  }
2158  prec_ritz = get_prec(argv[i+1]);
2159  i++;
2160  ret = 0;
2161  goto out;
2162  }
2163 
2164  if( strcmp(argv[i], "--recon") == 0){
2165  if (i+1 >= argc){
2166  usage(argv);
2167  }
2168  link_recon = get_recon(argv[i+1]);
2169  i++;
2170  ret = 0;
2171  goto out;
2172  }
2173 
2174  if( strcmp(argv[i], "--recon-sloppy") == 0){
2175  if (i+1 >= argc){
2176  usage(argv);
2177  }
2178  link_recon_sloppy = get_recon(argv[i+1]);
2179  i++;
2180  ret = 0;
2181  goto out;
2182  }
2183 
2184  if( strcmp(argv[i], "--recon-precondition") == 0){
2185  if (i+1 >= argc){
2186  usage(argv);
2187  }
2188  link_recon_precondition = get_recon(argv[i+1]);
2189  i++;
2190  ret = 0;
2191  goto out;
2192  }
2193 
2194  if( strcmp(argv[i], "--dim") == 0){
2195  if (i+4 >= argc){
2196  usage(argv);
2197  }
2198  xdim= atoi(argv[i+1]);
2199  if (xdim < 0 || xdim > 512){
2200  printf("ERROR: invalid X dimension (%d)\n", xdim);
2201  usage(argv);
2202  }
2203  i++;
2204 
2205  if (i+1 >= argc){
2206  usage(argv);
2207  }
2208  ydim= atoi(argv[i+1]);
2209  if (ydim < 0 || ydim > 512){
2210  printf("ERROR: invalid Y dimension (%d)\n", ydim);
2211  usage(argv);
2212  }
2213  i++;
2214 
2215  if (i+1 >= argc){
2216  usage(argv);
2217  }
2218  zdim= atoi(argv[i+1]);
2219  if (zdim < 0 || zdim > 512){
2220  printf("ERROR: invalid Z dimension (%d)\n", zdim);
2221  usage(argv);
2222  }
2223  i++;
2224 
2225  if (i+1 >= argc){
2226  usage(argv);
2227  }
2228  tdim= atoi(argv[i+1]);
2229  if (tdim < 0 || tdim > 512){
2230  printf("ERROR: invalid T dimension (%d)\n", tdim);
2231  usage(argv);
2232  }
2233  i++;
2234 
2235  ret = 0;
2236  goto out;
2237  }
2238 
2239  if( strcmp(argv[i], "--xdim") == 0){
2240  if (i+1 >= argc){
2241  usage(argv);
2242  }
2243  xdim= atoi(argv[i+1]);
2244  if (xdim < 0 || xdim > 512){
2245  printf("ERROR: invalid X dimension (%d)\n", xdim);
2246  usage(argv);
2247  }
2248  i++;
2249  ret = 0;
2250  goto out;
2251  }
2252 
2253  if( strcmp(argv[i], "--ydim") == 0){
2254  if (i+1 >= argc){
2255  usage(argv);
2256  }
2257  ydim= atoi(argv[i+1]);
2258  if (ydim < 0 || ydim > 512){
2259  printf("ERROR: invalid T dimension (%d)\n", ydim);
2260  usage(argv);
2261  }
2262  i++;
2263  ret = 0;
2264  goto out;
2265  }
2266 
2267 
2268  if( strcmp(argv[i], "--zdim") == 0){
2269  if (i+1 >= argc){
2270  usage(argv);
2271  }
2272  zdim= atoi(argv[i+1]);
2273  if (zdim < 0 || zdim > 512){
2274  printf("ERROR: invalid T dimension (%d)\n", zdim);
2275  usage(argv);
2276  }
2277  i++;
2278  ret = 0;
2279  goto out;
2280  }
2281 
2282  if( strcmp(argv[i], "--tdim") == 0){
2283  if (i + 1 >= argc) { usage(argv); }
2284  tdim = atoi(argv[i+1]);
2285  if (tdim < 0 || tdim > 512){
2286  printf("Error: invalid t dimension");
2287  usage(argv);
2288  }
2289  i++;
2290  ret = 0;
2291  goto out;
2292  }
2293 
2294  if( strcmp(argv[i], "--sdim") == 0){
2295  if (i + 1 >= argc) { usage(argv); }
2296  int sdim = atoi(argv[i+1]);
2297  if (sdim < 0 || sdim > 512){
2298  printf("ERROR: invalid S dimension\n");
2299  usage(argv);
2300  }
2301  xdim=ydim=zdim=sdim;
2302  i++;
2303  ret = 0;
2304  goto out;
2305  }
2306 
2307  if( strcmp(argv[i], "--Lsdim") == 0){
2308  if (i + 1 >= argc) { usage(argv); }
2309  int Ls = atoi(argv[i+1]);
2310  if (Ls < 0 || Ls > 128){
2311  printf("ERROR: invalid Ls dimension\n");
2312  usage(argv);
2313  }
2314  Lsdim=Ls;
2315  i++;
2316  ret = 0;
2317  goto out;
2318  }
2319 
2320  if( strcmp(argv[i], "--dagger") == 0){
2321  dagger = QUDA_DAG_YES;
2322  ret = 0;
2323  goto out;
2324  }
2325 
2326  if( strcmp(argv[i], "--partition") == 0){
2327  if (i+1 >= argc){
2328  usage(argv);
2329  }
2330 #ifdef MULTI_GPU
2331  int value = atoi(argv[i+1]);
2332  for(int j=0; j < 4;j++){
2333  if (value & (1 << j)){
2335  dim_partitioned[j] = 1;
2336  }
2337  }
2338 #else
2339  printfQuda("WARNING: Ignoring --partition option since this is a single-GPU build.\n");
2340 #endif
2341  i++;
2342  ret = 0;
2343  goto out;
2344  }
2345 
2346  if( strcmp(argv[i], "--multishift") == 0){
2347  if (i + 1 >= argc) { usage(argv); }
2348 
2349  if (strcmp(argv[i+1], "true") == 0){
2350  multishift = true;
2351  }else if (strcmp(argv[i+1], "false") == 0){
2352  multishift = false;
2353  }else{
2354  fprintf(stderr, "ERROR: invalid multishift boolean\n");
2355  exit(1);
2356  }
2357 
2358  i++;
2359  ret = 0;
2360  goto out;
2361  }
2362 
2363  if( strcmp(argv[i], "--gridsize") == 0){
2364  if (i + 4 >= argc) { usage(argv); }
2365  int xsize = atoi(argv[i+1]);
2366  if (xsize <= 0 ){
2367  printf("ERROR: invalid X grid size");
2368  usage(argv);
2369  }
2370  gridsize_from_cmdline[0] = xsize;
2371  i++;
2372 
2373  int ysize = atoi(argv[i+1]);
2374  if (ysize <= 0 ){
2375  printf("ERROR: invalid Y grid size");
2376  usage(argv);
2377  }
2378  gridsize_from_cmdline[1] = ysize;
2379  i++;
2380 
2381  int zsize = atoi(argv[i+1]);
2382  if (zsize <= 0 ){
2383  printf("ERROR: invalid Z grid size");
2384  usage(argv);
2385  }
2386  gridsize_from_cmdline[2] = zsize;
2387  i++;
2388 
2389  int tsize = atoi(argv[i+1]);
2390  if (tsize <= 0 ){
2391  printf("ERROR: invalid T grid size");
2392  usage(argv);
2393  }
2394  gridsize_from_cmdline[3] = tsize;
2395  i++;
2396 
2397  ret = 0;
2398  goto out;
2399  }
2400 
2401  if( strcmp(argv[i], "--xgridsize") == 0){
2402  if (i + 1 >= argc) { usage(argv); }
2403  int xsize = atoi(argv[i+1]);
2404  if (xsize <= 0 ){
2405  printf("ERROR: invalid X grid size");
2406  usage(argv);
2407  }
2408  gridsize_from_cmdline[0] = xsize;
2409  i++;
2410  ret = 0;
2411  goto out;
2412  }
2413 
2414  if( strcmp(argv[i], "--ygridsize") == 0){
2415  if (i + 1 >= argc) { usage(argv); }
2416  int ysize = atoi(argv[i+1]);
2417  if (ysize <= 0 ){
2418  printf("ERROR: invalid Y grid size");
2419  usage(argv);
2420  }
2421  gridsize_from_cmdline[1] = ysize;
2422  i++;
2423  ret = 0;
2424  goto out;
2425  }
2426 
2427  if( strcmp(argv[i], "--zgridsize") == 0){
2428  if (i + 1 >= argc) { usage(argv); }
2429  int zsize = atoi(argv[i+1]);
2430  if (zsize <= 0 ){
2431  printf("ERROR: invalid Z grid size");
2432  usage(argv);
2433  }
2434  gridsize_from_cmdline[2] = zsize;
2435  i++;
2436  ret = 0;
2437  goto out;
2438  }
2439 
2440  if( strcmp(argv[i], "--tgridsize") == 0){
2441  if (i + 1 >= argc) { usage(argv); }
2442  int tsize = atoi(argv[i+1]);
2443  if (tsize <= 0 ){
2444  printf("ERROR: invalid T grid size");
2445  usage(argv);
2446  }
2447  gridsize_from_cmdline[3] = tsize;
2448  i++;
2449  ret = 0;
2450  goto out;
2451  }
2452 
2453  if( strcmp(argv[i], "--rank-order") == 0){
2454  if (i+1 >= argc){
2455  usage(argv);
2456  }
2457  rank_order = get_rank_order(argv[i+1]);
2458  i++;
2459  ret = 0;
2460  goto out;
2461  }
2462 
2463  if( strcmp(argv[i], "--dslash-type") == 0){
2464  if (i + 1 >= argc) { usage(argv); }
2465  dslash_type = get_dslash_type(argv[i+1]);
2466  i++;
2467  ret = 0;
2468  goto out;
2469  }
2470 
2471  if (strcmp(argv[i], "--laplace3D") == 0) {
2472  if (i + 1 >= argc) { usage(argv); }
2473  laplace3D = atoi(argv[i + 1]);
2474  if (laplace3D > 4 || laplace3D < 3) {
2475  printf("ERROR: invalid transverse dim %d given. Please use 3 or 4 for t,none.\n", laplace3D);
2476  usage(argv);
2477  }
2478  i++;
2479  ret = 0;
2480  goto out;
2481  }
2482 
2483  if( strcmp(argv[i], "--flavor") == 0){
2484  if (i + 1 >= argc) { usage(argv); }
2485  twist_flavor = get_flavor_type(argv[i+1]);
2486  i++;
2487  ret = 0;
2488  goto out;
2489  }
2490 
2491  if( strcmp(argv[i], "--inv-type") == 0){
2492  if (i + 1 >= argc) { usage(argv); }
2493  inv_type = get_solver_type(argv[i+1]);
2494  i++;
2495  ret = 0;
2496  goto out;
2497  }
2498 
2499  if (strcmp(argv[i], "--precon-type") == 0) {
2500  if (i + 1 >= argc) { usage(argv); }
2501  precon_type = get_solver_type(argv[i + 1]);
2502  i++;
2503  ret = 0;
2504  goto out;
2505  }
2506 
2507  if (strcmp(argv[i], "--mass") == 0) {
2508  if (i + 1 >= argc) { usage(argv); }
2509  mass = atof(argv[i+1]);
2510  i++;
2511  ret = 0;
2512  goto out;
2513  }
2514 
2515  if( strcmp(argv[i], "--kappa") == 0){
2516  if (i+1 >= argc){
2517  usage(argv);
2518  }
2519  kappa = atof(argv[i+1]);
2520  i++;
2521  ret = 0;
2522  goto out;
2523  }
2524 
2525  if( strcmp(argv[i], "--compute-clover") == 0){
2526  if (i+1 >= argc){
2527  usage(argv);
2528  }
2529  if (strcmp(argv[i+1], "true") == 0){
2530  compute_clover = true;
2531  }else if (strcmp(argv[i+1], "false") == 0){
2532  compute_clover = false;
2533  }else{
2534  fprintf(stderr, "ERROR: invalid compute_clover type\n");
2535  exit(1);
2536  }
2537 
2538  i++;
2539  ret = 0;
2540  goto out;
2541  }
2542 
2543  if( strcmp(argv[i], "--clover-coeff") == 0){
2544  if (i+1 >= argc){
2545  usage(argv);
2546  }
2547  clover_coeff = atof(argv[i+1]);
2548  i++;
2549  ret = 0;
2550  goto out;
2551  }
2552 
2553  if( strcmp(argv[i], "--mu") == 0){
2554  if (i+1 >= argc){
2555  usage(argv);
2556  }
2557  mu = atof(argv[i+1]);
2558  i++;
2559  ret = 0;
2560  goto out;
2561  }
2562 
2563  if (strcmp(argv[i], "--epsilon") == 0) {
2564  if (i + 1 >= argc) { usage(argv); }
2565  epsilon = atof(argv[i + 1]);
2566  i++;
2567  ret = 0;
2568  goto out;
2569  }
2570 
2571  if( strcmp(argv[i], "--compute-fat-long") == 0){
2572  if (i+1 >= argc){
2573  usage(argv);
2574  }
2575  if (strcmp(argv[i+1], "true") == 0){
2576  compute_fatlong = true;
2577  }else if (strcmp(argv[i+1], "false") == 0){
2578  compute_fatlong = false;
2579  }else{
2580  fprintf(stderr, "ERROR: invalid compute_fatlong type\n");
2581  exit(1);
2582  }
2583 
2584  i++;
2585  ret = 0;
2586  goto out;
2587  }
2588 
2589 
2590  if( strcmp(argv[i], "--epsilon-naik") == 0){
2591  if (i+1 >= argc){
2592  usage(argv);
2593  }
2594  eps_naik = atof(argv[i+1]);
2595  i++;
2596  ret = 0;
2597  goto out;
2598  }
2599 
2600  if( strcmp(argv[i], "--tadpole-coeff") == 0){
2601  if (i+1 >= argc){
2602  usage(argv);
2603  }
2604  tadpole_factor = atof(argv[i+1]);
2605  i++;
2606  ret = 0;
2607  goto out;
2608  }
2609 
2610  if( strcmp(argv[i], "--anisotropy") == 0){
2611  if (i+1 >= argc){
2612  usage(argv);
2613  }
2614  anisotropy = atof(argv[i+1]);
2615  i++;
2616  ret = 0;
2617  goto out;
2618  }
2619 
2620  if( strcmp(argv[i], "--tol") == 0){
2621  if (i+1 >= argc){
2622  usage(argv);
2623  }
2624  tol= atof(argv[i+1]);
2625  i++;
2626  ret = 0;
2627  goto out;
2628  }
2629 
2630  if( strcmp(argv[i], "--tolhq") == 0){
2631  if (i+1 >= argc){
2632  usage(argv);
2633  }
2634  tol_hq= atof(argv[i+1]);
2635  i++;
2636  ret = 0;
2637  goto out;
2638  }
2639 
2640  if( strcmp(argv[i], "--reliable-delta") == 0){
2641  if (i+1 >= argc){
2642  usage(argv);
2643  }
2644  reliable_delta = atof(argv[i+1]);
2645  i++;
2646  ret = 0;
2647  goto out;
2648  }
2649 
2650  if( strcmp(argv[i], "--alternative-reliable") == 0){
2651  if (i+1 >= argc) {
2652  usage(argv);
2653  }
2654  if (strcmp(argv[i+1], "true") == 0) {
2655  alternative_reliable = true;
2656  } else if (strcmp(argv[i+1], "false") == 0) {
2657  alternative_reliable = false;
2658  } else {
2659  fprintf(stderr, "ERROR: invalid multishift boolean\n");
2660  exit(1);
2661  }
2662  i++;
2663  ret = 0;
2664  goto out;
2665  }
2666 
2667  if( strcmp(argv[i], "--mass-normalization") == 0){
2668  if (i+1 >= argc){
2669  usage(argv);
2670  }
2672  i++;
2673  ret = 0;
2674  goto out;
2675  }
2676 
2677  if( strcmp(argv[i], "--matpc") == 0){
2678  if (i+1 >= argc){
2679  usage(argv);
2680  }
2681  matpc_type = get_matpc_type(argv[i+1]);
2682  i++;
2683  ret = 0;
2684  goto out;
2685  }
2686 
2687  if( strcmp(argv[i], "--solve-type") == 0){
2688  if (i+1 >= argc){
2689  usage(argv);
2690  }
2691  solve_type = get_solve_type(argv[i+1]);
2692  i++;
2693  ret = 0;
2694  goto out;
2695  }
2696 
2697  if (strcmp(argv[i], "--solution-type") == 0) {
2698  if (i + 1 >= argc) { usage(argv); }
2699  solution_type = get_solution_type(argv[i + 1]);
2700  i++;
2701  ret = 0;
2702  goto out;
2703  }
2704 
2705  if( strcmp(argv[i], "--load-gauge") == 0){
2706  if (i + 1 >= argc) { usage(argv); }
2707  strcpy(latfile, argv[i+1]);
2708  i++;
2709  ret = 0;
2710  goto out;
2711  }
2712 
2713  if( strcmp(argv[i], "--save-gauge") == 0){
2714  if (i + 1 >= argc) { usage(argv); }
2715  strcpy(gauge_outfile, argv[i+1]);
2716  i++;
2717  ret = 0;
2718  goto out;
2719  }
2720 
2721  if (strcmp(argv[i], "--unit-gauge") == 0) {
2722  if (i + 1 >= argc) { usage(argv); }
2723 
2724  if (strcmp(argv[i + 1], "true") == 0) {
2725  unit_gauge = true;
2726  } else if (strcmp(argv[i + 1], "false") == 0) {
2727  unit_gauge = false;
2728  } else {
2729  fprintf(stderr, "ERROR: invalid unit-gauge type given (true/false)\n");
2730  exit(1);
2731  }
2732 
2733  i++;
2734  ret = 0;
2735  goto out;
2736  }
2737 
2738  if( strcmp(argv[i], "--nsrc") == 0){
2739  if (i+1 >= argc){
2740  usage(argv);
2741  }
2742  Nsrc = atoi(argv[i+1]);
2743  if (Nsrc < 0 || Nsrc > 128){ // allow 0 for testing setup in isolation
2744  printf("ERROR: invalid number of sources (Nsrc=%d)\n", Nsrc);
2745  usage(argv);
2746  }
2747  i++;
2748  ret = 0;
2749  goto out;
2750  }
2751 
2752  if( strcmp(argv[i], "--msrc") == 0){
2753  if (i+1 >= argc){
2754  usage(argv);
2755  }
2756  Msrc = atoi(argv[i+1]);
2757  if (Msrc < 1 || Msrc > 128){
2758  printf("ERROR: invalid number of sources (Msrc=%d)\n", Msrc);
2759  usage(argv);
2760  }
2761  i++;
2762  ret = 0;
2763  goto out;
2764  }
2765 
2766  if( strcmp(argv[i], "--test") == 0){
2767  if (i + 1 >= argc) { usage(argv); }
2768  test_type = atoi(argv[i+1]);
2769  i++;
2770  ret = 0;
2771  goto out;
2772  }
2773 
2774  if( strcmp(argv[i], "--mg-nvec") == 0){
2775  if (i+2 >= argc){
2776  usage(argv);
2777  }
2778  int level = atoi(argv[i+1]);
2779  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2780  printf("ERROR: invalid multigrid level %d", level);
2781  usage(argv);
2782  }
2783  i++;
2784 
2785  nvec[level] = atoi(argv[i+1]);
2786  if (nvec[level] < 0 || nvec[level] > 1024) {
2787  printf("ERROR: invalid number of vectors (%d)\n", nvec[level]);
2788  usage(argv);
2789  }
2790  i++;
2791  ret = 0;
2792  goto out;
2793  }
2794 
2795  if( strcmp(argv[i], "--mg-levels") == 0){
2796  if (i+1 >= argc){
2797  usage(argv);
2798  }
2799  mg_levels= atoi(argv[i+1]);
2800  if (mg_levels < 2 || mg_levels > QUDA_MAX_MG_LEVEL){
2801  printf("ERROR: invalid number of multigrid levels (%d)\n", mg_levels);
2802  usage(argv);
2803  }
2804  i++;
2805  ret = 0;
2806  goto out;
2807  }
2808 
2809  if( strcmp(argv[i], "--mg-nu-pre") == 0){
2810  if (i+1 >= argc){
2811  usage(argv);
2812  }
2813  int level = atoi(argv[i+1]);
2814  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2815  printf("ERROR: invalid multigrid level %d", level);
2816  usage(argv);
2817  }
2818  i++;
2819 
2820  nu_pre[level] = atoi(argv[i+1]);
2821  if (nu_pre[level] < 0 || nu_pre[level] > 20){
2822  printf("ERROR: invalid pre-smoother applications value (nu_pre=%d)\n", nu_pre[level]);
2823  usage(argv);
2824  }
2825  i++;
2826  ret = 0;
2827  goto out;
2828  }
2829 
2830  if( strcmp(argv[i], "--mg-nu-post") == 0){
2831  if (i+1 >= argc){
2832  usage(argv);
2833  }
2834  int level = atoi(argv[i+1]);
2835  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2836  printf("ERROR: invalid multigrid level %d", level);
2837  usage(argv);
2838  }
2839  i++;
2840 
2841  nu_post[level] = atoi(argv[i+1]);
2842  if (nu_post[level] < 0 || nu_post[level] > 20){
2843  printf("ERROR: invalid pre-smoother applications value (nu_post=%d)\n", nu_post[level]);
2844  usage(argv);
2845  }
2846  i++;
2847  ret = 0;
2848  goto out;
2849  }
2850 
2851  if( strcmp(argv[i], "--mg-coarse-solve-type") == 0){
2852  if (i+1 >= argc){
2853  usage(argv);
2854  }
2855  int level = atoi(argv[i+1]);
2856  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2857  printf("ERROR: invalid multigrid level %d", level);
2858  usage(argv);
2859  }
2860  i++;
2861 
2862  coarse_solve_type[level] = get_solve_type(argv[i+1]);
2863  i++;
2864  ret = 0;
2865  goto out;
2866  }
2867 
2868  if( strcmp(argv[i], "--mg-smoother-solve-type") == 0){
2869  if (i+1 >= argc){
2870  usage(argv);
2871  }
2872  int level = atoi(argv[i+1]);
2873  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2874  printf("ERROR: invalid multigrid level %d", level);
2875  usage(argv);
2876  }
2877  i++;
2878 
2879  smoother_solve_type[level] = get_solve_type(argv[i+1]);
2880  i++;
2881  ret = 0;
2882  goto out;
2883  }
2884 
2885  if( strcmp(argv[i], "--mg-solver-location") == 0){
2886  if (i+2 >= argc){
2887  usage(argv);
2888  }
2889  int level = atoi(argv[i+1]);
2890 
2891  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2892  printf("ERROR: invalid multigrid level %d", level);
2893  usage(argv);
2894  }
2895  i++;
2896 
2897  solver_location[level] = get_location(argv[i+1]);
2898  i++;
2899  ret = 0;
2900  goto out;
2901  }
2902 
2903  if( strcmp(argv[i], "--mg-setup-location") == 0){
2904  if (i+2 >= argc){
2905  usage(argv);
2906  }
2907  int level = atoi(argv[i+1]);
2908 
2909  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2910  printf("ERROR: invalid multigrid level %d", level);
2911  usage(argv);
2912  }
2913  i++;
2914 
2915  setup_location[level] = get_location(argv[i+1]);
2916  i++;
2917  ret = 0;
2918  goto out;
2919  }
2920 
2921  if( strcmp(argv[i], "--mg-setup-inv") == 0){
2922  if (i+2 >= argc){
2923  usage(argv);
2924  }
2925  int level = atoi(argv[i+1]);
2926  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2927  printf("ERROR: invalid multigrid level %d", level);
2928  usage(argv);
2929  }
2930  i++;
2931 
2932  setup_inv[level] = get_solver_type(argv[i+1]);
2933  i++;
2934  ret = 0;
2935  goto out;
2936  }
2937 
2938  if( strcmp(argv[i], "--mg-setup-iters") == 0){
2939  if (i+2 >= argc){
2940  usage(argv);
2941  }
2942  int level = atoi(argv[i+1]);
2943  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2944  printf("ERROR: invalid multigrid level %d", level);
2945  usage(argv);
2946  }
2947  i++;
2948 
2949  num_setup_iter[level] = atoi(argv[i+1]);
2950  i++;
2951  ret = 0;
2952  goto out;
2953  }
2954 
2955  if( strcmp(argv[i], "--mg-setup-tol") == 0){
2956  if (i+1 >= argc){
2957  usage(argv);
2958  }
2959  int level = atoi(argv[i+1]);
2960  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2961  printf("ERROR: invalid multigrid level %d", level);
2962  usage(argv);
2963  }
2964  i++;
2965 
2966  setup_tol[level] = atof(argv[i+1]);
2967  i++;
2968  ret = 0;
2969  goto out;
2970  }
2971 
2972 
2973  if( strcmp(argv[i], "--mg-setup-maxiter") == 0){
2974  if (i+2 >= argc){
2975  usage(argv);
2976  }
2977  int level = atoi(argv[i+1]);
2978  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2979  printf("ERROR: invalid multigrid level %d", level);
2980  usage(argv);
2981  }
2982  i++;
2983 
2984  setup_maxiter[level] = atoi(argv[i+1]);
2985  i++;
2986  ret = 0;
2987  goto out;
2988  }
2989 
2990  if( strcmp(argv[i], "--mg-setup-maxiter-refresh") == 0){
2991  if (i+2 >= argc){
2992  usage(argv);
2993  }
2994  int level = atoi(argv[i+1]);
2995  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
2996  printf("ERROR: invalid multigrid level %d", level);
2997  usage(argv);
2998  }
2999  i++;
3000 
3001  setup_maxiter_refresh[level] = atoi(argv[i+1]);
3002  i++;
3003  ret = 0;
3004  goto out;
3005  }
3006 
3007  if( strcmp(argv[i], "--mg-setup-ca-basis-type") == 0){
3008  if (i+2 >= argc){
3009  usage(argv);
3010  }
3011  int level = atoi(argv[i+1]);
3012  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3013  printf("ERROR: invalid multigrid level %d", level);
3014  usage(argv);
3015  }
3016  i++;
3017 
3018  if (strcmp(argv[i+1], "power") == 0){
3020  }else if (strcmp(argv[i+1], "chebyshev") == 0 || strcmp(argv[i+1], "cheby") == 0){
3022  }else{
3023  fprintf(stderr, "ERROR: invalid value for basis ca cg type\n");
3024  exit(1);
3025  }
3026 
3027  i++;
3028  ret = 0;
3029  goto out;
3030  }
3031 
3032  if( strcmp(argv[i], "--mg-setup-ca-basis-size") == 0){
3033  if (i+2 >= argc){
3034  usage(argv);
3035  }
3036  int level = atoi(argv[i+1]);
3037  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3038  printf("ERROR: invalid multigrid level %d", level);
3039  usage(argv);
3040  }
3041  i++;
3042 
3043  setup_ca_basis_size[level] = atoi(argv[i+1]);
3044  if (setup_ca_basis_size[level] < 1){
3045  printf("ERROR: invalid basis size for CA-CG (%d < 1)\n", setup_ca_basis_size[level]);
3046  usage(argv);
3047  }
3048  i++;
3049  ret = 0;
3050  goto out;
3051  }
3052 
3053  if( strcmp(argv[i], "--mg-setup-cheby-basis-eig-min") == 0){
3054  if (i+2 >= argc){
3055  usage(argv);
3056  }
3057  int level = atoi(argv[i+1]);
3058  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3059  printf("ERROR: invalid multigrid level %d", level);
3060  usage(argv);
3061  }
3062  i++;
3063 
3064  setup_ca_lambda_min[level] = atof(argv[i+1]);
3065  if (setup_ca_lambda_min[level] < 0.0){
3066  printf("ERROR: invalid minimum eigenvalue for CA-CG (%f < 0.0)\n", setup_ca_lambda_min[level]);
3067  usage(argv);
3068  }
3069  i++;
3070  ret = 0;
3071  goto out;
3072  }
3073 
3074  if( strcmp(argv[i], "--mg-setup-cheby-basis-eig-max") == 0){
3075  if (i+2 >= argc){
3076  usage(argv);
3077  }
3078  int level = atoi(argv[i+1]);
3079  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3080  printf("ERROR: invalid multigrid level %d", level);
3081  usage(argv);
3082  }
3083  i++;
3084 
3085  setup_ca_lambda_max[level] = atof(argv[i+1]);
3086  // negative value induces power iterations to guess
3087  i++;
3088  ret = 0;
3089  goto out;
3090  }
3091 
3092  if( strcmp(argv[i], "--mg-setup-type") == 0){
3093  if (i+1 >= argc){
3094  usage(argv);
3095  }
3096 
3097  if( strcmp(argv[i+1], "test") == 0)
3099  else if( strcmp(argv[i+1], "null")==0)
3101  else {
3102  fprintf(stderr, "ERROR: invalid setup type\n");
3103  exit(1);
3104  }
3105 
3106  i++;
3107  ret = 0;
3108  goto out;
3109  }
3110 
3111  if( strcmp(argv[i], "--mg-pre-orth") == 0){
3112  if (i+1 >= argc){
3113  usage(argv);
3114  }
3115 
3116  if (strcmp(argv[i+1], "true") == 0){
3117  pre_orthonormalize = true;
3118  }else if (strcmp(argv[i+1], "false") == 0){
3119  pre_orthonormalize = false;
3120  }else{
3121  fprintf(stderr, "ERROR: invalid pre orthogonalize type\n");
3122  exit(1);
3123  }
3124 
3125  i++;
3126  ret = 0;
3127  goto out;
3128  }
3129 
3130  if( strcmp(argv[i], "--mg-post-orth") == 0){
3131  if (i+1 >= argc){
3132  usage(argv);
3133  }
3134 
3135  if (strcmp(argv[i+1], "true") == 0){
3136  post_orthonormalize = true;
3137  }else if (strcmp(argv[i+1], "false") == 0){
3138  post_orthonormalize = false;
3139  }else{
3140  fprintf(stderr, "ERROR: invalid post orthogonalize type\n");
3141  exit(1);
3142  }
3143 
3144  i++;
3145  ret = 0;
3146  goto out;
3147  }
3148 
3149  if (strcmp(argv[i], "--mg-n-block-ortho") == 0) {
3150  if (i + 2 >= argc) { usage(argv); }
3151 
3152  int level = atoi(argv[i + 1]);
3153  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3154  printf("ERROR: invalid multigrid level %d for number of block orthos", level);
3155  usage(argv);
3156  }
3157  i++;
3158 
3159  n_block_ortho[level] = atoi(argv[i + 1]);
3160  if (n_block_ortho[level] < 1) {
3161  fprintf(stderr, "ERROR: invalid number %d of block orthonormalizations", n_block_ortho[level]);
3162  }
3163  i++;
3164  ret = 0;
3165  goto out;
3166  }
3167 
3168  if( strcmp(argv[i], "--mg-omega") == 0){
3169  if (i+1 >= argc){
3170  usage(argv);
3171  }
3172 
3173  omega = atof(argv[i+1]);
3174  i++;
3175  ret = 0;
3176  goto out;
3177  }
3178 
3179  if( strcmp(argv[i], "--mg-verbosity") == 0){
3180  if (i+2 >= argc){
3181  usage(argv);
3182  }
3183  int level = atoi(argv[i+1]);
3184  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3185  printf("ERROR: invalid multigrid level %d", level);
3186  usage(argv);
3187  }
3188  i++;
3189 
3190  mg_verbosity[level] = get_verbosity_type(argv[i+1]);
3191  i++;
3192  ret = 0;
3193  goto out;
3194  }
3195 
3196  if( strcmp(argv[i], "--mg-coarse-solver") == 0){
3197  if (i+2 >= argc){
3198  usage(argv);
3199  }
3200  int level = atoi(argv[i+1]);
3201  if (level < 1 || level >= QUDA_MAX_MG_LEVEL) {
3202  printf("ERROR: invalid multigrid level %d for coarse solver", level);
3203  usage(argv);
3204  }
3205  i++;
3206 
3207  coarse_solver[level] = get_solver_type(argv[i+1]);
3208  i++;
3209  ret = 0;
3210  goto out;
3211  }
3212 
3213  if( strcmp(argv[i], "--mg-smoother") == 0){
3214  if (i+2 >= argc){
3215  usage(argv);
3216  }
3217  int level = atoi(argv[i+1]);
3218  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3219  printf("ERROR: invalid multigrid level %d", level);
3220  usage(argv);
3221  }
3222  i++;
3223 
3224  smoother_type[level] = get_solver_type(argv[i+1]);
3225  i++;
3226  ret = 0;
3227  goto out;
3228  }
3229 
3230  if( strcmp(argv[i], "--mg-smoother-tol") == 0){
3231  if (i+2 >= argc){
3232  usage(argv);
3233  }
3234  int level = atoi(argv[i+1]);
3235  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3236  printf("ERROR: invalid multigrid level %d", level);
3237  usage(argv);
3238  }
3239  i++;
3240 
3241  smoother_tol[level] = atof(argv[i+1]);
3242 
3243  i++;
3244  ret = 0;
3245  goto out;
3246  }
3247 
3248  if( strcmp(argv[i], "--mg-coarse-solver-tol") == 0){
3249  if (i+2 >= argc){
3250  usage(argv);
3251  }
3252  int level = atoi(argv[i+1]);
3253  if (level < 1 || level >= QUDA_MAX_MG_LEVEL) {
3254  printf("ERROR: invalid multigrid level %d for coarse solver", level);
3255  usage(argv);
3256  }
3257  i++;
3258 
3259  coarse_solver_tol[level] = atof(argv[i+1]);
3260  i++;
3261  ret = 0;
3262  goto out;
3263  }
3264 
3265  if( strcmp(argv[i], "--mg-coarse-solver-maxiter") == 0){
3266  if (i+2 >= argc){
3267  usage(argv);
3268  }
3269 
3270  int level = atoi(argv[i+1]);
3271  if (level < 1 || level >= QUDA_MAX_MG_LEVEL) {
3272  printf("ERROR: invalid multigrid level %d for coarse solver", level);
3273  usage(argv);
3274  }
3275  i++;
3276 
3277  coarse_solver_maxiter[level] = atoi(argv[i+1]);
3278  i++;
3279  ret = 0;
3280  goto out;
3281  }
3282 
3283  if( strcmp(argv[i], "--mg-coarse-solver-ca-basis-type") == 0){
3284  if (i+2 >= argc){
3285  usage(argv);
3286  }
3287  int level = atoi(argv[i+1]);
3288  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3289  printf("ERROR: invalid multigrid level %d", level);
3290  usage(argv);
3291  }
3292  i++;
3293 
3294  if (strcmp(argv[i+1], "power") == 0){
3296  }else if (strcmp(argv[i+1], "chebyshev") == 0 || strcmp(argv[i+1], "cheby") == 0){
3298  }else{
3299  fprintf(stderr, "ERROR: invalid value for basis ca cg type\n");
3300  exit(1);
3301  }
3302 
3303  i++;
3304  ret = 0;
3305  goto out;
3306  }
3307 
3308  if( strcmp(argv[i], "--mg-coarse-solver-ca-basis-size") == 0){
3309  if (i+2 >= argc){
3310  usage(argv);
3311  }
3312  int level = atoi(argv[i+1]);
3313  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3314  printf("ERROR: invalid multigrid level %d", level);
3315  usage(argv);
3316  }
3317  i++;
3318 
3319  coarse_solver_ca_basis_size[level] = atoi(argv[i+1]);
3320  if (coarse_solver_ca_basis_size[level] < 1){
3321  printf("ERROR: invalid basis size for CA-CG (%d < 1)\n", coarse_solver_ca_basis_size[level]);
3322  usage(argv);
3323  }
3324  i++;
3325  ret = 0;
3326  goto out;
3327  }
3328 
3329  if( strcmp(argv[i], "--mg-coarse-solver-cheby-basis-eig-min") == 0){
3330  if (i+2 >= argc){
3331  usage(argv);
3332  }
3333  int level = atoi(argv[i+1]);
3334  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3335  printf("ERROR: invalid multigrid level %d", level);
3336  usage(argv);
3337  }
3338  i++;
3339 
3340  coarse_solver_ca_lambda_min[level] = atof(argv[i+1]);
3341  if (coarse_solver_ca_lambda_min[level] < 0.0){
3342  printf("ERROR: invalid minimum eigenvalue for CA-CG (%f < 0.0)\n", coarse_solver_ca_lambda_min[level]);
3343  usage(argv);
3344  }
3345  i++;
3346  ret = 0;
3347  goto out;
3348  }
3349 
3350  if( strcmp(argv[i], "--mg-coarse-solver-cheby-basis-eig-max") == 0){
3351  if (i+2 >= argc){
3352  usage(argv);
3353  }
3354  int level = atoi(argv[i+1]);
3355  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3356  printf("ERROR: invalid multigrid level %d", level);
3357  usage(argv);
3358  }
3359  i++;
3360 
3361  coarse_solver_ca_lambda_max[level] = atof(argv[i+1]);
3362  // negative value induces power iterations to guess
3363  i++;
3364  ret = 0;
3365  goto out;
3366  }
3367 
3368 
3369 
3370  if( strcmp(argv[i], "--mg-smoother-halo-prec") == 0){
3371  if (i+1 >= argc){
3372  usage(argv);
3373  }
3374  smoother_halo_prec = get_prec(argv[i+1]);
3375  i++;
3376  ret = 0;
3377  goto out;
3378  }
3379 
3380 
3381  if( strcmp(argv[i], "--mg-schwarz-type") == 0){
3382  if (i+2 >= argc){
3383  usage(argv);
3384  }
3385  int level = atoi(argv[i+1]);
3386  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3387  printf("ERROR: invalid multigrid level %d", level);
3388  usage(argv);
3389  }
3390  i++;
3391 
3392  schwarz_type[level] = get_schwarz_type(argv[i+1]);
3393  i++;
3394  ret = 0;
3395  goto out;
3396  }
3397 
3398  if( strcmp(argv[i], "--mg-schwarz-cycle") == 0){
3399  if (i+2 >= argc){
3400  usage(argv);
3401  }
3402  int level = atoi(argv[i+1]);
3403  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3404  printf("ERROR: invalid multigrid level %d", level);
3405  usage(argv);
3406  }
3407  i++;
3408 
3409  schwarz_cycle[level] = atoi(argv[i+1]);
3410  if (schwarz_cycle[level] < 0 || schwarz_cycle[level] >= 128) {
3411  printf("ERROR: invalid Schwarz cycle value requested %d for level %d",
3412  level, schwarz_cycle[level]);
3413  usage(argv);
3414  }
3415  i++;
3416  ret = 0;
3417  goto out;
3418  }
3419 
3420  if( strcmp(argv[i], "--mg-block-size") == 0){
3421  if (i + 5 >= argc) { usage(argv); }
3422  int level = atoi(argv[i+1]);
3423  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3424  printf("ERROR: invalid multigrid level %d", level);
3425  usage(argv);
3426  }
3427  i++;
3428 
3429  int xsize = atoi(argv[i+1]);
3430  if (xsize <= 0 ){
3431  printf("ERROR: invalid X block size");
3432  usage(argv);
3433  }
3434  geo_block_size[level][0] = xsize;
3435  i++;
3436 
3437  int ysize = atoi(argv[i+1]);
3438  if (ysize <= 0 ){
3439  printf("ERROR: invalid Y block size");
3440  usage(argv);
3441  }
3442  geo_block_size[level][1] = ysize;
3443  i++;
3444 
3445  int zsize = atoi(argv[i+1]);
3446  if (zsize <= 0 ){
3447  printf("ERROR: invalid Z block size");
3448  usage(argv);
3449  }
3450  geo_block_size[level][2] = zsize;
3451  i++;
3452 
3453  int tsize = atoi(argv[i+1]);
3454  if (tsize <= 0 ){
3455  printf("ERROR: invalid T block size");
3456  usage(argv);
3457  }
3458  geo_block_size[level][3] = tsize;
3459  i++;
3460 
3461  ret = 0;
3462  goto out;
3463  }
3464 
3465  if( strcmp(argv[i], "--mg-mu-factor") == 0){
3466  if (i+2 >= argc){
3467  usage(argv);
3468  }
3469  int level = atoi(argv[i+1]);
3470  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3471  printf("ERROR: invalid multigrid level %d", level);
3472  usage(argv);
3473  }
3474  i++;
3475 
3476  double factor = atof(argv[i+1]);
3477  mu_factor[level] = factor;
3478  i++;
3479  ret=0;
3480  goto out;
3481  }
3482 
3483  if( strcmp(argv[i], "--mg-generate-nullspace") == 0){
3484  if (i+1 >= argc){
3485  usage(argv);
3486  }
3487 
3488  if (strcmp(argv[i+1], "true") == 0){
3489  generate_nullspace = true;
3490  }else if (strcmp(argv[i+1], "false") == 0){
3491  generate_nullspace = false;
3492  }else{
3493  fprintf(stderr, "ERROR: invalid generate nullspace type\n");
3494  exit(1);
3495  }
3496 
3497  i++;
3498  ret = 0;
3499  goto out;
3500  }
3501 
3502  if( strcmp(argv[i], "--mg-generate-all-levels") == 0){
3503  if (i+1 >= argc){
3504  usage(argv);
3505  }
3506 
3507  if (strcmp(argv[i+1], "true") == 0){
3508  generate_all_levels = true;
3509  }else if (strcmp(argv[i+1], "false") == 0){
3510  generate_all_levels = false;
3511  }else{
3512  fprintf(stderr, "ERROR: invalid value for generate_all_levels type\n");
3513  exit(1);
3514  }
3515 
3516  i++;
3517  ret = 0;
3518  goto out;
3519  }
3520 
3521  if( strcmp(argv[i], "--mg-load-vec") == 0){
3522  if (i + 2 >= argc) { usage(argv); }
3523  int level = atoi(argv[i + 1]);
3524  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3525  printf("ERROR: invalid multigrid level %d", level);
3526  usage(argv);
3527  }
3528  i++;
3529 
3530  strcpy(mg_vec_infile[level], argv[i + 1]);
3531  i++;
3532  ret = 0;
3533  goto out;
3534  }
3535 
3536  if( strcmp(argv[i], "--mg-save-vec") == 0){
3537  if (i+1 >= argc){
3538  usage(argv);
3539  }
3540  int level = atoi(argv[i + 1]);
3541  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3542  printf("ERROR: invalid multigrid level %d", level);
3543  usage(argv);
3544  }
3545  i++;
3546  strcpy(mg_vec_outfile[level], argv[i + 1]);
3547  i++;
3548  ret = 0;
3549  goto out;
3550  }
3551 
3552  if( strcmp(argv[i], "--df-nev") == 0){
3553  if (i+1 >= argc){
3554  usage(argv);
3555  }
3556 
3557  nev = atoi(argv[i+1]);
3558  i++;
3559  ret = 0;
3560  goto out;
3561  }
3562 
3563  if( strcmp(argv[i], "--df-max-search-dim") == 0){
3564  if (i+1 >= argc){
3565  usage(argv);
3566  }
3567 
3568  max_search_dim = atoi(argv[i+1]);
3569  i++;
3570  ret = 0;
3571  goto out;
3572  }
3573 
3574  if( strcmp(argv[i], "--df-deflation-grid") == 0){
3575  if (i+1 >= argc){
3576  usage(argv);
3577  }
3578 
3579  deflation_grid = atoi(argv[i+1]);
3580  i++;
3581  ret = 0;
3582  goto out;
3583  }
3584 
3585 
3586  if( strcmp(argv[i], "--df-eigcg-max-restarts") == 0){
3587  if (i+1 >= argc){
3588  usage(argv);
3589  }
3590 
3591  eigcg_max_restarts = atoi(argv[i+1]);
3592  i++;
3593  ret = 0;
3594  goto out;
3595  }
3596 
3597  if( strcmp(argv[i], "--df-max-restart-num") == 0){
3598  if (i+1 >= argc){
3599  usage(argv);
3600  }
3601 
3602  max_restart_num = atoi(argv[i+1]);
3603  i++;
3604  ret = 0;
3605  goto out;
3606  }
3607 
3608  if( strcmp(argv[i], "--df-tol-restart") == 0){
3609  if (i+1 >= argc){
3610  usage(argv);
3611  }
3612 
3613  tol_restart = atof(argv[i+1]);
3614  i++;
3615  ret = 0;
3616  goto out;
3617  }
3618 
3619  if( strcmp(argv[i], "--df-tol-eigenval") == 0){
3620  if (i+1 >= argc){
3621  usage(argv);
3622  }
3623 
3624  eigenval_tol = atof(argv[i+1]);
3625  i++;
3626  ret = 0;
3627  goto out;
3628  }
3629 
3630  if( strcmp(argv[i], "--df-tol-inc") == 0){
3631  if (i+1 >= argc){
3632  usage(argv);
3633  }
3634 
3635  inc_tol = atof(argv[i+1]);
3636  i++;
3637  ret = 0;
3638  goto out;
3639  }
3640 
3641  if( strcmp(argv[i], "--solver-ext-lib-type") == 0){
3642  if (i+1 >= argc){
3643  usage(argv);
3644  }
3646  i++;
3647  ret = 0;
3648  goto out;
3649  }
3650 
3651  if( strcmp(argv[i], "--df-ext-lib-type") == 0){
3652  if (i+1 >= argc){
3653  usage(argv);
3654  }
3656  i++;
3657  ret = 0;
3658  goto out;
3659  }
3660 
3661  if( strcmp(argv[i], "--df-location-ritz") == 0){
3662  if (i+1 >= argc){
3663  usage(argv);
3664  }
3665  location_ritz = get_location(argv[i+1]);
3666  i++;
3667  ret = 0;
3668  goto out;
3669  }
3670 
3671  if( strcmp(argv[i], "--df-mem-type-ritz") == 0){
3672  if (i+1 >= argc){
3673  usage(argv);
3674  }
3675  mem_type_ritz = get_df_mem_type_ritz(argv[i+1]);
3676  i++;
3677  ret = 0;
3678  goto out;
3679  }
3680 
3681  if (strcmp(argv[i], "--eig-nEv") == 0) {
3682  if (i + 1 >= argc) { usage(argv); }
3683  eig_nEv = atoi(argv[i + 1]);
3684  i++;
3685  ret = 0;
3686  goto out;
3687  }
3688 
3689  if (strcmp(argv[i], "--eig-nKr") == 0) {
3690  if (i + 1 >= argc) { usage(argv); }
3691  eig_nKr = atoi(argv[i + 1]);
3692  i++;
3693  ret = 0;
3694  goto out;
3695  }
3696 
3697  if (strcmp(argv[i], "--eig-nConv") == 0) {
3698  if (i + 1 >= argc) { usage(argv); }
3699  eig_nConv = atoi(argv[i + 1]);
3700  i++;
3701  ret = 0;
3702  goto out;
3703  }
3704 
3705  if (strcmp(argv[i], "--eig-require-convergence") == 0) {
3706  if (i + 1 >= argc) { usage(argv); }
3707 
3708  if (strcmp(argv[i + 1], "true") == 0) {
3709  eig_require_convergence = true;
3710  } else if (strcmp(argv[i + 1], "false") == 0) {
3711  eig_require_convergence = false;
3712  } else {
3713  fprintf(stderr, "ERROR: invalid value for require-convergence (true/false)\n");
3714  exit(1);
3715  }
3716 
3717  i++;
3718  ret = 0;
3719  goto out;
3720  }
3721 
3722  if (strcmp(argv[i], "--eig-check-interval") == 0) {
3723  if (i + 1 >= argc) { usage(argv); }
3724  eig_check_interval = atoi(argv[i + 1]);
3725  i++;
3726  ret = 0;
3727  goto out;
3728  }
3729 
3730  if (strcmp(argv[i], "--eig-max-restarts") == 0) {
3731  if (i + 1 >= argc) { usage(argv); }
3732  eig_max_restarts = atoi(argv[i + 1]);
3733  i++;
3734  ret = 0;
3735  goto out;
3736  }
3737 
3738  if (strcmp(argv[i], "--eig-tol") == 0) {
3739  if (i + 1 >= argc) { usage(argv); }
3740  eig_tol = atof(argv[i + 1]);
3741  i++;
3742  ret = 0;
3743  goto out;
3744  }
3745 
3746  if (strcmp(argv[i], "--eig-use-poly-acc") == 0) {
3747  if (i + 1 >= argc) { usage(argv); }
3748 
3749  if (strcmp(argv[i + 1], "true") == 0) {
3750  eig_use_poly_acc = true;
3751  } else if (strcmp(argv[i + 1], "false") == 0) {
3752  eig_use_poly_acc = false;
3753  } else {
3754  fprintf(stderr, "ERROR: invalid value for use-poly-acc (true/false)\n");
3755  exit(1);
3756  }
3757 
3758  i++;
3759  ret = 0;
3760  goto out;
3761  }
3762 
3763  if (strcmp(argv[i], "--eig-poly-deg") == 0) {
3764  if (i + 1 >= argc) { usage(argv); }
3765  eig_poly_deg = atoi(argv[i + 1]);
3766  i++;
3767  ret = 0;
3768  goto out;
3769  }
3770 
3771  if (strcmp(argv[i], "--eig-amin") == 0) {
3772  if (i + 1 >= argc) { usage(argv); }
3773  eig_amin = atof(argv[i + 1]);
3774  i++;
3775  ret = 0;
3776  goto out;
3777  }
3778 
3779  if (strcmp(argv[i], "--eig-amax") == 0) {
3780  if (i + 1 >= argc) { usage(argv); }
3781  eig_amax = atof(argv[i + 1]);
3782  i++;
3783  ret = 0;
3784  goto out;
3785  }
3786 
3787  if (strcmp(argv[i], "--eig-use-normop") == 0) {
3788  if (i + 1 >= argc) { usage(argv); }
3789 
3790  if (strcmp(argv[i + 1], "true") == 0) {
3791  eig_use_normop = true;
3792  } else if (strcmp(argv[i + 1], "false") == 0) {
3793  eig_use_normop = false;
3794  } else {
3795  fprintf(stderr, "ERROR: invalid value for eig-use-normop (true/false)\n");
3796  exit(1);
3797  }
3798 
3799  i++;
3800  ret = 0;
3801  goto out;
3802  }
3803 
3804  if (strcmp(argv[i], "--eig-use-dagger") == 0) {
3805  if (i + 1 >= argc) { usage(argv); }
3806 
3807  if (strcmp(argv[i + 1], "true") == 0) {
3808  eig_use_dagger = true;
3809  } else if (strcmp(argv[i + 1], "false") == 0) {
3810  eig_use_dagger = false;
3811  } else {
3812  fprintf(stderr, "ERROR: invalid value for eig-use-dagger (true/false)\n");
3813  exit(1);
3814  }
3815 
3816  i++;
3817  ret = 0;
3818  goto out;
3819  }
3820 
3821  if (strcmp(argv[i], "--eig-compute-svd") == 0) {
3822  if (i + 1 >= argc) { usage(argv); }
3823 
3824  if (strcmp(argv[i + 1], "true") == 0) {
3825  eig_compute_svd = true;
3826  } else if (strcmp(argv[i + 1], "false") == 0) {
3827  eig_compute_svd = false;
3828  } else {
3829  fprintf(stderr, "ERROR: invalid value for eig-compute-svd (true/false)\n");
3830  exit(1);
3831  }
3832 
3833  i++;
3834  ret = 0;
3835  goto out;
3836  }
3837 
3838  if (strcmp(argv[i], "--eig-spectrum") == 0) {
3839  if (i + 1 >= argc) { usage(argv); }
3840  eig_spectrum = get_eig_spectrum_type(argv[i + 1]);
3841  i++;
3842  ret = 0;
3843  goto out;
3844  }
3845 
3846  if (strcmp(argv[i], "--eig-type") == 0) {
3847  if (i + 1 >= argc) { usage(argv); }
3848  eig_type = get_eig_type(argv[i + 1]);
3849  i++;
3850  ret = 0;
3851  goto out;
3852  }
3853 
3854  if (strcmp(argv[i], "--eig-ARPACK-logfile") == 0) {
3855  if (i + 1 >= argc) { usage(argv); }
3856  strcpy(eig_arpack_logfile, argv[i + 1]);
3857  i++;
3858  ret = 0;
3859  goto out;
3860  }
3861 
3862  if (strcmp(argv[i], "--eig-QUDA-logfile") == 0) {
3863  if (i + 1 >= argc) { usage(argv); }
3864  strcpy(eig_QUDA_logfile, argv[i + 1]);
3865  i++;
3866  ret = 0;
3867  goto out;
3868  }
3869 
3870  if (strcmp(argv[i], "--eig-arpack-check") == 0) {
3871  if (i + 1 >= argc) { usage(argv); }
3872 
3873  if (strcmp(argv[i + 1], "true") == 0) {
3874  eig_arpack_check = true;
3875  } else if (strcmp(argv[i + 1], "false") == 0) {
3876  eig_arpack_check = false;
3877  } else {
3878  fprintf(stderr, "ERROR: invalid value for arpack-check (true/false)\n");
3879  exit(1);
3880  }
3881 
3882  i++;
3883  ret = 0;
3884  goto out;
3885  }
3886 
3887  if (strcmp(argv[i], "--eig-load-vec") == 0) {
3888  if (i + 1 >= argc) { usage(argv); }
3889  strcpy(eig_vec_infile, argv[i + 1]);
3890  i++;
3891  ret = 0;
3892  goto out;
3893  }
3894 
3895  if (strcmp(argv[i], "--eig-save-vec") == 0) {
3896  if (i + 1 >= argc) { usage(argv); }
3897  strcpy(eig_vec_outfile, argv[i + 1]);
3898  i++;
3899  ret = 0;
3900  goto out;
3901  }
3902 
3903  if (strcmp(argv[i], "--mg-eig") == 0) {
3904  if (i + 1 >= argc) { usage(argv); }
3905  int level = atoi(argv[i + 1]);
3906  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3907  printf("ERROR: invalid multigrid level %d", level);
3908  usage(argv);
3909  }
3910  i++;
3911 
3912  if (strcmp(argv[i + 1], "true") == 0) {
3913  mg_eig[level] = true;
3914  } else if (strcmp(argv[i + 1], "false") == 0) {
3915  mg_eig[level] = false;
3916  } else {
3917  fprintf(stderr, "ERROR: invalid value for mg-eig %d (true/false)\n", level);
3918  exit(1);
3919  }
3920 
3921  i++;
3922  ret = 0;
3923  goto out;
3924  }
3925 
3926  if (strcmp(argv[i], "--mg-eig-nEv") == 0) {
3927  if (i + 1 >= argc) { usage(argv); }
3928  int level = atoi(argv[i + 1]);
3929  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3930  printf("ERROR: invalid multigrid level %d", level);
3931  usage(argv);
3932  }
3933  i++;
3934 
3935  mg_eig_nEv[level] = atoi(argv[i + 1]);
3936 
3937  i++;
3938  ret = 0;
3939  goto out;
3940  }
3941 
3942  if (strcmp(argv[i], "--mg-eig-nKr") == 0) {
3943  if (i + 1 >= argc) { usage(argv); }
3944  int level = atoi(argv[i + 1]);
3945  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3946  printf("ERROR: invalid multigrid level %d", level);
3947  usage(argv);
3948  }
3949  i++;
3950 
3951  mg_eig_nKr[level] = atoi(argv[i + 1]);
3952 
3953  i++;
3954  ret = 0;
3955  goto out;
3956  }
3957 
3958  if (strcmp(argv[i], "--mg-eig-require-convergence") == 0) {
3959  if (i + 1 >= argc) { usage(argv); }
3960  int level = atoi(argv[i + 1]);
3961  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3962  printf("ERROR: invalid multigrid level %d", level);
3963  usage(argv);
3964  }
3965  i++;
3966 
3967  if (strcmp(argv[i + 1], "true") == 0) {
3968  mg_eig_require_convergence[level] = true;
3969  } else if (strcmp(argv[i + 1], "false") == 0) {
3970  mg_eig_require_convergence[level] = false;
3971  } else {
3972  fprintf(stderr, "ERROR: invalid value for mg-eig-require-convergence %d (true/false)\n", level);
3973  exit(1);
3974  }
3975 
3976  i++;
3977  ret = 0;
3978  goto out;
3979  }
3980 
3981  if (strcmp(argv[i], "--mg-eig-check-interval") == 0) {
3982  if (i + 1 >= argc) { usage(argv); }
3983  int level = atoi(argv[i + 1]);
3984  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
3985  printf("ERROR: invalid multigrid level %d", level);
3986  usage(argv);
3987  }
3988  i++;
3989 
3990  mg_eig_check_interval[level] = atoi(argv[i + 1]);
3991 
3992  i++;
3993  ret = 0;
3994  goto out;
3995  }
3996 
3997  if (strcmp(argv[i], "--mg-eig-max-restarts") == 0) {
3998  if (i + 1 >= argc) { usage(argv); }
3999  int level = atoi(argv[i + 1]);
4000  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4001  printf("ERROR: invalid multigrid level %d", level);
4002  usage(argv);
4003  }
4004  i++;
4005 
4006  mg_eig_max_restarts[level] = atoi(argv[i + 1]);
4007 
4008  i++;
4009  ret = 0;
4010  goto out;
4011  }
4012 
4013  if (strcmp(argv[i], "--mg-eig-tol") == 0) {
4014  if (i + 1 >= argc) { usage(argv); }
4015  int level = atoi(argv[i + 1]);
4016  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4017  printf("ERROR: invalid multigrid level %d", level);
4018  usage(argv);
4019  }
4020  i++;
4021 
4022  mg_eig_tol[level] = atof(argv[i + 1]);
4023 
4024  i++;
4025  ret = 0;
4026  goto out;
4027  }
4028 
4029  if (strcmp(argv[i], "--mg-eig-use-poly-acc") == 0) {
4030  if (i + 1 >= argc) { usage(argv); }
4031  int level = atoi(argv[i + 1]);
4032  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4033  printf("ERROR: invalid multigrid level %d", level);
4034  usage(argv);
4035  }
4036  i++;
4037 
4038  if (strcmp(argv[i + 1], "true") == 0) {
4039  mg_eig_use_poly_acc[level] = true;
4040  } else if (strcmp(argv[i + 1], "false") == 0) {
4041  mg_eig_use_poly_acc[level] = false;
4042  } else {
4043  fprintf(stderr, "ERROR: invalid value for mg-eig-use-poly-acc %d (true/false)\n", level);
4044  exit(1);
4045  }
4046 
4047  i++;
4048  ret = 0;
4049  goto out;
4050  }
4051 
4052  if (strcmp(argv[i], "--mg-eig-poly-deg") == 0) {
4053  if (i + 1 >= argc) { usage(argv); }
4054  int level = atoi(argv[i + 1]);
4055  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4056  printf("ERROR: invalid multigrid level %d", level);
4057  usage(argv);
4058  }
4059  i++;
4060 
4061  mg_eig_poly_deg[level] = atoi(argv[i + 1]);
4062 
4063  i++;
4064  ret = 0;
4065  goto out;
4066  }
4067 
4068  if (strcmp(argv[i], "--mg-eig-amin") == 0) {
4069  if (i + 1 >= argc) { usage(argv); }
4070  int level = atoi(argv[i + 1]);
4071  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4072  printf("ERROR: invalid multigrid level %d", level);
4073  usage(argv);
4074  }
4075  i++;
4076 
4077  mg_eig_amin[level] = atof(argv[i + 1]);
4078 
4079  i++;
4080  ret = 0;
4081  goto out;
4082  }
4083 
4084  if (strcmp(argv[i], "--mg-eig-amax") == 0) {
4085  if (i + 1 >= argc) { usage(argv); }
4086  int level = atoi(argv[i + 1]);
4087  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4088  printf("ERROR: invalid multigrid level %d", level);
4089  usage(argv);
4090  }
4091  i++;
4092 
4093  mg_eig_amax[level] = atof(argv[i + 1]);
4094 
4095  i++;
4096  ret = 0;
4097  goto out;
4098  }
4099 
4100  if (strcmp(argv[i], "--mg-eig-use-normop") == 0) {
4101  if (i + 1 >= argc) { usage(argv); }
4102  int level = atoi(argv[i + 1]);
4103  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4104  printf("ERROR: invalid multigrid level %d", level);
4105  usage(argv);
4106  }
4107  i++;
4108 
4109  if (strcmp(argv[i + 1], "true") == 0) {
4110  mg_eig_use_normop[level] = true;
4111  } else if (strcmp(argv[i + 1], "false") == 0) {
4112  mg_eig_use_normop[level] = false;
4113  } else {
4114  fprintf(stderr, "ERROR: invalid value for mg-eig-use-normop (true/false)\n");
4115  exit(1);
4116  }
4117 
4118  i++;
4119  ret = 0;
4120  goto out;
4121  }
4122 
4123  if (strcmp(argv[i], "--mg-eig-use-dagger") == 0) {
4124  if (i + 1 >= argc) { usage(argv); }
4125  int level = atoi(argv[i + 1]);
4126  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4127  printf("ERROR: invalid multigrid level %d", level);
4128  usage(argv);
4129  }
4130  i++;
4131 
4132  if (strcmp(argv[i + 1], "true") == 0) {
4133  mg_eig_use_dagger[level] = true;
4134  } else if (strcmp(argv[i + 1], "false") == 0) {
4135  mg_eig_use_dagger[level] = false;
4136  } else {
4137  fprintf(stderr, "ERROR: invalid value for mg-eig-use-dagger (true/false)\n");
4138  exit(1);
4139  }
4140 
4141  i++;
4142  ret = 0;
4143  goto out;
4144  }
4145 
4146  if (strcmp(argv[i], "--mg-eig-spectrum") == 0) {
4147  if (i + 1 >= argc) { usage(argv); }
4148  int level = atoi(argv[i + 1]);
4149  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4150  printf("ERROR: invalid multigrid level %d", level);
4151  usage(argv);
4152  }
4153  i++;
4154 
4155  mg_eig_spectrum[level] = get_eig_spectrum_type(argv[i + 1]);
4156 
4157  i++;
4158  ret = 0;
4159  goto out;
4160  }
4161 
4162  if (strcmp(argv[i], "--mg-eig-type") == 0) {
4163  if (i + 1 >= argc) { usage(argv); }
4164  int level = atoi(argv[i + 1]);
4165  if (level < 0 || level >= QUDA_MAX_MG_LEVEL) {
4166  printf("ERROR: invalid multigrid level %d", level);
4167  usage(argv);
4168  }
4169  i++;
4170 
4171  mg_eig_type[level] = get_eig_type(argv[i + 1]);
4172 
4173  i++;
4174  ret = 0;
4175  goto out;
4176  }
4177 
4178  if (strcmp(argv[i], "--mg-eig-coarse-guess") == 0) {
4179  if (i + 1 >= argc) { usage(argv); }
4180 
4181  if (strcmp(argv[i + 1], "true") == 0) {
4182  eig_use_poly_acc = true;
4183  } else if (strcmp(argv[i + 1], "false") == 0) {
4184  eig_use_poly_acc = false;
4185  } else {
4186  fprintf(stderr, "ERROR: invalid value for mg-eig-coarse-guess (true/false)\n");
4187  exit(1);
4188  }
4189 
4190  i++;
4191  ret = 0;
4192  goto out;
4193  }
4194 
4195  if( strcmp(argv[i], "--niter") == 0){
4196  if (i+1 >= argc){
4197  usage(argv);
4198  }
4199  niter= atoi(argv[i+1]);
4200  if (niter < 1 || niter > 1e6){
4201  printf("ERROR: invalid number of iterations (%d)\n", niter);
4202  usage(argv);
4203  }
4204  i++;
4205  ret = 0;
4206  goto out;
4207  }
4208 
4209  if( strcmp(argv[i], "--ngcrkrylov") == 0 || strcmp(argv[i], "--ca-basis-size") == 0) {
4210  if (i+1 >= argc){
4211  usage(argv);
4212  }
4213  gcrNkrylov = atoi(argv[i+1]);
4214  if (gcrNkrylov < 1 || gcrNkrylov > 1e6){
4215  printf("ERROR: invalid number of gcrkrylov iterations (%d)\n", gcrNkrylov);
4216  usage(argv);
4217  }
4218  i++;
4219  ret = 0;
4220  goto out;
4221  }
4222 
4223  if( strcmp(argv[i], "--ca-basis-type") == 0){
4224  if (i+1 >= argc){
4225  usage(argv);
4226  }
4227 
4228  if (strcmp(argv[i+1], "power") == 0){
4230  }else if (strcmp(argv[i+1], "chebyshev") == 0 || strcmp(argv[i+1], "cheby") == 0){
4232  }else{
4233  fprintf(stderr, "ERROR: invalid value for basis ca cg type\n");
4234  exit(1);
4235  }
4236 
4237  i++;
4238  ret = 0;
4239  goto out;
4240  }
4241 
4242  if( strcmp(argv[i], "--cheby-basis-eig-min") == 0){
4243  if (i+1 >= argc){
4244  usage(argv);
4245  }
4246  ca_lambda_min = atof(argv[i+1]);
4247  if (ca_lambda_min < 0.0){
4248  printf("ERROR: invalid minimum eigenvalue for CA-CG (%f < 0.0)\n", ca_lambda_min);
4249  usage(argv);
4250  }
4251  i++;
4252  ret = 0;
4253  goto out;
4254  }
4255 
4256  if( strcmp(argv[i], "--cheby-basis-eig-max") == 0){
4257  if (i+1 >= argc){
4258  usage(argv);
4259  }
4260  ca_lambda_max = atof(argv[i+1]);
4261  // negative value induces power iterations to guess
4262  i++;
4263  ret = 0;
4264  goto out;
4265  }
4266 
4267  if( strcmp(argv[i], "--pipeline") == 0){
4268  if (i+1 >= argc){
4269  usage(argv);
4270  }
4271  pipeline = atoi(argv[i+1]);
4272  if (pipeline < 0 || pipeline > 16){
4273  printf("ERROR: invalid pipeline length (%d)\n", pipeline);
4274  usage(argv);
4275  }
4276  i++;
4277  ret = 0;
4278  goto out;
4279  }
4280 
4281  if( strcmp(argv[i], "--solution-pipeline") == 0){
4282  if (i+1 >= argc){
4283  usage(argv);
4284  }
4285  solution_accumulator_pipeline = atoi(argv[i+1]);
4286  if (solution_accumulator_pipeline < 0 || solution_accumulator_pipeline > 16){
4287  printf("ERROR: invalid solution pipeline length (%d)\n", solution_accumulator_pipeline);
4288  usage(argv);
4289  }
4290  i++;
4291  ret = 0;
4292  goto out;
4293  }
4294 
4295  if (strcmp(argv[i], "--gaussian-sigma") == 0) {
4296  if (i + 1 >= argc) { usage(argv); }
4297  gaussian_sigma = atof(argv[i + 1]);
4298  if (gaussian_sigma < 0.0 || gaussian_sigma > 1.0) {
4299  printf("ERROR: invalid sigma (%f)\n", gaussian_sigma);
4300  usage(argv);
4301  }
4302  i++;
4303  ret = 0;
4304  goto out;
4305  }
4306 
4307  if( strcmp(argv[i], "--heatbath-beta") == 0){
4308  if (i+1 >= argc){
4309  usage(argv);
4310  }
4311  heatbath_beta_value = atof(argv[i+1]);
4312  if (heatbath_beta_value <= 0.0){
4313  printf("ERROR: invalid beta (%f)\n", heatbath_beta_value);
4314  usage(argv);
4315  }
4316  i++;
4317  ret = 0;
4318  goto out;
4319  }
4320 
4321  if( strcmp(argv[i], "--heatbath-warmup-steps") == 0){
4322  if (i+1 >= argc){
4323  usage(argv);
4324  }
4325  heatbath_warmup_steps = atoi(argv[i+1]);
4326  if (heatbath_warmup_steps < 0){
4327  printf("ERROR: invalid number of warmup steps (%d)\n", heatbath_warmup_steps);
4328  usage(argv);
4329  }
4330  i++;
4331  ret = 0;
4332  goto out;
4333  }
4334 
4335  if( strcmp(argv[i], "--heatbath-num-steps") == 0){
4336  if (i+1 >= argc){
4337  usage(argv);
4338  }
4339  heatbath_num_steps = atoi(argv[i+1]);
4340  if (heatbath_num_steps < 0){
4341  printf("ERROR: invalid number of heatbath steps (%d)\n", heatbath_num_steps);
4342  usage(argv);
4343  }
4344  i++;
4345  ret = 0;
4346  goto out;
4347  }
4348 
4349  if( strcmp(argv[i], "--heatbath-num-hb-per-step") == 0){
4350  if (i+1 >= argc){
4351  usage(argv);
4352  }
4353  heatbath_num_heatbath_per_step = atoi(argv[i+1]);
4355  printf("ERROR: invalid number of heatbath hits per step (%d)\n", heatbath_num_heatbath_per_step);
4356  usage(argv);
4357  }
4358  i++;
4359  ret = 0;
4360  goto out;
4361  }
4362 
4363  if( strcmp(argv[i], "--heatbath-num-or-per-step") == 0){
4364  if (i+1 >= argc){
4365  usage(argv);
4366  }
4367  heatbath_num_overrelax_per_step = atoi(argv[i+1]);
4369  printf("ERROR: invalid number of overrelaxation hits per step (%d)\n", heatbath_num_overrelax_per_step);
4370  usage(argv);
4371  }
4372  i++;
4373  ret = 0;
4374  goto out;
4375  }
4376 
4377  if( strcmp(argv[i], "--heatbath-coldstart") == 0){
4378  if (i+1 >= argc){
4379  usage(argv);
4380  }
4381 
4382  if (strcmp(argv[i+1], "true") == 0){
4383  heatbath_coldstart = true;
4384  }else if (strcmp(argv[i+1], "false") == 0){
4385  heatbath_coldstart = false;
4386  }else{
4387  fprintf(stderr, "ERROR: invalid value for heatbath_coldstart type\n");
4388  usage(argv);
4389  }
4390 
4391  i++;
4392  ret = 0;
4393  goto out;
4394  }
4395 
4396  if (strcmp(argv[i], "--contract-type") == 0) {
4397  if (i + 1 >= argc) { usage(argv); }
4398  contract_type = get_contract_type(argv[i + 1]);
4399  i++;
4400  ret = 0;
4401  goto out;
4402  }
4403 
4404  if( strcmp(argv[i], "--version") == 0){
4405  printf("This program is linked with QUDA library, version %s,", get_quda_ver_str());
4406  printf(" %s GPU build\n", msg);
4407  exit(0);
4408  }
4409 
4410  out:
4411  *idx = i;
4412  return ret ;
4413 
4414 }
4415 
4416 
4417 static struct timeval startTime;
4418 
4420  gettimeofday(&startTime, NULL);
4421 }
4422 
4424  struct timeval endTime;
4425  gettimeofday(&endTime, NULL);
4426 
4427  long ds = endTime.tv_sec - startTime.tv_sec;
4428  long dus = endTime.tv_usec - startTime.tv_usec;
4429  return ds + 0.000001*dus;
4430 }
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
Definition: test_util.cpp:231
QudaPrecision prec_precondition
Definition: test_util.cpp:1611
QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1697
bool alternative_reliable
Definition: test_util.cpp:1659
static void orthogonalize(complex< Float > *a, complex< Float > *b, int len)
Definition: test_util.cpp:898
static size_t gSize
bool eig_arpack_check
Definition: test_util.cpp:1739
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:747
int eig_max_restarts
Definition: test_util.cpp:1728
enum QudaMassNormalization_s QudaMassNormalization
double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1686
bool eig_use_dagger
Definition: test_util.cpp:1735
double anisotropy
Definition: quda.h:38
double heatbath_beta_value
Definition: test_util.cpp:1765
bool generate_all_levels
Definition: test_util.cpp:1702
int Vh_ex
Definition: test_util.cpp:36
void get_gridsize_from_env(int *const dims)
Definition: test_util.cpp:51
QudaSolutionType get_solution_type(char *s)
Definition: misc.cpp:1202
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
char eig_vec_infile[256]
Definition: test_util.cpp:1742
static void sum(Float *dst, Float *a, Float *b, int cnt)
Definition: dslash_util.h:8
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1047
double eig_tol
Definition: test_util.cpp:1729
static void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
Definition: test_util.cpp:865
int compareLink(Float **linkA, Float **linkB, int len)
Definition: test_util.cpp:1361
enum QudaPrecision_s QudaPrecision
char mg_vec_outfile[QUDA_MAX_MG_LEVEL][256]
Definition: test_util.cpp:1639
int heatbath_num_heatbath_per_step
Definition: test_util.cpp:1768
QudaPrecision prec_refinement_sloppy
Definition: test_util.cpp:1610
_EXTERN_C_ int MPI_Init(int *argc, char ***argv)
Definition: nvtx_pmpi.c:42
QudaTwistFlavorType get_flavor_type(char *s)
Definition: misc.cpp:1243
int max_restart_num
Definition: test_util.cpp:1713
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:187
static void printVector(Float *v)
Definition: test_util.cpp:218
QudaPrecision prec_null
Definition: test_util.cpp:1612
QudaVerbosity verbosity
Definition: test_util.cpp:1614
int getOddBit(int Y)
Definition: test_util.cpp:247
int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]
Definition: test_util.cpp:1706
QudaGaugeFixed gauge_fix
Definition: quda.h:61
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
int Vs_z
Definition: test_util.cpp:29
QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1691
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
Definition: test_util.cpp:298
QudaEigType mg_eig_type[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1762
int zdim
Definition: test_util.cpp:1617
int heatbath_warmup_steps
Definition: test_util.cpp:1766
int setup_maxiter[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1681
#define TUP
Definition: test_util.cpp:23
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
QudaLinkType type
Definition: quda.h:42
int niter
Definition: test_util.cpp:1629
static int X2
Definition: face_gauge.cpp:42
int mySpinorSiteSize
Definition: test_util.cpp:42
int heatbath_num_steps
Definition: test_util.cpp:1767
#define errorQuda(...)
Definition: util_quda.h:121
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
Definition: test_util.cpp:223
#define ZUP
Definition: test_util.cpp:22
void createHwCPU(void *hw, QudaPrecision precision)
Definition: test_util.cpp:1487
double eigenval_tol
Definition: test_util.cpp:1715
bool mg_eig_use_normop[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1759
int multishift
Definition: test_util.cpp:1642
QudaSetupType setup_type
Definition: test_util.cpp:1687
int nvec[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1637
void createMomCPU(void *mom, QudaPrecision precision)
Definition: test_util.cpp:1451
int device
Definition: test_util.cpp:1602
enum QudaSolveType_s QudaSolveType
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
void commDimPartitionedSet(int dir)
int commCoords(int)
int V_ex
Definition: test_util.cpp:36
int Vs_y
Definition: test_util.cpp:29
char eig_vec_outfile[256]
Definition: test_util.cpp:1743
double ca_lambda_max
Definition: test_util.cpp:1633
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
int mg_levels
Definition: test_util.cpp:1666
static int rank
Definition: comm_mpi.cpp:44
void complexDotProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:278
bool compute_fatlong
Definition: test_util.cpp:1655
QudaFieldLocation location_ritz
Definition: test_util.cpp:1719
int mg_eig_nEv[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1749
double anisotropy
Definition: test_util.cpp:1650
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
Definition: test_util.cpp:687
static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
Definition: test_util.cpp:708
int pipeline
Definition: test_util.cpp:1634
QudaPrecision smoother_halo_prec
Definition: test_util.cpp:1694
int Msrc
Definition: test_util.cpp:1628
STL namespace.
void finalizeComms()
Definition: test_util.cpp:128
QudaContractType contract_type
Definition: test_util.cpp:1772
static void applyStaggeredScaling(Float **res, QudaGaugeParam *param, int type)
Definition: test_util.cpp:1039
int tdim
Definition: test_util.cpp:1618
static struct timeval startTime
Definition: test_util.cpp:4417
void su3Construct8(Float *mat)
Definition: test_util.cpp:316
double setup_tol[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1680
char mg_vec_infile[QUDA_MAX_MG_LEVEL][256]
Definition: test_util.cpp:1638
void stopwatchStart()
Definition: test_util.cpp:4419
void usage_extra(char **argv)
void constructUnitaryGaugeField(Float **res)
Definition: test_util.cpp:982
int gridsize_from_cmdline[4]
Definition: test_util.cpp:49
int E1
Definition: test_util.cpp:34
double eig_amax
Definition: test_util.cpp:1733
int ydim
Definition: test_util.cpp:1616
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:434
int laplace3D
Definition: test_util.cpp:1622
int get_rank_order(char *s)
Definition: misc.cpp:860
enum QudaEigType_s QudaEigType
void construct_spinor_source(void *v, int nSpin, int nColor, QudaPrecision precision, const int *const x, quda::RNG &rng)
Definition: test_util.cpp:1342
double mu_factor[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1674
bool eig_use_normop
Definition: test_util.cpp:1734
static int compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
Definition: test_util.cpp:1399
QudaVerbosity mg_verbosity[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1675
QudaReconstructType get_recon(char *s)
Definition: misc.cpp:653
QudaInverterType precon_type
Definition: test_util.cpp:1641
int Nsrc
Definition: test_util.cpp:1627
bool mg_eig[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1748
QudaFieldLocation get_location(char *s)
Definition: misc.cpp:1475
static void printMomElement(void *mom, int X, QudaPrecision precision)
Definition: test_util.cpp:1547
bool mg_eig_require_convergence[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1751
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1227
int Vsh_y
Definition: test_util.cpp:30
char eig_QUDA_logfile[256]
Definition: test_util.cpp:1741
std::map< TuneKey, TuneParam > map
Definition: tune.cpp:28
QudaSiteSubset siteSubset
Definition: lattice_field.h:71
int gcrNkrylov
Definition: test_util.cpp:1630
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1678
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
QudaSchwarzType get_schwarz_type(char *s)
Definition: misc.cpp:1224
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: test_util.cpp:439
int Ls
Definition: test_util.cpp:38
QudaSolveType get_solve_type(char *s)
Definition: misc.cpp:1146
QudaGaugeParam param
Definition: pack_test.cpp:17
QudaReconstructType link_recon_precondition
Definition: test_util.cpp:1607
bool last_node_in_t()
Definition: test_util.cpp:118
double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1699
int Vsh_t
Definition: test_util.cpp:30
QudaExtLibType get_solve_ext_lib_type(char *s)
Definition: misc.cpp:1458
QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1683
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
bool low_mode_check
Definition: test_util.cpp:1644
int compare_mom(Float *momA, Float *momB, int len)
Definition: test_util.cpp:1512
double stopwatchReadSeconds()
Definition: test_util.cpp:4423
int xdim
Definition: test_util.cpp:1615
int Vs_x
Definition: test_util.cpp:29
double tol
Definition: test_util.cpp:1656
QudaInverterType get_solver_type(char *s)
Definition: misc.cpp:1290
double mass
Definition: test_util.cpp:1646
double mg_eig_tol[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1754
static void * hw
int nu_pre[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1671
static int dim_partitioned[4]
Definition: test_util.cpp:1774
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:51
const int nColor
Definition: covdev_test.cpp:75
double coarse_solver_tol[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1692
double eig_amin
Definition: test_util.cpp:1732
double ca_lambda_min
Definition: test_util.cpp:1632
QudaInverterType setup_inv[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1676
void setSpinorSiteSize(int n)
Definition: test_util.cpp:211
int V5h
Definition: test_util.cpp:40
bool generate_nullspace
Definition: test_util.cpp:1701
QudaDagType dagger
Definition: test_util.cpp:1620
double mg_eig_amin[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1757
QudaDslashType get_dslash_type(char *s)
Definition: misc.cpp:877
double mg_eig_amax[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1758
void complexAddTo(Float *a, Float *b)
Definition: test_util.cpp:257
QudaEigSpectrumType get_eig_spectrum_type(char *s)
Definition: misc.cpp:983
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1669
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
Definition: complex_quda.h:76
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
QudaFieldLocation solver_location[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1668
enum QudaMatPCType_s QudaMatPCType
QudaInverterType smoother_type[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1693
static int compareFloats(Float *a, Float *b, int len, double epsilon)
Definition: test_util.cpp:423
double tol_restart
Definition: test_util.cpp:1710
bool eig_require_convergence
Definition: test_util.cpp:1726
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
Definition: test_util.cpp:285
QudaSchwarzType schwarz_type[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1703
int test_type
Definition: test_util.cpp:1636
double clover_coeff
Definition: test_util.cpp:1653
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
enum QudaSolutionType_s QudaSolutionType
QudaReconstructType link_recon_sloppy
Definition: test_util.cpp:1606
int heatbath_num_overrelax_per_step
Definition: test_util.cpp:1769
double kappa
Definition: test_util.cpp:1647
static void constructGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type=QUDA_WILSON_DSLASH)
Definition: test_util.cpp:905
bool mg_eig_use_dagger[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1760
enum QudaSchwarzType_s QudaSchwarzType
QudaMatPCType get_matpc_type(char *s)
Definition: misc.cpp:1100
int mg_eig_nKr[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1750
QudaEigSpectrumType eig_spectrum
Definition: test_util.cpp:1737
int eig_nEv
Definition: test_util.cpp:1723
#define YUP
Definition: test_util.cpp:21
QudaPrecision prec_ritz
Definition: test_util.cpp:1613
bool pre_orthonormalize
Definition: test_util.cpp:1688
int max_search_dim
Definition: test_util.cpp:1708
int X[4]
Definition: covdev_test.cpp:70
int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1698
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
enum QudaDagType_s QudaDagType
QudaCABasis ca_basis
Definition: test_util.cpp:1631
void su3Construct12(Float *mat)
Definition: test_util.cpp:304
static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
Definition: test_util.cpp:354
void __attribute__((weak)) usage_extra(char **argv)
Definition: test_util.cpp:1781
int mg_eig_check_interval[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1752
int X[4]
Definition: quda.h:36
void complexProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:264
double eps_naik
Definition: test_util.cpp:1652
QudaEigType get_eig_type(char *s)
Definition: misc.cpp:1025
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:488
static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
Definition: test_util.cpp:337
int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1684
int Z[4]
Definition: test_util.cpp:26
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
Definition: test_util.cpp:1559
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:1220
static void printLinkElement(void *link, int X, QudaPrecision precision)
Definition: test_util.cpp:1414
QudaTwistFlavorType twist_flavor
Definition: test_util.cpp:1660
static int X3
Definition: face_gauge.cpp:42
#define hwSiteSize
Definition: test_util.h:11
int mg_eig_poly_deg[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1756
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
Definition: test_util.cpp:1429
QudaMemoryType get_df_mem_type_ritz(char *s)
Definition: misc.cpp:1505
static int X1
Definition: face_gauge.cpp:42
int nu_post[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1672
static int dims[4]
Definition: face_gauge.cpp:41
bool post_orthonormalize
Definition: test_util.cpp:1689
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:523
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1062
static int index(int ndim, const int *dims, const int *x)
Definition: comm_common.cpp:32
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
Definition: test_util.cpp:1167
bool eig_compute_svd
Definition: test_util.cpp:1736
double tol_hq
Definition: test_util.cpp:1657
QudaDslashType dslash_type
Definition: test_util.cpp:1621
QudaExtLibType solver_ext_lib
Definition: test_util.cpp:1717
int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1696
double inc_tol
Definition: test_util.cpp:1714
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:601
enum QudaFieldLocation_s QudaFieldLocation
double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1685
int eigcg_max_restarts
Definition: test_util.cpp:1712
bool compute_clover
Definition: test_util.cpp:1654
int fullLatticeIndex_5d(int i, int oddBit)
Definition: test_util.cpp:682
bool oblique_proj_check
Definition: test_util.cpp:1645
double tadpole_coeff
Definition: quda.h:39
static int commDim[QUDA_MAX_DIM]
Definition: dslash_pack.cuh:9
cpuColorSpinorField * out
enum QudaEigSpectrumType_s QudaEigSpectrumType
int solution_accumulator_pipeline
Definition: test_util.cpp:1635
void setDims(int *X)
Definition: test_util.cpp:151
int E[4]
Definition: test_util.cpp:35
double gaussian_sigma
Definition: test_util.cpp:1625
QudaExtLibType deflation_ext_lib
Definition: test_util.cpp:1718
enum QudaReconstructType_s QudaReconstructType
void initRand()
Definition: test_util.cpp:138
QudaSolutionType solution_type
Definition: test_util.cpp:1664
int eig_poly_deg
Definition: test_util.cpp:1731
int E2
Definition: test_util.cpp:34
enum QudaCABasis_s QudaCABasis
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
__shared__ float s[]
enum QudaSetupType_s QudaSetupType
QudaEigType eig_type
Definition: test_util.cpp:1738
int fullLatticeIndex_4d(int i, int oddBit)
Definition: test_util.cpp:648
void usage(char **argv)
Definition: test_util.cpp:1783
double reliable_delta
Definition: test_util.cpp:1658
#define printfQuda(...)
Definition: util_quda.h:115
bool mg_eig_coarse_guess
Definition: test_util.cpp:1763
QudaTboundary t_boundary
Definition: quda.h:45
int x4_from_full_index(int i)
Definition: test_util.cpp:692
QudaSolveType coarse_solve_type[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1677
QudaMassNormalization get_mass_normalization_type(char *s)
Definition: misc.cpp:1058
static void normalize(complex< Float > *a, int len)
Definition: test_util.cpp:890
int n_block_ortho[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1673
int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1682
char eig_arpack_logfile[256]
Definition: test_util.cpp:1740
QudaInverterType inv_type
Definition: test_util.cpp:1640
bool verify_results
Definition: test_util.cpp:1643
int eig_nKr
Definition: test_util.cpp:1724
QudaMassNormalization normalization
Definition: test_util.cpp:1661
int V5
Definition: test_util.cpp:39
enum QudaDslashType_s QudaDslashType
int Lsdim
Definition: test_util.cpp:1619
QudaPrecision prec
Definition: test_util.cpp:1608
int faceVolume[4]
Definition: test_util.cpp:31
QudaSolveType solve_type
Definition: test_util.cpp:1663
char gauge_outfile[256]
Definition: test_util.cpp:1626
int Vsh_z
Definition: test_util.cpp:30
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:292
static int lex_rank_from_coords_t(const int *coords, void *fdata)
Definition: test_util.cpp:68
enum QudaContractType_s QudaContractType
bool unit_gauge
Definition: test_util.cpp:1624
void * longlink
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:46
__host__ __device__ complex< ValueType > polar(const ValueType &m, const ValueType &theta=0)
Returns the complex with magnitude m and angle theta in radians.
int Vs_t
Definition: test_util.cpp:29
double epsilon
Definition: test_util.cpp:1649
double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1700
int schwarz_cycle[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1704
enum QudaVerbosity_s QudaVerbosity
int mg_eig_max_restarts[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1753
void * fatlink
QudaMatPCType matpc_type
Definition: test_util.cpp:1662
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
bool eig_use_poly_acc
Definition: test_util.cpp:1730
QudaMemoryType mem_type_ritz
Definition: test_util.cpp:1720
QudaEigSpectrumType mg_eig_spectrum[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1761
int E4
Definition: test_util.cpp:34
static void constructCloverField(Float *res, double norm, double diag)
Definition: test_util.cpp:1138
bool heatbath_coldstart
Definition: test_util.cpp:1770
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
QudaVerbosity get_verbosity_type(char *s)
Definition: misc.cpp:606
static int lex_rank_from_coords_x(const int *coords, void *fdata)
Definition: test_util.cpp:77
char latfile[256]
Definition: test_util.cpp:1623
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
double omega
Definition: test_util.cpp:1690
int E3
Definition: test_util.cpp:34
double tadpole_factor
Definition: test_util.cpp:1651
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
Definition: test_util.cpp:322
const char * get_quda_ver_str()
Definition: misc.cpp:1443
QudaContractType get_contract_type(char *s)
Definition: misc.cpp:955
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:412
QudaPrecision prec_sloppy
Definition: test_util.cpp:1609
int eig_check_interval
Definition: test_util.cpp:1727
double mu
Definition: test_util.cpp:1648
float fat_link_max
int Vh
Definition: test_util.cpp:28
static int rank_order
Definition: test_util.cpp:86
enum QudaInverterType_s QudaInverterType
QudaPrecision get_prec(QIO_Reader *infile)
Definition: qio_field.cpp:69
enum QudaMemoryType_s QudaMemoryType
cpuColorSpinorField * spinor
Definition: covdev_test.cpp:41
int num_setup_iter[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1679
static void checkGauge(Float **oldG, Float **newG, double epsilon)
Definition: test_util.cpp:1186
int V
Definition: test_util.cpp:27
int comm_dim_partitioned(int dim)
enum QudaExtLibType_s QudaExtLibType
void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type)
Generate a random noise spinor. This variant allows the user to manage the RNG state.
int Vsh_x
Definition: test_util.cpp:30
#define XUP
Definition: test_util.cpp:20
void complexConjugateProduct(Float *a, Float *b, Float *c)
Definition: test_util.cpp:271
int E1h
Definition: test_util.cpp:34
int nev
Definition: test_util.cpp:1707
static int X4
Definition: face_gauge.cpp:42
#define gaugeSiteSize
Definition: face_gauge.cpp:34
bool mg_eig_use_poly_acc[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1755
int deflation_grid
Definition: test_util.cpp:1709
double smoother_tol[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1695
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:556
#define momSiteSize
Definition: test_util.h:10
int eig_nConv
Definition: test_util.cpp:1725
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56
QudaReconstructType link_recon
Definition: test_util.cpp:1605
enum QudaTwistFlavorType_s QudaTwistFlavorType