QUDA  v0.7.0
A library for QCD on GPUs
test_util.cpp
1 #include <complex>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <short.h>
6 
7 #if defined(QMP_COMMS)
8 #include <qmp.h>
9 #elif defined(MPI_COMMS)
10 #include <mpi.h>
11 #endif
12 
14 #include <test_util.h>
15 
16 #include <face_quda.h>
17 #include <dslash_quda.h>
18 #include "misc.h"
19 
20 using namespace std;
21 
22 #define XUP 0
23 #define YUP 1
24 #define ZUP 2
25 #define TUP 3
26 
27 
28 int Z[4];
29 int V;
30 int Vh;
31 int Vs_x, Vs_y, Vs_z, Vs_t;
32 int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
33 int faceVolume[4];
34 
35 // extended volume: each dimension padded by 4 (a 2-site border on each side)
36 int E1, E1h, E2, E3, E4;
37 int E[4];
38 int V_ex, Vh_ex;
39 
40 int Ls;
41 int V5;
42 int V5h;
43 
44 int mySpinorSiteSize = 24;
45 
46 extern float fat_link_max;
47 
48 void initComms(int argc, char **argv, const int *commDims)
49 {
50 #if defined(QMP_COMMS)
51  QMP_thread_level_t tl;
52  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
53 
54  // FIXME? - tests crash without this
55  QMP_declare_logical_topology(commDims, 4);
56 
57 #elif defined(MPI_COMMS)
58  MPI_Init(&argc, &argv);
59 #endif
60  initCommsGridQuda(4, commDims, NULL, NULL);
61  initRand();
62 }
63 
64 
65 void finalizeComms()
66 {
67 #if defined(QMP_COMMS)
68  QMP_finalize_msg_passing();
69 #elif defined(MPI_COMMS)
70  MPI_Finalize();
71 #endif
72 }
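// Illustrative driver sketch (hypothetical, not part of this file): a typical QUDA
// test wires the helpers in this file together roughly as follows.
#if 0
int main(int argc, char **argv)
{
  // parse --xgridsize/--ygridsize/... first so gridsize_from_cmdline is filled in
  for (int i = 1; i < argc; i++) {
    if (process_command_line_option(argc, argv, &i) != 0) usage(argv);
  }
  initComms(argc, argv, gridsize_from_cmdline); // starts QMP/MPI and seeds rand()
  // ... set up QudaGaugeParam/QudaInvertParam and run the test ...
  finalizeComms();
  return 0;
}
#endif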
73 
74 
75 void initRand()
76 {
77  int rank = 0;
78 
79 #if defined(QMP_COMMS)
80  rank = QMP_get_node_number();
81 #elif defined(MPI_COMMS)
82  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
83 #endif
84 
85  srand(17*rank + 137);
86 }
87 
88 void setDims(int *X) {
89  V = 1;
90  for (int d=0; d< 4; d++) {
91  V *= X[d];
92  Z[d] = X[d];
93 
94  faceVolume[d] = 1;
95  for (int i=0; i<4; i++) {
96  if (i==d) continue;
97  faceVolume[d] *= X[i];
98  }
99  }
100  Vh = V/2;
101 
102  Vs_x = X[1]*X[2]*X[3];
103  Vs_y = X[0]*X[2]*X[3];
104  Vs_z = X[0]*X[1]*X[3];
105  Vs_t = X[0]*X[1]*X[2];
106 
107  Vsh_x = Vs_x/2;
108  Vsh_y = Vs_y/2;
109  Vsh_z = Vs_z/2;
110  Vsh_t = Vs_t/2;
111 
112 
113  E1=X[0]+4; E2=X[1]+4; E3=X[2]+4; E4=X[3]+4;
114  E1h=E1/2;
115  E[0] = E1;
116  E[1] = E2;
117  E[2] = E3;
118  E[3] = E4;
119  V_ex = E1*E2*E3*E4;
120  Vh_ex = V_ex/2;
121 
122 }
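// Worked example (illustrative): for a local 8x8x8x8 lattice, setDims() produces
// V = 4096, Vh = 2048, faceVolume[d] = 512 for every d, Vs_* = 512 and Vsh_* = 256
// in each direction, and extended dimensions E[d] = 12, hence V_ex = 20736 and
// Vh_ex = 10368.
#if 0
  int X[4] = {8, 8, 8, 8};
  setDims(X);
#endif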
123 
124 
125 void dw_setDims(int *X, const int L5)
126 {
127  V = 1;
128  for (int d=0; d< 4; d++)
129  {
130  V *= X[d];
131  Z[d] = X[d];
132 
133  faceVolume[d] = 1;
134  for (int i=0; i<4; i++) {
135  if (i==d) continue;
136  faceVolume[d] *= X[i];
137  }
138  }
139  Vh = V/2;
140 
141  Ls = L5;
142  V5 = V*Ls;
143  V5h = Vh*Ls;
144 
145  Vs_t = Z[0]*Z[1]*Z[2]*Ls;//?
146  Vsh_t = Vs_t/2; //?
147 }
148 
149 
150 void setSpinorSiteSize(int n)
151 {
152  mySpinorSiteSize = n;
153 }
154 
155 
156 template <typename Float>
157 static void printVector(Float *v) {
158  printfQuda("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
159 }
160 
161 // X indexes the lattice site
162 void printSpinorElement(void *spinor, int X, QudaPrecision precision) {
163  if (precision == QUDA_DOUBLE_PRECISION)
164  for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6);
165  else
166  for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6);
167 }
168 
169 // X indexes the full lattice
170 void printGaugeElement(void *gauge, int X, QudaPrecision precision) {
171  if (getOddBit(X) == 0) {
172  if (precision == QUDA_DOUBLE_PRECISION)
173  for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2);
174  else
175  for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2);
176 
177  } else {
178  if (precision == QUDA_DOUBLE_PRECISION)
179  for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
180  else
181  for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2);
182  }
183 }
184 
185 // returns 0 or 1 if the full lattice index Y is even or odd
186 int getOddBit(int Y) {
187  int x4 = Y/(Z[2]*Z[1]*Z[0]);
188  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
189  int x2 = (Y/Z[0]) % Z[1];
190  int x1 = Y % Z[0];
191  return (x4+x3+x2+x1) % 2;
192 }
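// Worked example (illustrative): with Z = {8,8,8,8}, the site (x1,x2,x3,x4) = (1,2,3,4)
// has full index Y = 4*512 + 3*64 + 2*8 + 1 = 2257, and getOddBit(2257) returns
// (1+2+3+4) % 2 = 0, i.e. the site is even.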
193 
194 // a+=b
195 template <typename Float>
196 inline void complexAddTo(Float *a, Float *b) {
197  a[0] += b[0];
198  a[1] += b[1];
199 }
200 
201 // a = b*c
202 template <typename Float>
203 inline void complexProduct(Float *a, Float *b, Float *c) {
204  a[0] = b[0]*c[0] - b[1]*c[1];
205  a[1] = b[0]*c[1] + b[1]*c[0];
206 }
207 
208 // a = conj(b)*conj(c)
209 template <typename Float>
210 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
211  a[0] = b[0]*c[0] - b[1]*c[1];
212  a[1] = -b[0]*c[1] - b[1]*c[0];
213 }
214 
215 // a = conj(b)*c
216 template <typename Float>
217 inline void complexDotProduct(Float *a, Float *b, Float *c) {
218  a[0] = b[0]*c[0] + b[1]*c[1];
219  a[1] = b[0]*c[1] - b[1]*c[0];
220 }
221 
222 // a += sign*b*c
223 template <typename Float>
224 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
225  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
226  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
227 }
228 
229 // a += conj(b)*c
230 template <typename Float>
231 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) {
232  a[0] += b[0]*c[0] + b[1]*c[1];
233  a[1] += b[0]*c[1] - b[1]*c[0];
234 }
235 
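// a += sign * conj(b)*conj(c) = sign * conj(b*c)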
236 template <typename Float>
237 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
238  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
239  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
240 }
241 
242 template <typename Float>
243 inline void su3Construct12(Float *mat) {
244  Float *w = mat+12;
245  w[0] = 0.0;
246  w[1] = 0.0;
247  w[2] = 0.0;
248  w[3] = 0.0;
249  w[4] = 0.0;
250  w[5] = 0.0;
251 }
252 
253 // Stabilized Bunk and Sommer
254 template <typename Float>
255 inline void su3Construct8(Float *mat) {
256  mat[0] = atan2(mat[1], mat[0]);
257  mat[1] = atan2(mat[13], mat[12]);
258  for (int i=8; i<18; i++) mat[i] = 0.0;
259 }
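// Packing summary (illustrative): after su3Construct8 the eight stored reals are the
// phases of U00 and U20 (kept in mat[0] and mat[1]) plus the complex elements U01, U02
// and U10 (mat[2..7]); the rest is zeroed. su3Reconstruct8 below recovers |U00| from
// unitarity of the first row, |U20| from unitarity of the first column, and the
// remaining 2x2 block from an SU(2) rotation.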
260 
261 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) {
262  if (reconstruct == QUDA_RECONSTRUCT_12) {
263  if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat);
264  else su3Construct12((float*)mat);
265  } else {
266  if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat);
267  else su3Construct8((float*)mat);
268  }
269 }
270 
271 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row
272 // as the cross product of the conjugate vectors: w = u* x v*
273 //
274 // 48 flops
275 template <typename Float>
276 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
277  Float *u = &mat[0*(3*2)];
278  Float *v = &mat[1*(3*2)];
279  Float *w = &mat[2*(3*2)];
280  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
281  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
282  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
283  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
284  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
285  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
286  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
287  Float u0 = (dir < 3 ? param->anisotropy :
288  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
289  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
290 }
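// In components (illustrative): the six accumulateConjugateProduct calls above implement
//   w_i = conj(u_j*v_k - u_k*v_j)  for cyclic (i,j,k) = (0,1,2), (1,2,0), (2,0,1),
// i.e. w = (u x v)*, the unique third row that completes u and v to an SU(3) matrix.
// The final multiplication by u0 (the anisotropy, or the temporal boundary factor on
// the last time slice) keeps the reconstructed row consistent with the rescaling that
// applyGaugeFieldScaling() applied to the two stored rows.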
291 
292 template <typename Float>
293 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) {
294  // First reconstruct first row
295  Float row_sum = 0.0;
296  row_sum += mat[2]*mat[2];
297  row_sum += mat[3]*mat[3];
298  row_sum += mat[4]*mat[4];
299  row_sum += mat[5]*mat[5];
300  Float u0 = (dir < 3 ? param->anisotropy :
301  (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1));
302  Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
303 
304  mat[14] = mat[0];
305  mat[15] = mat[1];
306 
307  mat[0] = U00_mag * cos(mat[14]);
308  mat[1] = U00_mag * sin(mat[14]);
309 
310  Float column_sum = 0.0;
311  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
312  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
313  Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
314 
315  mat[12] = U20_mag * cos(mat[15]);
316  mat[13] = U20_mag * sin(mat[15]);
317 
318  // First column now restored
319 
320  // finally reconstruct last elements from SU(2) rotation
321  Float r_inv2 = 1.0/(u0*row_sum);
322 
323  // U11
324  Float A[2];
325  complexDotProduct(A, mat+0, mat+6);
326  complexConjugateProduct(mat+8, mat+12, mat+4);
327  accumulateComplexProduct(mat+8, A, mat+2, u0);
328  mat[8] *= -r_inv2;
329  mat[9] *= -r_inv2;
330 
331  // U12
332  complexConjugateProduct(mat+10, mat+12, mat+2);
333  accumulateComplexProduct(mat+10, A, mat+4, -u0);
334  mat[10] *= r_inv2;
335  mat[11] *= r_inv2;
336 
337  // U21
338  complexDotProduct(A, mat+0, mat+12);
339  complexConjugateProduct(mat+14, mat+6, mat+4);
340  accumulateComplexProduct(mat+14, A, mat+2, -u0);
341  mat[14] *= r_inv2;
342  mat[15] *= r_inv2;
343 
344  // U12
345  complexConjugateProduct(mat+16, mat+6, mat+2);
346  accumulateComplexProduct(mat+16, A, mat+4, u0);
347  mat[16] *= -r_inv2;
348  mat[17] *= -r_inv2;
349 }
350 
351 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) {
352  if (reconstruct == QUDA_RECONSTRUCT_12) {
353  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param);
354  else su3Reconstruct12((float*)mat, dir, ga_idx, param);
355  } else {
356  if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param);
357  else su3Reconstruct8((float*)mat, dir, ga_idx, param);
358  }
359 }
360 
361 /*
362  void su3_construct_8_half(float *mat, short *mat_half) {
363  su3Construct8(mat);
364 
365  mat_half[0] = floatToShort(mat[0] / M_PI);
366  mat_half[1] = floatToShort(mat[1] / M_PI);
367  for (int i=2; i<18; i++) {
368  mat_half[i] = floatToShort(mat[i]);
369  }
370  }
371 
372  void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) {
373 
374  for (int i=0; i<18; i++) {
375  mat[i] = shortToFloat(mat_half[i]);
376  }
377  mat[0] *= M_PI;
378  mat[1] *= M_PI;
379 
380  su3Reconstruct8(mat, dir, ga_idx, param);
381  }*/
382 
383 template <typename Float>
384 static int compareFloats(Float *a, Float *b, int len, double epsilon) {
385  for (int i = 0; i < len; i++) {
386  double diff = fabs(a[i] - b[i]);
387  if (diff > epsilon) {
388  printfQuda("ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
389  return 0;
390  }
391  }
392  return 1;
393 }
394 
395 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) {
396  if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon);
397  else return compareFloats((float*)a, (float*)b, len, epsilon);
398 }
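// Illustrative usage sketch (hostRef/gpuOut and the tolerances are hypothetical
// stand-ins): compare a CPU reference against data copied back from the device.
#if 0
  double tol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-12 : 1e-5;
  if (!compare_floats(hostRef, gpuOut, Vh*mySpinorSiteSize, tol, prec))
    printfQuda("Result mismatch\n");
#endif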
399 
400 int fullLatticeIndex(int dim[4], int index, int oddBit){
401 
402  int za = index/(dim[0]>>1);
403  int zb = za/dim[1];
404  int x2 = za - zb*dim[1];
405  int x4 = zb/dim[2];
406  int x3 = zb - x4*dim[2];
407 
408  return 2*index + ((x2 + x3 + x4 + oddBit) & 1);
409 }
410 
411 // given a "half index" i into either an even or odd half lattice (corresponding
412 // to oddBit = {0, 1}), returns the corresponding full lattice index.
413 int fullLatticeIndex(int i, int oddBit) {
414  /*
415  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
416  return 2*i + (boundaryCrossings + oddBit) % 2;
417  */
418 
419  int X1 = Z[0];
420  int X2 = Z[1];
421  int X3 = Z[2];
422  //int X4 = Z[3];
423  int X1h =X1/2;
424 
425  int sid =i;
426  int za = sid/X1h;
427  //int x1h = sid - za*X1h;
428  int zb = za/X2;
429  int x2 = za - zb*X2;
430  int x4 = zb/X3;
431  int x3 = zb - x4*X3;
432  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
433  //int x1 = 2*x1h + x1odd;
434  int X = 2*sid + x1odd;
435 
436  return X;
437 }
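// Worked example (illustrative): with Z = {8,8,8,8} and oddBit = 0, half index i = 5
// gives za = 1, x2 = 1, x3 = x4 = 0, so x1odd = 1 and X = 2*5 + 1 = 11; site 11 has
// coordinates (3,1,0,0), whose parity (3+1) % 2 = 0 indeed matches oddBit.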
438 
439 
440 // i represents a "half index" into an even or odd "half lattice".
441 // when oddBit={0,1} the half lattice is {even,odd}.
442 //
443 // the displacements, such as dx, refer to the full lattice coordinates.
444 //
445 // neighborIndex() takes a "half index", displaces it, and returns the
446 // new "half index", which can be an index into either the even or odd lattices.
447 // displacements of magnitude one always interchange odd and even lattices.
448 //
449 
450 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
451  int Y = fullLatticeIndex(i, oddBit);
452  int x4 = Y/(Z[2]*Z[1]*Z[0]);
453  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
454  int x2 = (Y/Z[0]) % Z[1];
455  int x1 = Y % Z[0];
456 
457  // assert (oddBit == (x+y+z+t)%2);
458 
459  x4 = (x4+dx4+Z[3]) % Z[3];
460  x3 = (x3+dx3+Z[2]) % Z[2];
461  x2 = (x2+dx2+Z[1]) % Z[1];
462  x1 = (x1+dx1+Z[0]) % Z[0];
463 
464  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
465 }
466 
467 
468 int neighborIndex(int dim[4], int index, int oddBit, int dx[4]){
469 
470  const int fullIndex = fullLatticeIndex(dim, index, oddBit);
471 
472  int x[4];
473  x[3] = fullIndex/(dim[2]*dim[1]*dim[0]);
474  x[2] = (fullIndex/(dim[1]*dim[0])) % dim[2];
475  x[1] = (fullIndex/dim[0]) % dim[1];
476  x[0] = fullIndex % dim[0];
477 
478  for(int dir=0; dir<4; ++dir)
479  x[dir] = (x[dir]+dx[dir]+dim[dir]) % dim[dir];
480 
481  return (((x[3]*dim[2] + x[2])*dim[1] + x[1])*dim[0] + x[0])/2;
482 }
483 
484 int
485 neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
486 {
487  int ret;
488 
489  int Y = fullLatticeIndex(i, oddBit);
490  int x4 = Y/(Z[2]*Z[1]*Z[0]);
491  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
492  int x2 = (Y/Z[0]) % Z[1];
493  int x1 = Y % Z[0];
494 
495  int ghost_x4 = x4+ dx4;
496 
497  // assert (oddBit == (x+y+z+t)%2);
498 
499  x4 = (x4+dx4+Z[3]) % Z[3];
500  x3 = (x3+dx3+Z[2]) % Z[2];
501  x2 = (x2+dx2+Z[1]) % Z[1];
502  x1 = (x1+dx1+Z[0]) % Z[0];
503 
504  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
505  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
506  }else{
507  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
508  }
509 
510 
511  return ret;
512 }
513 
514 
515 /*
516  * This is a computation of neighbor using the full index and the displacement in each direction
517  *
518  */
519 
520 int
521 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
522 {
523  int oddBit = 0;
524  int half_idx = i;
525  if (i >= Vh){
526  oddBit =1;
527  half_idx = i - Vh;
528  }
529 
530  int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
531  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
532  if (oddBitChanged){
533  oddBit = 1 - oddBit;
534  }
535  int ret = nbr_half_idx;
536  if (oddBit){
537  ret = Vh + nbr_half_idx;
538  }
539 
540  return ret;
541 }
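// Illustrative sketch: hopping one site in +x from the even-parity origin lands on the
// odd sublattice, so the result is offset into the odd half of the parity-blocked array.
#if 0
  int nbr = neighborIndexFullLattice(0, 0, 0, 0, +1);
  // nbr == Vh, i.e. site (1,0,0,0) is the first site of the odd half lattice
#endif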
542 
543 int
544 neighborIndexFullLattice(int dim[4], int index, int dx[4])
545 {
546  const int volume = dim[0]*dim[1]*dim[2]*dim[3];
547  const int halfVolume = volume/2;
548  int oddBit = 0;
549  int halfIndex = index;
550 
551  if(index >= halfVolume){
552  oddBit = 1;
553  halfIndex = index - halfVolume;
554  }
555 
556  int neighborHalfIndex = neighborIndex(dim, halfIndex, oddBit, dx);
557 
558  int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
559  if(oddBitChanged){
560  oddBit = 1 - oddBit;
561  }
562 
563  return neighborHalfIndex + oddBit*halfVolume;
564 }
565 
566 
567 
568 int
569 neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
570 {
571  int ret;
572  int oddBit = 0;
573  int half_idx = i;
574  if (i >= Vh){
575  oddBit =1;
576  half_idx = i - Vh;
577  }
578 
579  int Y = fullLatticeIndex(half_idx, oddBit);
580  int x4 = Y/(Z[2]*Z[1]*Z[0]);
581  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
582  int x2 = (Y/Z[0]) % Z[1];
583  int x1 = Y % Z[0];
584  int ghost_x4 = x4+ dx4;
585 
586  x4 = (x4+dx4+Z[3]) % Z[3];
587  x3 = (x3+dx3+Z[2]) % Z[2];
588  x2 = (x2+dx2+Z[1]) % Z[1];
589  x1 = (x1+dx1+Z[0]) % Z[0];
590 
591  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
592  ret = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
593  }else{
594  ret = (x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
595  return ret;
596  }
597 
598  int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
599  if (oddBitChanged){
600  oddBit = 1 - oddBit;
601  }
602 
603  if (oddBit){
604  ret += Vh;
605  }
606 
607  return ret;
608 }
609 
610 
611 // 4d checkerboard.
612 // given a "half index" i into either an even or odd half lattice (corresponding
613 // to oddBit = {0, 1}), returns the corresponding full lattice index.
614 // Cf. GPGPU code in dslash_core_ante.h.
615 // There, i is the thread index.
616 int fullLatticeIndex_4d(int i, int oddBit) {
617  if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d\n"); exit(-1);}
618  /*
619  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
620  return 2*i + (boundaryCrossings + oddBit) % 2;
621  */
622 
623  int X1 = Z[0];
624  int X2 = Z[1];
625  int X3 = Z[2];
626  //int X4 = Z[3];
627  int X1h =X1/2;
628 
629  int sid =i;
630  int za = sid/X1h;
631  //int x1h = sid - za*X1h;
632  int zb = za/X2;
633  int x2 = za - zb*X2;
634  int x4 = zb/X3;
635  int x3 = zb - x4*X3;
636  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
637  //int x1 = 2*x1h + x1odd;
638  int X = 2*sid + x1odd;
639 
640  return X;
641 }
642 
643 // 5d checkerboard.
644 // given a "half index" i into either an even or odd half lattice (corresponding
645 // to oddBit = {0, 1}), returns the corresponding full lattice index.
646 // Cf. GPGPU code in dslash_core_ante.h.
647 // There, i is the thread index sid.
648 // This function is used by neighborIndex_5d in dslash_reference.cpp.
649 //ok
650 int fullLatticeIndex_5d(int i, int oddBit) {
651  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2);
652  return 2*i + (boundaryCrossings + oddBit) % 2;
653 }
654 
655 int fullLatticeIndex_5d_4dpc(int i, int oddBit) {
656  int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2);
657  return 2*i + (boundaryCrossings + oddBit) % 2;
658 }
659 
660 int
661 x4_from_full_index(int i)
662 {
663  int oddBit = 0;
664  int half_idx = i;
665  if (i >= Vh){
666  oddBit =1;
667  half_idx = i - Vh;
668  }
669 
670  int Y = fullLatticeIndex(half_idx, oddBit);
671  int x4 = Y/(Z[2]*Z[1]*Z[0]);
672 
673  return x4;
674 }
675 
676 template <typename Float>
677 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) {
678  // Apply spatial scaling factor (u0) to spatial links
679  for (int d = 0; d < 3; d++) {
680  for (int i = 0; i < gaugeSiteSize*Vh*2; i++) {
681  gauge[d][i] /= param->anisotropy;
682  }
683  }
684 
685  // only apply T-boundary at edge nodes
686 #ifdef MULTI_GPU
687  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
688 #else
689  bool last_node_in_t = true;
690 #endif
691 
692  // Apply boundary conditions to temporal links
693  if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t) {
694  for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) {
695  for (int i = 0; i < gaugeSiteSize; i++) {
696  gauge[3][j*gaugeSiteSize+i] *= -1.0;
697  gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
698  }
699  }
700  }
701 
702  if (param->gauge_fix) {
703  // set all gauge links (except for the last Z[0]*Z[1]*Z[2]/2) to the identity,
704  // to simulate fixing to the temporal gauge.
705  int iMax = ( last_node_in_t ? (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1) : Vh );
706  int dir = 3; // time direction only
707  Float *even = gauge[dir];
708  Float *odd = gauge[dir]+Vh*gaugeSiteSize;
709  for (int i = 0; i< iMax; i++) {
710  for (int m = 0; m < 3; m++) {
711  for (int n = 0; n < 3; n++) {
712  even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
713  even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
714  odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
715  odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
716  }
717  }
718  }
719  }
720 }
721 
722 template <typename Float>
723 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
724 {
725 
726  int X1h=param->X[0]/2;
727  int X1 =param->X[0];
728  int X2 =param->X[1];
729  int X3 =param->X[2];
730  int X4 =param->X[3];
731 
732  // rescale long links by the appropriate coefficient
733  for(int d=0; d<4; d++){
734  for(int i=0; i < V*gaugeSiteSize; i++){
735  gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff);
736  }
737  }
738 
739  // apply the staggered phases
740  for (int d = 0; d < 3; d++) {
741 
742  //even
743  for (int i = 0; i < Vh; i++) {
744 
745  int index = fullLatticeIndex(i, 0);
746  int i4 = index /(X3*X2*X1);
747  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
748  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
749  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
750  int sign=1;
751 
752  if (d == 0) {
753  if (i4 % 2 == 1){
754  sign= -1;
755  }
756  }
757 
758  if (d == 1){
759  if ((i4+i1) % 2 == 1){
760  sign= -1;
761  }
762  }
763  if (d == 2){
764  if ( (i4+i1+i2) % 2 == 1){
765  sign= -1;
766  }
767  }
768 
769  for (int j=0;j < 6; j++){
770  gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
771  }
772  }
773  //odd
774  for (int i = 0; i < Vh; i++) {
775  int index = fullLatticeIndex(i, 1);
776  int i4 = index /(X3*X2*X1);
777  int i3 = (index - i4*(X3*X2*X1))/(X2*X1);
778  int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
779  int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
780  int sign=1;
781 
782  if (d == 0) {
783  if (i4 % 2 == 1){
784  sign= -1;
785  }
786  }
787 
788  if (d == 1){
789  if ((i4+i1) % 2 == 1){
790  sign= -1;
791  }
792  }
793  if (d == 2){
794  if ( (i4+i1+i2) % 2 == 1){
795  sign = -1;
796  }
797  }
798 
799  for (int j=0;j < 6; j++){
800  gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
801  }
802  }
803 
804  }
805 
806  // Apply boundary conditions to temporal links
807  if (param->t_boundary == QUDA_ANTI_PERIODIC_T) {
808  for (int j = 0; j < Vh; j++) {
809  int sign =1;
810  if (j >= (X4-3)*X1h*X2*X3 ){
811  sign= -1;
812  }
813 
814  for (int i = 0; i < 6; i++) {
815  gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
816  gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
817  }
818  }
819  }
820 }
821 
822 
823 
824 template <typename Float>
825 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) {
826  Float *resOdd[4], *resEven[4];
827  for (int dir = 0; dir < 4; dir++) {
828  resEven[dir] = res[dir];
829  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
830  }
831 
832  for (int dir = 0; dir < 4; dir++) {
833  for (int i = 0; i < Vh; i++) {
834  for (int m = 0; m < 3; m++) {
835  for (int n = 0; n < 3; n++) {
836  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
837  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
838  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
839  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
840  }
841  }
842  }
843  }
844 
845  applyGaugeFieldScaling(res, Vh, param);
846 }
847 
848 // normalize the vector a
849 template <typename Float>
850 static void normalize(complex<Float> *a, int len) {
851  double sum = 0.0;
852  for (int i=0; i<len; i++) sum += norm(a[i]);
853  for (int i=0; i<len; i++) a[i] /= sqrt(sum);
854 }
855 
856 // orthogonalize vector b to vector a
857 template <typename Float>
858 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) {
859  complex<double> dot = 0.0;
860  for (int i=0; i<len; i++) dot += conj(a[i])*b[i];
861  for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i];
862 }
863 
864 template <typename Float>
865 static void constructGaugeField(Float **res, QudaGaugeParam *param) {
866  Float *resOdd[4], *resEven[4];
867  for (int dir = 0; dir < 4; dir++) {
868  resEven[dir] = res[dir];
869  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
870  }
871 
872  for (int dir = 0; dir < 4; dir++) {
873  for (int i = 0; i < Vh; i++) {
874  for (int m = 1; m < 3; m++) { // last 2 rows
875  for (int n = 0; n < 3; n++) { // 3 columns
876  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
877  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
878  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
879  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
880  }
881  }
882  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
883  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
884  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
885 
886  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
887  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
888  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
889 
890  {
891  Float *w = resEven[dir]+(i*3+0)*3*2;
892  Float *u = resEven[dir]+(i*3+1)*3*2;
893  Float *v = resEven[dir]+(i*3+2)*3*2;
894 
895  for (int n = 0; n < 6; n++) w[n] = 0.0;
896  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
897  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
898  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
899  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
900  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
901  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
902  }
903 
904  {
905  Float *w = resOdd[dir]+(i*3+0)*3*2;
906  Float *u = resOdd[dir]+(i*3+1)*3*2;
907  Float *v = resOdd[dir]+(i*3+2)*3*2;
908 
909  for (int n = 0; n < 6; n++) w[n] = 0.0;
910  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
911  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
912  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
913  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
914  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
915  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
916  }
917 
918  }
919  }
920 
921  if (param->type == QUDA_WILSON_LINKS){
922  applyGaugeFieldScaling(res, Vh, param);
923  } else if (param->type == QUDA_ASQTAD_LONG_LINKS){
924  applyGaugeFieldScaling_long(res, Vh, param);
925  } else if (param->type == QUDA_ASQTAD_FAT_LINKS){
926  for (int dir = 0; dir < 4; dir++){
927  for (int i = 0; i < Vh; i++) {
928  for (int m = 0; m < 3; m++) { // all 3 rows
929  for (int n = 0; n < 3; n++) { // 3 columns
930  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (Float)RAND_MAX;
931  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (Float)RAND_MAX;
932  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (Float)RAND_MAX;
933  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (Float)RAND_MAX;
934  }
935  }
936  }
937  }
938 
939  }
940 
941 }
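// Construction summary for the SU(3) loop above (illustrative): the last two rows of
// each link are filled with uniform random numbers, the second row is normalized, the
// third row is Gram-Schmidt orthogonalized against it and normalized, and the first row
// is set to the conjugate cross product of the other two, giving a unitary matrix with
// unit determinant.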
942 
943 template <typename Float>
944 void constructUnitaryGaugeField(Float **res)
945 {
946  Float *resOdd[4], *resEven[4];
947  for (int dir = 0; dir < 4; dir++) {
948  resEven[dir] = res[dir];
949  resOdd[dir] = res[dir]+Vh*gaugeSiteSize;
950  }
951 
952  for (int dir = 0; dir < 4; dir++) {
953  for (int i = 0; i < Vh; i++) {
954  for (int m = 1; m < 3; m++) { // last 2 rows
955  for (int n = 0; n < 3; n++) { // 3 columns
956  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
957  resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
958  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX;
959  resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX;
960  }
961  }
962  normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3);
963  orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3);
964  normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3);
965 
966  normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3);
967  orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3);
968  normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3);
969 
970  {
971  Float *w = resEven[dir]+(i*3+0)*3*2;
972  Float *u = resEven[dir]+(i*3+1)*3*2;
973  Float *v = resEven[dir]+(i*3+2)*3*2;
974 
975  for (int n = 0; n < 6; n++) w[n] = 0.0;
976  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
977  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
978  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
979  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
980  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
981  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
982  }
983 
984  {
985  Float *w = resOdd[dir]+(i*3+0)*3*2;
986  Float *u = resOdd[dir]+(i*3+1)*3*2;
987  Float *v = resOdd[dir]+(i*3+2)*3*2;
988 
989  for (int n = 0; n < 6; n++) w[n] = 0.0;
990  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
991  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
992  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
993  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
994  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
995  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
996  }
997 
998  }
999  }
1000 }
1001 
1002 
1003 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) {
1004  if (type == 0) {
1005  if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param);
1006  else constructUnitGaugeField((float**)gauge, param);
1007  } else if (type == 1) {
1008  if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param);
1009  else constructGaugeField((float**)gauge, param);
1010  } else {
1011  if (precision == QUDA_DOUBLE_PRECISION) applyGaugeFieldScaling((double**)gauge, Vh, param);
1012  else applyGaugeFieldScaling((float**)gauge, Vh, param);
1013  }
1014 
1015 }
1016 
1017 void
1018 construct_fat_long_gauge_field(void **fatlink, void **longlink, int type,
1019  QudaPrecision precision, QudaGaugeParam* param,
1020  QudaDslashType dslash_type)
1021 {
1022  if (type == 0) {
1023  if (precision == QUDA_DOUBLE_PRECISION) {
1024  constructUnitGaugeField((double**)fatlink, param);
1025  constructUnitGaugeField((double**)longlink, param);
1026  }else {
1027  constructUnitGaugeField((float**)fatlink, param);
1028  constructUnitGaugeField((float**)longlink, param);
1029  }
1030  } else {
1031  if (precision == QUDA_DOUBLE_PRECISION) {
1032  param->type = QUDA_ASQTAD_FAT_LINKS;
1033  constructGaugeField((double**)fatlink, param);
1034  param->type = QUDA_ASQTAD_LONG_LINKS;
1035  constructGaugeField((double**)longlink, param);
1036  }else {
1037  param->type = QUDA_ASQTAD_FAT_LINKS;
1038  constructGaugeField((float**)fatlink, param);
1039  param->type = QUDA_ASQTAD_LONG_LINKS;
1040  constructGaugeField((float**)longlink, param);
1041  }
1042  }
1043 
1044  if(param->reconstruct == QUDA_RECONSTRUCT_9 ||
1045  param->reconstruct == QUDA_RECONSTRUCT_13){ // incorporate non-trivial phase into long links
1046  const double cos_pi_3 = 0.5; // Cos(pi/3)
1047  const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)
1048  for(int dir=0; dir<4; ++dir){
1049  for(int i=0; i<V; ++i){
1050  for(int j=0; j<gaugeSiteSize; j+=2){
1051  if(precision == QUDA_DOUBLE_PRECISION){
1052  const double real = ((double*)longlink[dir])[i*gaugeSiteSize + j];
1053  const double imag = ((double*)longlink[dir])[i*gaugeSiteSize + j + 1];
1054  ((double*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1055  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1056  }else{
1057  const float real = ((float*)longlink[dir])[i*gaugeSiteSize + j];
1058  const float imag = ((float*)longlink[dir])[i*gaugeSiteSize + j + 1];
1059  ((float*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1060  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1061  }
1062  }
1063  }
1064  }
1065  }
1066 
1067  if (dslash_type == QUDA_STAGGERED_DSLASH) { // set all links to zero to emulate the 1-link operator
1068  for(int dir=0; dir<4; ++dir){
1069  for(int i=0; i<V; ++i){
1070  for(int j=0; j<gaugeSiteSize; j+=2){
1071  if(precision == QUDA_DOUBLE_PRECISION){
1072  ((double*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1073  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1074  }else{
1075  ((float*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1076  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1077  }
1078  }
1079  }
1080  }
1081  }
1082 
1083 
1084 }
1085 
1086 
1087 template <typename Float>
1088 static void constructCloverField(Float *res, double norm, double diag) {
1089 
1090  Float c = 2.0 * norm / RAND_MAX;
1091 
1092  for(int i = 0; i < V; i++) {
1093  for (int j = 0; j < 72; j++) {
1094  res[i*72 + j] = c*rand() - norm;
1095  }
1096  for (int j = 0; j< 6; j++) {
1097  res[i*72 + j] += diag;
1098  res[i*72 + j+36] += diag;
1099  }
1100  }
1101 }
1102 
1103 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) {
1104 
1105  if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
1106  else constructCloverField((float *)clover, norm, diag);
1107 }
1108 
1109 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) {
1110  printf("Reference:\n");
1111  printSpinorElement(spinorRef, 0, prec); printf("...\n");
1112  printSpinorElement(spinorRef, len-1, prec); printf("\n");
1113 
1114  printf("\nCUDA:\n");
1115  printSpinorElement(spinorGPU, 0, prec); printf("...\n");
1116  printSpinorElement(spinorGPU, len-1, prec); printf("\n");
1117 
1118  compare_spinor(spinorRef, spinorGPU, len, prec);
1119  }*/
1120 
1121 template <typename Float>
1122 static void checkGauge(Float **oldG, Float **newG, double epsilon) {
1123 
1124  const int fail_check = 17;
1125  int fail[4][fail_check];
1126  int iter[4][18];
1127  for (int d=0; d<4; d++) for (int i=0; i<fail_check; i++) fail[d][i] = 0;
1128  for (int d=0; d<4; d++) for (int i=0; i<18; i++) iter[d][i] = 0;
1129 
1130  for (int d=0; d<4; d++) {
1131  for (int eo=0; eo<2; eo++) {
1132  for (int i=0; i<Vh; i++) {
1133  int ga_idx = (eo*Vh+i);
1134  for (int j=0; j<18; j++) {
1135  double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1136 
1137  for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1138  if (diff > epsilon) iter[d][j]++;
1139  }
1140  }
1141  }
1142  }
1143 
1144  printf("Component fails (X, Y, Z, T)\n");
1145  for (int i=0; i<18; i++) printf("%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1146 
1147  printf("\nDeviation Failures = (X, Y, Z, T)\n");
1148  for (int f=0; f<fail_check; f++) {
1149  printf("%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n", pow(10.0,-(f+1)),
1150  fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1151  fail[0][f]/(double)(V*18), fail[1][f]/(double)(V*18), fail[2][f]/(double)(V*18), fail[3][f]/(double)(V*18));
1152  }
1153 
1154 }
1155 
1156 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) {
1157  if (precision == QUDA_DOUBLE_PRECISION)
1158  checkGauge((double**)oldG, (double**)newG, epsilon);
1159  else
1160  checkGauge((float**)oldG, (float**)newG, epsilon);
1161 }
1162 
1163 
1164 
1165 void
1166 createSiteLinkCPU(void** link, QudaPrecision precision, int phase)
1167 {
1168 
1169  if (precision == QUDA_DOUBLE_PRECISION) {
1170  constructUnitaryGaugeField((double**)link);
1171  }else {
1172  constructUnitaryGaugeField((float**)link);
1173  }
1174 
1175  // only apply temporal boundary condition if I'm the last node in T
1176 #ifdef MULTI_GPU
1177  bool last_node_in_t = (commCoords(3) == commDim(3)-1) ? true : false;
1178 #else
1179  bool last_node_in_t = true;
1180 #endif
1181 
1182  if(phase){
1183 
1184  for(int i=0;i < V;i++){
1185  for(int dir =XUP; dir <= TUP; dir++){
1186  int idx = i;
1187  int oddBit =0;
1188  if (i >= Vh) {
1189  idx = i - Vh;
1190  oddBit = 1;
1191  }
1192 
1193  int X1 = Z[0];
1194  int X2 = Z[1];
1195  int X3 = Z[2];
1196  int X4 = Z[3];
1197 
1198  int full_idx = fullLatticeIndex(idx, oddBit);
1199  int i4 = full_idx /(X3*X2*X1);
1200  int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1);
1201  int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1202  int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
1203 
1204  double coeff= 1.0;
1205  switch(dir){
1206  case XUP:
1207  if ( (i4 & 1) != 0){
1208  coeff *= -1;
1209  }
1210  break;
1211 
1212  case YUP:
1213  if ( ((i4+i1) & 1) != 0){
1214  coeff *= -1;
1215  }
1216  break;
1217 
1218  case ZUP:
1219  if ( ((i4+i1+i2) & 1) != 0){
1220  coeff *= -1;
1221  }
1222  break;
1223 
1224  case TUP:
1225  if (last_node_in_t && i4 == (X4-1)){
1226  coeff *= -1;
1227  }
1228  break;
1229 
1230  default:
1231  printf("ERROR: wrong dir(%d)\n", dir);
1232  exit(1);
1233  }
1234 
1235  if (precision == QUDA_DOUBLE_PRECISION){
1236  //double* mylink = (double*)link;
1237  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1238  double* mylink = (double*)link[dir];
1239  mylink = mylink + i*gaugeSiteSize;
1240 
1241  mylink[12] *= coeff;
1242  mylink[13] *= coeff;
1243  mylink[14] *= coeff;
1244  mylink[15] *= coeff;
1245  mylink[16] *= coeff;
1246  mylink[17] *= coeff;
1247 
1248  }else{
1249  //float* mylink = (float*)link;
1250  //mylink = mylink + (4*i + dir)*gaugeSiteSize;
1251  float* mylink = (float*)link[dir];
1252  mylink = mylink + i*gaugeSiteSize;
1253 
1254  mylink[12] *= coeff;
1255  mylink[13] *= coeff;
1256  mylink[14] *= coeff;
1257  mylink[15] *= coeff;
1258  mylink[16] *= coeff;
1259  mylink[17] *= coeff;
1260 
1261  }
1262  }
1263  }
1264  }
1265 
1266 
1267 #if 1
1268  for(int dir= 0;dir < 4;dir++){
1269  for(int i=0;i< V*gaugeSiteSize;i++){
1270  if (precision ==QUDA_SINGLE_PRECISION){
1271  float* f = (float*)link[dir];
1272  if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1273  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1274  exit(1);
1275  }
1276  }else{
1277  double* f = (double*)link[dir];
1278  if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1279  fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1280  exit(1);
1281  }
1282 
1283  }
1284 
1285  }
1286  }
1287 #endif
1288 
1289  return;
1290 }
1291 
1292 
1293 
1294 
1295 template <typename Float>
1296 int compareLink(Float **linkA, Float **linkB, int len) {
1297  const int fail_check = 16;
1298  int fail[fail_check];
1299  for (int f=0; f<fail_check; f++) fail[f] = 0;
1300 
1301  int iter[18];
1302  for (int i=0; i<18; i++) iter[i] = 0;
1303 
1304  for(int dir=0;dir < 4; dir++){
1305  for (int i=0; i<len; i++) {
1306  for (int j=0; j<18; j++) {
1307  int is = i*18+j;
1308  double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1309  for (int f=0; f<fail_check; f++)
1310  if (diff > pow(10.0,-(f+1))) fail[f]++;
1311  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1312  if (diff > 1e-3) iter[j]++;
1313  }
1314  }
1315  }
1316 
1317  for (int i=0; i<18; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1318 
1319  int accuracy_level = 0;
1320  for(int f =0; f < fail_check; f++){
1321  if(fail[f] == 0){
1322  accuracy_level =f;
1323  }
1324  }
1325 
1326  for (int f=0; f<fail_check; f++) {
1327  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (double)(4*len*18));
1328  }
1329 
1330  return accuracy_level;
1331 }
1332 
1333 static int
1334 compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
1335 {
1336  int ret;
1337 
1338  if (precision == QUDA_DOUBLE_PRECISION){
1339  ret = compareLink((double**)linkA, (double**)linkB, len);
1340  }else {
1341  ret = compareLink((float**)linkA, (float**)linkB, len);
1342  }
1343 
1344  return ret;
1345 }
1346 
1347 
1348 // X indexes the lattice site
1349 static void
1350 printLinkElement(void *link, int X, QudaPrecision precision)
1351 {
1352  if (precision == QUDA_DOUBLE_PRECISION){
1353  for(int i=0; i < 3;i++){
1354  printVector((double*)link+ X*gaugeSiteSize + i*6);
1355  }
1356 
1357  }
1358  else{
1359  for(int i=0;i < 3;i++){
1360  printVector((float*)link+X*gaugeSiteSize + i*6);
1361  }
1362  }
1363 }
1364 
1365 int strong_check_link(void** linkA, const char* msgA,
1366  void **linkB, const char* msgB,
1367  int len, QudaPrecision prec)
1368 {
1369  printfQuda("%s\n", msgA);
1370  printLinkElement(linkA[0], 0, prec);
1371  printfQuda("\n");
1372  printLinkElement(linkA[0], 1, prec);
1373  printfQuda("...\n");
1374  printLinkElement(linkA[3], len-1, prec);
1375  printfQuda("\n");
1376 
1377  printfQuda("\n%s\n", msgB);
1378  printLinkElement(linkB[0], 0, prec);
1379  printfQuda("\n");
1380  printLinkElement(linkB[0], 1, prec);
1381  printfQuda("...\n");
1382  printLinkElement(linkB[3], len-1, prec);
1383  printfQuda("\n");
1384 
1385  int ret = compare_link(linkA, linkB, len, prec);
1386  return ret;
1387 }
1388 
1389 
1390 void
1391 createMomCPU(void* mom, QudaPrecision precision)
1392 {
1393  void* temp;
1394 
1395  size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
1396  temp = malloc(4*V*gaugeSiteSize*gSize);
1397  if (temp == NULL){
1398  fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__);
1399  exit(1);
1400  }
1401 
1402 
1403 
1404  for(int i=0;i < V;i++){
1405  if (precision == QUDA_DOUBLE_PRECISION){
1406  for(int dir=0;dir < 4;dir++){
1407  double* thismom = (double*)mom;
1408  for(int k=0; k < momSiteSize; k++){
1409  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1410  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1411  }
1412  }
1413  }else{
1414  for(int dir=0;dir < 4;dir++){
1415  float* thismom=(float*)mom;
1416  for(int k=0; k < momSiteSize; k++){
1417  thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1418  if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1419  }
1420  }
1421  }
1422  }
1423 
1424  free(temp);
1425  return;
1426 }
1427 
1428 void
1429 createHwCPU(void* hw, QudaPrecision precision)
1430 {
1431  for(int i=0;i < V;i++){
1432  if (precision == QUDA_DOUBLE_PRECISION){
1433  for(int dir=0;dir < 4;dir++){
1434  double* thishw = (double*)hw;
1435  for(int k=0; k < hwSiteSize; k++){
1436  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1437  }
1438  }
1439  }else{
1440  for(int dir=0;dir < 4;dir++){
1441  float* thishw=(float*)hw;
1442  for(int k=0; k < hwSiteSize; k++){
1443  thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1444  }
1445  }
1446  }
1447  }
1448 
1449  return;
1450 }
1451 
1452 
1453 template <typename Float>
1454 int compare_mom(Float *momA, Float *momB, int len) {
1455  const int fail_check = 16;
1456  int fail[fail_check];
1457  for (int f=0; f<fail_check; f++) fail[f] = 0;
1458 
1459  int iter[momSiteSize];
1460  for (int i=0; i<momSiteSize; i++) iter[i] = 0;
1461 
1462  for (int i=0; i<len; i++) {
1463  for (int j=0; j<momSiteSize-1; j++) {
1464  int is = i*momSiteSize+j;
1465  double diff = fabs(momA[is]-momB[is]);
1466  for (int f=0; f<fail_check; f++)
1467  if (diff > pow(10.0,-(f+1))) fail[f]++;
1468  //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
1469  if (diff > 1e-3) iter[j]++;
1470  }
1471  }
1472 
1473  int accuracy_level = 0;
1474  for(int f =0; f < fail_check; f++){
1475  if(fail[f] == 0){
1476  accuracy_level =f+1;
1477  }
1478  }
1479 
1480  for (int i=0; i<momSiteSize; i++) printfQuda("%d fails = %d\n", i, iter[i]);
1481 
1482  for (int f=0; f<fail_check; f++) {
1483  printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*9, fail[f]/(double)(len*9));
1484  }
1485 
1486  return accuracy_level;
1487 }
1488 
1489 static void
1490 printMomElement(void *mom, int X, QudaPrecision precision)
1491 {
1492  if (precision == QUDA_DOUBLE_PRECISION){
1493  double* thismom = ((double*)mom)+ X*momSiteSize;
1494  printVector(thismom);
1495  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1496  }else{
1497  float* thismom = ((float*)mom)+ X*momSiteSize;
1498  printVector(thismom);
1499  printfQuda("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1500  }
1501 }
1502 int strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec)
1503 {
1504  printfQuda("mom:\n");
1505  printMomElement(momA, 0, prec);
1506  printfQuda("\n");
1507  printMomElement(momA, 1, prec);
1508  printfQuda("\n");
1509  printMomElement(momA, 2, prec);
1510  printfQuda("\n");
1511  printMomElement(momA, 3, prec);
1512  printfQuda("...\n");
1513 
1514  printfQuda("\nreference mom:\n");
1515  printMomElement(momB, 0, prec);
1516  printfQuda("\n");
1517  printMomElement(momB, 1, prec);
1518  printfQuda("\n");
1519  printMomElement(momB, 2, prec);
1520  printfQuda("\n");
1521  printMomElement(momB, 3, prec);
1522  printfQuda("\n");
1523 
1524  int ret;
1525  if (prec == QUDA_DOUBLE_PRECISION){
1526  ret = compare_mom((double*)momA, (double*)momB, len);
1527  }else{
1528  ret = compare_mom((float*)momA, (float*)momB, len);
1529  }
1530 
1531  return ret;
1532 }
1533 
1534 
1535 /************
1536  * return value
1537  *
1538  * 0: command line option matched and processed successfully
1539  * non-zero: command line option does not match
1540  *
1541  */
1542 
1543 #ifdef MULTI_GPU
1544 int device = -1;
1545 #else
1546 int device = 0;
1547 #endif
1548 
1553 int xdim = 24;
1554 int ydim = 24;
1555 int zdim = 24;
1556 int tdim = 24;
1557 int Lsdim = 16;
1559 int gridsize_from_cmdline[4] = {1,1,1,1};
1561 char latfile[256] = "";
1562 bool tune = true;
1563 int niter = 10;
1564 int test_type = 0;
1567 int multishift = 0;
1568 bool verify_results = true;
1569 double mass = 0.1;
1571 bool kernel_pack_t = false;
1574 
1575 static int dim_partitioned[4] = {0,0,0,0};
1576 
1577 int dimPartitioned(int dim)
1578 {
1579  return ((gridsize_from_cmdline[dim] > 1) || dim_partitioned[dim]);
1580 }
1581 
1582 void __attribute__((weak)) usage_extra(char** argv){};
1583 
1584 void usage(char** argv )
1585 {
1586  printf("Usage: %s [options]\n", argv[0]);
1587  printf("Common options: \n");
1588 #ifndef MULTI_GPU
1589  printf(" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1590 #endif
1591  printf(" --prec <double/single/half> # Precision in GPU\n");
1592  printf(" --prec_sloppy <double/single/half> # Sloppy precision in GPU\n");
1593  printf(" --recon <8/9/12/13/18> # Link reconstruction type\n");
1594  printf(" --recon_sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1595  printf(" --dagger # Set the dagger to 1 (default 0)\n");
1596  printf(" --sdim <n> # Set space dimention(X/Y/Z) size\n");
1597  printf(" --xdim <n> # Set X dimension size(default 24)\n");
1598  printf(" --ydim <n> # Set X dimension size(default 24)\n");
1599  printf(" --zdim <n> # Set X dimension size(default 24)\n");
1600  printf(" --tdim <n> # Set T dimension size(default 24)\n");
1601  printf(" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1602  printf(" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1603  printf(" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1604  printf(" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1605  printf(" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1606  printf(" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1607  printf(" --kernel-pack-t # Set T dimension kernel packing to be true (default false)\n");
1608  printf(" --dslash_type <type> # Set the dslash type, the following values are valid\n"
1609  " wilson/clover/twisted_mass/twisted_clover/staggered\n"
1610  " /asqtad/domain_wall/domain_wall_4d/mobius\n");
1611  printf(" --flavor <type> # Set the twisted mass flavor type (minus (default), plus, deg_doublet, nondeg_doublet)\n");
1612  printf(" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1613  printf(" --niter <n> # The number of iterations to perform (default 10)\n");
1614  printf(" --inv_type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1615  printf(" --precon_type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified))\n");
1616  printf(" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1617  printf(" --mass # Mass of Dirac operator (default 0.1)\n");
1618  printf(" --mass-normalization # Mass normalization (kappa (default) / mass)\n");
1619  printf(" --matpc # Matrix preconditioning type (even-even, odd_odd, even_even_asym, odd_odd_asym) \n");
1620  printf(" --tune <true/false> # Whether to autotune or not (default true)\n");
1621  printf(" --test # Test method (different for each test)\n");
1622  printf(" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1623  printf(" --help # Print out this message\n");
1624  usage_extra(argv);
1625 #ifdef MULTI_GPU
1626  char msg[]="multi";
1627 #else
1628  char msg[]="single";
1629 #endif
1630  printf("Note: this program is %s GPU build\n", msg);
1631  exit(1);
1632  return ;
1633 }
1634 
1635 int process_command_line_option(int argc, char** argv, int* idx)
1636 {
1637 #ifdef MULTI_GPU
1638  char msg[]="multi";
1639 #else
1640  char msg[]="single";
1641 #endif
1642 
1643  int ret = -1;
1644 
1645  int i = *idx;
1646 
1647  if( strcmp(argv[i], "--help")== 0){
1648  usage(argv);
1649  }
1650 
1651  if( strcmp(argv[i], "--verify") == 0){
1652  if (i+1 >= argc){
1653  usage(argv);
1654  }
1655 
1656  if (strcmp(argv[i+1], "true") == 0){
1657  verify_results = true;
1658  }else if (strcmp(argv[i+1], "false") == 0){
1659  verify_results = false;
1660  }else{
1661  fprintf(stderr, "ERROR: invalid verify type\n");
1662  exit(1);
1663  }
1664 
1665  i++;
1666  ret = 0;
1667  goto out;
1668  }
1669 
1670  if( strcmp(argv[i], "--device") == 0){
1671  if (i+1 >= argc){
1672  usage(argv);
1673  }
1674  device = atoi(argv[i+1]);
1675  if (device < 0 || device > 16){
1676  printf("ERROR: Invalid CUDA device number (%d)\n", device);
1677  usage(argv);
1678  }
1679  i++;
1680  ret = 0;
1681  goto out;
1682  }
1683 
1684  if( strcmp(argv[i], "--prec") == 0){
1685  if (i+1 >= argc){
1686  usage(argv);
1687  }
1688  prec = get_prec(argv[i+1]);
1689  i++;
1690  ret = 0;
1691  goto out;
1692  }
1693 
1694  if( strcmp(argv[i], "--prec_sloppy") == 0){
1695  if (i+1 >= argc){
1696  usage(argv);
1697  }
1698  prec_sloppy = get_prec(argv[i+1]);
1699  i++;
1700  ret = 0;
1701  goto out;
1702  }
1703 
1704  if( strcmp(argv[i], "--recon") == 0){
1705  if (i+1 >= argc){
1706  usage(argv);
1707  }
1708  link_recon = get_recon(argv[i+1]);
1709  i++;
1710  ret = 0;
1711  goto out;
1712  }
1713 
1714  if( strcmp(argv[i], "--recon_sloppy") == 0){
1715  if (i+1 >= argc){
1716  usage(argv);
1717  }
1718  link_recon_sloppy = get_recon(argv[i+1]);
1719  i++;
1720  ret = 0;
1721  goto out;
1722  }
1723 
1724  if( strcmp(argv[i], "--xdim") == 0){
1725  if (i+1 >= argc){
1726  usage(argv);
1727  }
1728  xdim= atoi(argv[i+1]);
1729  if (xdim < 0 || xdim > 512){
1730  printf("ERROR: invalid X dimension (%d)\n", xdim);
1731  usage(argv);
1732  }
1733  i++;
1734  ret = 0;
1735  goto out;
1736  }
1737 
1738  if( strcmp(argv[i], "--ydim") == 0){
1739  if (i+1 >= argc){
1740  usage(argv);
1741  }
1742  ydim= atoi(argv[i+1]);
1743  if (ydim < 0 || ydim > 512){
1744  printf("ERROR: invalid T dimension (%d)\n", ydim);
1745  usage(argv);
1746  }
1747  i++;
1748  ret = 0;
1749  goto out;
1750  }
1751 
1752 
1753  if( strcmp(argv[i], "--zdim") == 0){
1754  if (i+1 >= argc){
1755  usage(argv);
1756  }
1757  zdim= atoi(argv[i+1]);
1758  if (zdim < 0 || zdim > 512){
1759  printf("ERROR: invalid T dimension (%d)\n", zdim);
1760  usage(argv);
1761  }
1762  i++;
1763  ret = 0;
1764  goto out;
1765  }
1766 
1767  if( strcmp(argv[i], "--tdim") == 0){
1768  if (i+1 >= argc){
1769  usage(argv);
1770  }
1771  tdim = atoi(argv[i+1]);
1772  if (tdim < 0 || tdim > 512){
1773  errorQuda("Error: invalid t dimension");
1774  }
1775  i++;
1776  ret = 0;
1777  goto out;
1778  }
1779 
1780  if( strcmp(argv[i], "--sdim") == 0){
1781  if (i+1 >= argc){
1782  usage(argv);
1783  }
1784  int sdim = atoi(argv[i+1]);
1785  if (sdim < 0 || sdim > 512){
1786  printfQuda("ERROR: invalid S dimension\n");
1787  }
1788  xdim=ydim=zdim=sdim;
1789  i++;
1790  ret = 0;
1791  goto out;
1792  }
1793 
1794  if( strcmp(argv[i], "--Lsdim") == 0){
1795  if (i+1 >= argc){
1796  usage(argv);
1797  }
1798  int Ls = atoi(argv[i+1]);
1799  if (Ls < 0 || Ls > 128){
1800  printfQuda("ERROR: invalid Ls dimension\n");
1801  }
1802  Lsdim=Ls;
1803  i++;
1804  ret = 0;
1805  goto out;
1806  }
1807 
1808  if( strcmp(argv[i], "--dagger") == 0){
1809  dagger = QUDA_DAG_YES;
1810  ret = 0;
1811  goto out;
1812  }
1813 
1814  if( strcmp(argv[i], "--partition") == 0){
1815  if (i+1 >= argc){
1816  usage(argv);
1817  }
1818 #ifdef MULTI_GPU
1819  int value = atoi(argv[i+1]);
1820  for(int j=0; j < 4;j++){
1821  if (value & (1 << j)){
1822  commDimPartitionedSet(j);
1823  dim_partitioned[j] = 1;
1824  }
1825  }
1826 #else
1827  printfQuda("WARNING: Ignoring --partition option since this is a single-GPU build.\n");
1828 #endif
1829  i++;
1830  ret = 0;
1831  goto out;
1832  }
1833 
1834  if( strcmp(argv[i], "--kernel-pack-t") == 0){
1835  kernel_pack_t = true;
1836  ret= 0;
1837  goto out;
1838  }
1839 
1840 
1841  if( strcmp(argv[i], "--tune") == 0){
1842  if (i+1 >= argc){
1843  usage(argv);
1844  }
1845 
1846  if (strcmp(argv[i+1], "true") == 0){
1847  tune = true;
1848  }else if (strcmp(argv[i+1], "false") == 0){
1849  tune = false;
1850  }else{
1851  fprintf(stderr, "ERROR: invalid tuning type\n");
1852  exit(1);
1853  }
1854 
1855  i++;
1856  ret = 0;
1857  goto out;
1858  }
1859 
1860  if( strcmp(argv[i], "--multishift") == 0){
1861  if (i+1 >= argc){
1862  usage(argv);
1863  }
1864 
1865  if (strcmp(argv[i+1], "true") == 0){
1866  multishift = true;
1867  }else if (strcmp(argv[i+1], "false") == 0){
1868  multishift = false;
1869  }else{
1870  fprintf(stderr, "ERROR: invalid multishift boolean\n");
1871  exit(1);
1872  }
1873 
1874  i++;
1875  ret = 0;
1876  goto out;
1877  }
1878 
1879  if( strcmp(argv[i], "--xgridsize") == 0){
1880  if (i+1 >= argc){
1881  usage(argv);
1882  }
1883  int xsize = atoi(argv[i+1]);
1884  if (xsize <= 0 ){
1885  errorQuda("ERROR: invalid X grid size");
1886  }
1887  gridsize_from_cmdline[0] = xsize;
1888  i++;
1889  ret = 0;
1890  goto out;
1891  }
1892 
1893  if( strcmp(argv[i], "--ygridsize") == 0){
1894  if (i+1 >= argc){
1895  usage(argv);
1896  }
1897  int ysize = atoi(argv[i+1]);
1898  if (ysize <= 0 ){
1899  errorQuda("ERROR: invalid Y grid size");
1900  }
1901  gridsize_from_cmdline[1] = ysize;
1902  i++;
1903  ret = 0;
1904  goto out;
1905  }
1906 
1907  if( strcmp(argv[i], "--zgridsize") == 0){
1908  if (i+1 >= argc){
1909  usage(argv);
1910  }
1911  int zsize = atoi(argv[i+1]);
1912  if (zsize <= 0 ){
1913  errorQuda("ERROR: invalid Z grid size");
1914  }
1915  gridsize_from_cmdline[2] = zsize;
1916  i++;
1917  ret = 0;
1918  goto out;
1919  }
1920 
1921  if( strcmp(argv[i], "--tgridsize") == 0){
1922  if (i+1 >= argc){
1923  usage(argv);
1924  }
1925  int tsize = atoi(argv[i+1]);
1926  if (tsize <= 0 ){
1927  errorQuda("ERROR: invalid T grid size");
1928  }
1929  gridsize_from_cmdline[3] = tsize;
1930  i++;
1931  ret = 0;
1932  goto out;
1933  }
1934 
1935  if( strcmp(argv[i], "--dslash_type") == 0){
1936  if (i+1 >= argc){
1937  usage(argv);
1938  }
1939  dslash_type = get_dslash_type(argv[i+1]);
1940  i++;
1941  ret = 0;
1942  goto out;
1943  }
1944 
1945  if( strcmp(argv[i], "--flavor") == 0){
1946  if (i+1 >= argc){
1947  usage(argv);
1948  }
1949  twist_flavor = get_flavor_type(argv[i+1]);
1950  i++;
1951  ret = 0;
1952  goto out;
1953  }
1954 
1955  if( strcmp(argv[i], "--inv_type") == 0){
1956  if (i+1 >= argc){
1957  usage(argv);
1958  }
1959  inv_type = get_solver_type(argv[i+1]);
1960  i++;
1961  ret = 0;
1962  goto out;
1963  }
1964 
1965  if( strcmp(argv[i], "--precon_type") == 0){
1966  if (i+1 >= argc){
1967  usage(argv);
1968  }
1969  precon_type = get_solver_type(argv[i+1]);
1970  i++;
1971  ret = 0;
1972  goto out;
1973  }
1974 
1975  if( strcmp(argv[i], "--mass") == 0){
1976  if (i+1 >= argc){
1977  usage(argv);
1978  }
1979  mass= atof(argv[i+1]);
1980  i++;
1981  ret = 0;
1982  goto out;
1983  }
1984 
1985  if( strcmp(argv[i], "--mass-normalization") == 0){
1986  if (i+1 >= argc){
1987  usage(argv);
1988  }
1990  i++;
1991  ret = 0;
1992  goto out;
1993  }
1994 
1995  if( strcmp(argv[i], "--matpc") == 0){
1996  if (i+1 >= argc){
1997  usage(argv);
1998  }
1999  matpc_type = get_matpc_type(argv[i+1]);
2000  i++;
2001  ret = 0;
2002  goto out;
2003  }
2004 
2005  if( strcmp(argv[i], "--load-gauge") == 0){
2006  if (i+1 >= argc){
2007  usage(argv);
2008  }
2009  strcpy(latfile, argv[i+1]);
2010  i++;
2011  ret = 0;
2012  goto out;
2013  }
2014 
2015  if( strcmp(argv[i], "--test") == 0){
2016  if (i+1 >= argc){
2017  usage(argv);
2018  }
2019  test_type = atoi(argv[i+1]);
2020  i++;
2021  ret = 0;
2022  goto out;
2023  }
2024 
2025  if( strcmp(argv[i], "--niter") == 0){
2026  if (i+1 >= argc){
2027  usage(argv);
2028  }
2029  niter= atoi(argv[i+1]);
2030  if (niter < 1 || niter > 1e6){
2031  printf("ERROR: invalid number of iterations (%d)\n", niter);
2032  usage(argv);
2033  }
2034  i++;
2035  ret = 0;
2036  goto out;
2037  }
2038 
2039  if( strcmp(argv[i], "--version") == 0){
2040  printf("This program is linked with QUDA library, version %s,",
2041  get_quda_ver_str());
2042  printf(" %s GPU build\n", msg);
2043  exit(0);
2044  }
2045 
2046  out:
2047  *idx = i;
2048  return ret ;
2049 
2050 }
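
For context, the sketch below shows how a test program might drive this parser from its argument loop. It is illustrative only: the main() shell, the error message, and the placement of the test body are assumptions, while process_command_line_option(), usage(), initComms(), finalizeComms(), and gridsize_from_cmdline are the functions and globals defined in this file.

// Illustrative sketch (not part of test_util.cpp): a minimal driver loop.
// Recognized flags return 0 and advance i past their argument via *idx;
// anything unrecognized falls through to usage().
#include <stdio.h>

extern int process_command_line_option(int argc, char **argv, int *idx);
extern void usage(char **argv);
extern void initComms(int argc, char **argv, const int *commDims);
extern void finalizeComms();
extern int gridsize_from_cmdline[4];

int main(int argc, char **argv)
{
  for (int i = 1; i < argc; i++) {
    if (process_command_line_option(argc, argv, &i) == 0) continue;
    fprintf(stderr, "ERROR: unrecognized option %s\n", argv[i]);
    usage(argv);
  }

  initComms(argc, argv, gridsize_from_cmdline);
  // ... set up and run the actual test here ...
  finalizeComms();
  return 0;
}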
2051 
2052 
2053 static struct timeval startTime;
2054 
2055 void stopwatchStart() {
2056  gettimeofday(&startTime, NULL);
2057 }
2058 
2059 double stopwatchReadSeconds() {
2060  struct timeval endTime;
2061  gettimeofday(&endTime, NULL);
2062 
2063  long ds = endTime.tv_sec - startTime.tv_sec;
2064  long dus = endTime.tv_usec - startTime.tv_usec;
2065  return ds + 0.000001*dus;
2066 }
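
As a usage note, the two functions above share a single static startTime, so they time one interval at a time; nested or overlapping measurements are not supported. A minimal sketch follows (the timeKernel() wrapper and the printf are assumptions, not part of this file):

// Sketch only: time one region of work with the stopwatch above.
#include <stdio.h>

extern void stopwatchStart();
extern double stopwatchReadSeconds();

void timeKernel()
{
  stopwatchStart();                       // records startTime via gettimeofday()
  // ... work to be timed ...
  double secs = stopwatchReadSeconds();   // elapsed wall-clock seconds
  printf("elapsed: %g s\n", secs);
}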
2067 
2068 