9 #elif defined(MPI_COMMS)
48 void initComms(
int argc,
char **argv,
const int *commDims)
50 #if defined(QMP_COMMS)
51 QMP_thread_level_t tl;
52 QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
55 QMP_declare_logical_topology(commDims, 4);
57 #elif defined(MPI_COMMS)
58 MPI_Init(&argc, &argv);
67 #if defined(QMP_COMMS)
68 QMP_finalize_msg_passing();
69 #elif defined(MPI_COMMS)
79 #if defined(QMP_COMMS)
80 rank = QMP_get_node_number();
81 #elif defined(MPI_COMMS)
82 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
90 for (
int d=0; d< 4; d++) {
95 for (
int i=0; i<4; i++) {
102 Vs_x = X[1]*X[2]*X[3];
103 Vs_y = X[0]*X[2]*X[3];
104 Vs_z = X[0]*X[1]*X[3];
105 Vs_t = X[0]*X[1]*X[2];
113 E1=X[0]+4;
E2=X[1]+4;
E3=X[2]+4;
E4=X[3]+4;
128 for (
int d=0; d< 4; d++)
134 for (
int i=0; i<4; i++) {
156 template <
typename Float>
157 static void printVector(
Float *v) {
158 printfQuda(
"{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
164 for (
int s=0;
s<4;
s++) printVector((
double*)spinor+X*24+
s*6);
166 for (
int s=0;
s<4;
s++) printVector((
float*)spinor+X*24+
s*6);
173 for (
int m=0; m<3; m++) printVector((
double*)gauge +(X/2)*
gaugeSiteSize + m*3*2);
175 for (
int m=0; m<3; m++) printVector((
float*)gauge +(X/2)*
gaugeSiteSize + m*3*2);
179 for (
int m = 0; m < 3; m++) printVector((
double*)gauge + (X/2+
Vh)*
gaugeSiteSize + m*3*2);
181 for (
int m = 0; m < 3; m++) printVector((
float*)gauge + (X/2+
Vh)*
gaugeSiteSize + m*3*2);
187 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
188 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
189 int x2 = (Y/
Z[0]) %
Z[1];
191 return (x4+x3+x2+x1) % 2;
195 template <
typename Float>
202 template <
typename Float>
204 a[0] = b[0]*c[0] - b[1]*c[1];
205 a[1] = b[0]*c[1] + b[1]*c[0];
209 template <
typename Float>
211 a[0] = b[0]*c[0] - b[1]*c[1];
212 a[1] = -b[0]*c[1] - b[1]*c[0];
216 template <
typename Float>
218 a[0] = b[0]*c[0] + b[1]*c[1];
219 a[1] = b[0]*c[1] - b[1]*c[0];
223 template <
typename Float>
225 a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
226 a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
230 template <
typename Float>
232 a[0] += b[0]*c[0] + b[1]*c[1];
233 a[1] += b[0]*c[1] - b[1]*c[0];
236 template <
typename Float>
238 a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
239 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
242 template <
typename Float>
254 template <
typename Float>
256 mat[0] =
atan2(mat[1], mat[0]);
257 mat[1] =
atan2(mat[13], mat[12]);
258 for (
int i=8; i<18; i++) mat[i] = 0.0;
275 template <
typename Float>
277 Float *u = &mat[0*(3*2)];
278 Float *v = &mat[1*(3*2)];
279 Float *w = &mat[2*(3*2)];
280 w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
289 w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
292 template <
typename Float>
296 row_sum += mat[2]*mat[2];
297 row_sum += mat[3]*mat[3];
298 row_sum += mat[4]*mat[4];
299 row_sum += mat[5]*mat[5];
302 Float U00_mag =
sqrt(1.f/(u0*u0) - row_sum);
307 mat[0] = U00_mag *
cos(mat[14]);
308 mat[1] = U00_mag *
sin(mat[14]);
310 Float column_sum = 0.0;
311 for (
int i=0; i<2; i++) column_sum += mat[i]*mat[i];
312 for (
int i=6; i<8; i++) column_sum += mat[i]*mat[i];
313 Float U20_mag =
sqrt(1.f/(u0*u0) - column_sum);
315 mat[12] = U20_mag *
cos(mat[15]);
316 mat[13] = U20_mag *
sin(mat[15]);
321 Float r_inv2 = 1.0/(u0*row_sum);
354 else su3Reconstruct12((
float*)mat, dir, ga_idx, param);
357 else su3Reconstruct8((
float*)mat, dir, ga_idx, param);
383 template <
typename Float>
384 static int compareFloats(
Float *a,
Float *b,
int len,
double epsilon) {
385 for (
int i = 0; i < len; i++) {
386 double diff = fabs(a[i] - b[i]);
387 if (diff > epsilon) {
388 printfQuda(
"ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
397 else return compareFloats((
float*)a, (
float*)b, len, epsilon);
402 int za = index/(dim[0]>>1);
404 int x2 = za - zb*dim[1];
406 int x3 = zb - x4*dim[2];
408 return 2*index + ((x2 + x3 + x4 +
oddBit) & 1);
452 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
453 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
454 int x2 = (Y/
Z[0]) %
Z[1];
459 x4 = (x4+dx4+
Z[3]) %
Z[3];
460 x3 = (x3+dx3+
Z[2]) %
Z[2];
461 x2 = (x2+dx2+
Z[1]) %
Z[1];
462 x1 = (x1+dx1+
Z[0]) %
Z[0];
464 return (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) +
x1) / 2;
473 x[3] = fullIndex/(dim[2]*dim[1]*dim[0]);
474 x[2] = (fullIndex/(dim[1]*dim[0])) % dim[2];
475 x[1] = (fullIndex/dim[0]) % dim[1];
476 x[0] = fullIndex % dim[0];
478 for(
int dir=0; dir<4; ++dir)
479 x[dir] = (x[dir]+dx[dir]+dim[dir]) % dim[dir];
481 return (((x[3]*dim[2] + x[2])*dim[1] + x[1])*dim[0] + x[0])/2;
490 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
491 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
492 int x2 = (Y/
Z[0]) %
Z[1];
495 int ghost_x4 = x4+ dx4;
499 x4 = (x4+dx4+
Z[3]) %
Z[3];
500 x3 = (x3+dx3+
Z[2]) %
Z[2];
501 x2 = (x2+dx2+
Z[1]) %
Z[1];
502 x1 = (x1+dx1+
Z[0]) %
Z[0];
504 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
505 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(Z[0]) + x1) / 2;
507 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) +
x1) / 2;
530 int nbr_half_idx =
neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
531 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
535 int ret = nbr_half_idx;
537 ret =
Vh + nbr_half_idx;
546 const int volume = dim[0]*dim[1]*dim[2]*dim[3];
547 const int halfVolume = volume/2;
549 int halfIndex = index;
551 if(index >= halfVolume){
553 halfIndex = index - halfVolume;
556 int neighborHalfIndex =
neighborIndex(dim, halfIndex, oddBit, dx);
558 int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
563 return neighborHalfIndex + oddBit*halfVolume;
580 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
581 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
582 int x2 = (Y/
Z[0]) %
Z[1];
584 int ghost_x4 = x4+ dx4;
586 x4 = (x4+dx4+
Z[3]) %
Z[3];
587 x3 = (x3+dx3+
Z[2]) %
Z[2];
588 x2 = (x2+dx2+
Z[1]) %
Z[1];
589 x1 = (x1+dx1+
Z[0]) %
Z[0];
591 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
592 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(Z[0]) + x1) / 2;
594 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) +
x1) / 2;
598 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
617 if (i >=
Vh || i < 0) {printf(
"i out of range in fullLatticeIndex_4d"); exit(-1);}
651 int boundaryCrossings = i/(
Z[0]/2) + i/(
Z[1]*
Z[0]/2) + i/(
Z[2]*
Z[1]*
Z[0]/2) + i/(
Z[3]*
Z[2]*
Z[1]*
Z[0]/2);
652 return 2*i + (boundaryCrossings +
oddBit) % 2;
656 int boundaryCrossings = i/(
Z[0]/2) + i/(
Z[1]*
Z[0]/2) + i/(
Z[2]*
Z[1]*
Z[0]/2);
657 return 2*i + (boundaryCrossings +
oddBit) % 2;
671 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
676 template <
typename Float>
679 for (
int d = 0; d < 3; d++) {
689 bool last_node_in_t =
true;
694 for (
int j = (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1); j <
Vh; j++) {
696 gauge[3][j*gaugeSiteSize+i] *= -1.0;
697 gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
705 int iMax = ( last_node_in_t ? (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1) : Vh );
707 Float *even = gauge[dir];
709 for (
int i = 0; i< iMax; i++) {
710 for (
int m = 0; m < 3; m++) {
711 for (
int n = 0; n < 3; n++) {
712 even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
713 even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
714 odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
715 odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
722 template <
typename Float>
726 int X1h=param->
X[0]/2;
733 for(
int d=0; d<4; d++){
740 for (
int d = 0; d < 3; d++) {
743 for (
int i = 0; i <
Vh; i++) {
746 int i4 = index /(X3*X2*
X1);
747 int i3 = (index - i4*(X3*X2*
X1))/(X2*
X1);
748 int i2 = (index - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
749 int i1 = index - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
759 if ((i4+i1) % 2 == 1){
764 if ( (i4+i1+i2) % 2 == 1){
769 for (
int j=0;j < 6; j++){
770 gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
774 for (
int i = 0; i <
Vh; i++) {
776 int i4 = index /(X3*X2*
X1);
777 int i3 = (index - i4*(X3*X2*
X1))/(X2*
X1);
778 int i2 = (index - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
779 int i1 = index - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
789 if ((i4+i1) % 2 == 1){
794 if ( (i4+i1+i2) % 2 == 1){
799 for (
int j=0;j < 6; j++){
800 gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
808 for (
int j = 0; j <
Vh; j++) {
810 if (j >= (X4-3)*X1h*X2*
X3 ){
814 for (
int i = 0; i < 6; i++) {
815 gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
816 gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
824 template <
typename Float>
826 Float *resOdd[4], *resEven[4];
827 for (
int dir = 0; dir < 4; dir++) {
828 resEven[dir] = res[dir];
832 for (
int dir = 0; dir < 4; dir++) {
833 for (
int i = 0; i <
Vh; i++) {
834 for (
int m = 0; m < 3; m++) {
835 for (
int n = 0; n < 3; n++) {
836 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
837 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
838 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
839 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
845 applyGaugeFieldScaling(res, Vh, param);
849 template <
typename Float>
852 for (
int i=0; i<len; i++) sum +=
norm(a[i]);
853 for (
int i=0; i<len; i++) a[i] /=
sqrt(sum);
857 template <
typename Float>
860 for (
int i=0; i<len; i++) dot +=
conj(a[i])*b[i];
864 template <
typename Float>
866 Float *resOdd[4], *resEven[4];
867 for (
int dir = 0; dir < 4; dir++) {
868 resEven[dir] = res[dir];
872 for (
int dir = 0; dir < 4; dir++) {
873 for (
int i = 0; i <
Vh; i++) {
874 for (
int m = 1; m < 3; m++) {
875 for (
int n = 0; n < 3; n++) {
876 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
877 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
878 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
879 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
891 Float *w = resEven[dir]+(i*3+0)*3*2;
892 Float *u = resEven[dir]+(i*3+1)*3*2;
893 Float *v = resEven[dir]+(i*3+2)*3*2;
895 for (
int n = 0; n < 6; n++) w[n] = 0.0;
905 Float *w = resOdd[dir]+(i*3+0)*3*2;
906 Float *u = resOdd[dir]+(i*3+1)*3*2;
907 Float *v = resOdd[dir]+(i*3+2)*3*2;
909 for (
int n = 0; n < 6; n++) w[n] = 0.0;
922 applyGaugeFieldScaling(res, Vh, param);
926 for (
int dir = 0; dir < 4; dir++){
927 for (
int i = 0; i <
Vh; i++) {
928 for (
int m = 0; m < 3; m++) {
929 for (
int n = 0; n < 3; n++) {
930 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (
Float)RAND_MAX;
931 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (
Float)RAND_MAX;
932 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (
Float)RAND_MAX;
933 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (
Float)RAND_MAX;
943 template <
typename Float>
946 Float *resOdd[4], *resEven[4];
947 for (
int dir = 0; dir < 4; dir++) {
948 resEven[dir] = res[dir];
952 for (
int dir = 0; dir < 4; dir++) {
953 for (
int i = 0; i <
Vh; i++) {
954 for (
int m = 1; m < 3; m++) {
955 for (
int n = 0; n < 3; n++) {
956 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
957 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
958 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
959 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
971 Float *w = resEven[dir]+(i*3+0)*3*2;
972 Float *u = resEven[dir]+(i*3+1)*3*2;
973 Float *v = resEven[dir]+(i*3+2)*3*2;
975 for (
int n = 0; n < 6; n++) w[n] = 0.0;
985 Float *w = resOdd[dir]+(i*3+0)*3*2;
986 Float *u = resOdd[dir]+(i*3+1)*3*2;
987 Float *v = resOdd[dir]+(i*3+2)*3*2;
989 for (
int n = 0; n < 6; n++) w[n] = 0.0;
1006 else constructUnitGaugeField((
float**)gauge, param);
1007 }
else if (type == 1) {
1009 else constructGaugeField((
float**)gauge, param);
1012 else applyGaugeFieldScaling((
float**)gauge, Vh, param);
1024 constructUnitGaugeField((
double**)fatlink, param);
1025 constructUnitGaugeField((
double**)longlink, param);
1027 constructUnitGaugeField((
float**)fatlink, param);
1028 constructUnitGaugeField((
float**)longlink, param);
1033 constructGaugeField((
double**)fatlink, param);
1035 constructGaugeField((
double**)longlink, param);
1038 constructGaugeField((
float**)fatlink, param);
1040 constructGaugeField((
float**)longlink, param);
1046 const double cos_pi_3 = 0.5;
1047 const double sin_pi_3 =
sqrt(0.75);
1048 for(
int dir=0; dir<4; ++dir){
1049 for(
int i=0; i<
V; ++i){
1052 const double real = ((
double*)longlink[dir])[i*gaugeSiteSize + j];
1053 const double imag = ((
double*)longlink[dir])[i*gaugeSiteSize + j + 1];
1054 ((
double*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1055 ((
double*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1057 const float real = ((
float*)longlink[dir])[i*gaugeSiteSize + j];
1058 const float imag = ((
float*)longlink[dir])[i*gaugeSiteSize + j + 1];
1059 ((
float*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
1060 ((
float*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
1068 for(
int dir=0; dir<4; ++dir){
1069 for(
int i=0; i<
V; ++i){
1072 ((
double*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1073 ((
double*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1075 ((
float*)longlink[dir])[i*gaugeSiteSize + j] = 0.0;
1076 ((
float*)longlink[dir])[i*gaugeSiteSize + j + 1] = 0.0;
1087 template <
typename Float>
1088 static void constructCloverField(
Float *res,
double norm,
double diag) {
1090 Float c = 2.0 * norm / RAND_MAX;
1092 for(
int i = 0; i <
V; i++) {
1093 for (
int j = 0; j < 72; j++) {
1094 res[i*72 + j] = c*rand() -
norm;
1096 for (
int j = 0; j< 6; j++) {
1097 res[i*72 + j] += diag;
1098 res[i*72 + j+36] += diag;
1106 else constructCloverField((
float *)clover, norm, diag);
1121 template <
typename Float>
1124 const int fail_check = 17;
1125 int fail[4][fail_check];
1127 for (
int d=0; d<4; d++)
for (
int i=0; i<fail_check; i++) fail[d][i] = 0;
1128 for (
int d=0; d<4; d++)
for (
int i=0; i<18; i++) iter[d][i] = 0;
1130 for (
int d=0; d<4; d++) {
1131 for (
int eo=0; eo<2; eo++) {
1132 for (
int i=0; i<
Vh; i++) {
1133 int ga_idx = (eo*Vh+i);
1134 for (
int j=0; j<18; j++) {
1135 double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1137 for (
int f=0; f<fail_check; f++) if (diff >
pow(10.0,-(f+1))) fail[d][f]++;
1138 if (diff > epsilon) iter[d][j]++;
1144 printf(
"Component fails (X, Y, Z, T)\n");
1145 for (
int i=0; i<18; i++) printf(
"%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1147 printf(
"\nDeviation Failures = (X, Y, Z, T)\n");
1148 for (
int f=0; f<fail_check; f++) {
1149 printf(
"%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n",
pow(10.0,-(f+1)),
1150 fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1151 fail[0][f]/(
double)(V*18), fail[1][f]/(
double)(V*18), fail[2][f]/(
double)(V*18), fail[3][f]/(
double)(V*18));
1158 checkGauge((
double**)oldG, (
double**)newG, epsilon);
1160 checkGauge((
float**)oldG, (
float**)newG, epsilon);
1179 bool last_node_in_t =
true;
1184 for(
int i=0;i <
V;i++){
1185 for(
int dir =
XUP; dir <=
TUP; dir++){
1199 int i4 = full_idx /(X3*X2*
X1);
1200 int i3 = (full_idx - i4*(X3*X2*
X1))/(X2*
X1);
1201 int i2 = (full_idx - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
1202 int i1 = full_idx - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
1207 if ( (i4 & 1) != 0){
1213 if ( ((i4+i1) & 1) != 0){
1219 if ( ((i4+i1+i2) & 1) != 0){
1225 if (last_node_in_t && i4 == (X4-1)){
1231 printf(
"ERROR: wrong dir(%d)\n", dir);
1238 double* mylink = (
double*)link[dir];
1241 mylink[12] *=
coeff;
1242 mylink[13] *=
coeff;
1243 mylink[14] *=
coeff;
1244 mylink[15] *=
coeff;
1245 mylink[16] *=
coeff;
1246 mylink[17] *=
coeff;
1251 float* mylink = (
float*)link[dir];
1254 mylink[12] *=
coeff;
1255 mylink[13] *=
coeff;
1256 mylink[14] *=
coeff;
1257 mylink[15] *=
coeff;
1258 mylink[16] *=
coeff;
1259 mylink[17] *=
coeff;
1268 for(
int dir= 0;dir < 4;dir++){
1271 float* f = (
float*)link[dir];
1272 if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1273 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1277 double* f = (
double*)link[dir];
1278 if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1279 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1295 template <
typename Float>
1297 const int fail_check = 16;
1298 int fail[fail_check];
1299 for (
int f=0; f<fail_check; f++) fail[f] = 0;
1302 for (
int i=0; i<18; i++) iter[i] = 0;
1304 for(
int dir=0;dir < 4; dir++){
1305 for (
int i=0; i<len; i++) {
1306 for (
int j=0; j<18; j++) {
1308 double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1309 for (
int f=0; f<fail_check; f++)
1310 if (diff >
pow(10.0,-(f+1))) fail[f]++;
1312 if (diff > 1e-3) iter[j]++;
1317 for (
int i=0; i<18; i++)
printfQuda(
"%d fails = %d\n", i, iter[i]);
1319 int accuracy_level = 0;
1320 for(
int f =0; f < fail_check; f++){
1326 for (
int f=0; f<fail_check; f++) {
1327 printfQuda(
"%e Failures: %d / %d = %e\n",
pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (
double)(4*len*18));
1330 return accuracy_level;
1334 compare_link(
void **linkA,
void **linkB,
int len,
QudaPrecision precision)
1339 ret =
compareLink((
double**)linkA, (
double**)linkB, len);
1341 ret =
compareLink((
float**)linkA, (
float**)linkB, len);
1353 for(
int i=0; i < 3;i++){
1354 printVector((
double*)link+ X*gaugeSiteSize + i*6);
1359 for(
int i=0;i < 3;i++){
1360 printVector((
float*)link+X*gaugeSiteSize + i*6);
1366 void **linkB,
const char* msgB,
1370 printLinkElement(linkA[0], 0, prec);
1372 printLinkElement(linkA[0], 1, prec);
1374 printLinkElement(linkA[3], len-1, prec);
1378 printLinkElement(linkB[0], 0, prec);
1380 printLinkElement(linkB[0], 1, prec);
1382 printLinkElement(linkB[3], len-1, prec);
1385 int ret = compare_link(linkA, linkB, len, prec);
1396 temp = malloc(4*V*gaugeSiteSize*gSize);
1398 fprintf(stderr,
"Error: malloc failed for temp in function %s\n", __FUNCTION__);
1404 for(
int i=0;i <
V;i++){
1406 for(
int dir=0;dir < 4;dir++){
1407 double* thismom = (
double*)mom;
1409 thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1410 if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1414 for(
int dir=0;dir < 4;dir++){
1415 float* thismom=(
float*)mom;
1417 thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1418 if (k==momSiteSize-1) thismom[ (4*i+dir)*momSiteSize + k ]= 0.0;
1431 for(
int i=0;i <
V;i++){
1433 for(
int dir=0;dir < 4;dir++){
1434 double* thishw = (
double*)hw;
1436 thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1440 for(
int dir=0;dir < 4;dir++){
1441 float* thishw=(
float*)hw;
1443 thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1453 template <
typename Float>
1455 const int fail_check = 16;
1456 int fail[fail_check];
1457 for (
int f=0; f<fail_check; f++) fail[f] = 0;
1462 for (
int i=0; i<len; i++) {
1463 for (
int j=0; j<momSiteSize-1; j++) {
1464 int is = i*momSiteSize+j;
1465 double diff = fabs(momA[is]-momB[is]);
1466 for (
int f=0; f<fail_check; f++)
1467 if (diff >
pow(10.0,-(f+1))) fail[f]++;
1469 if (diff > 1e-3) iter[j]++;
1473 int accuracy_level = 0;
1474 for(
int f =0; f < fail_check; f++){
1476 accuracy_level =f+1;
1482 for (
int f=0; f<fail_check; f++) {
1483 printfQuda(
"%e Failures: %d / %d = %e\n",
pow(10.0,-(f+1)), fail[f], len*9, fail[f]/(
double)(len*9));
1486 return accuracy_level;
1494 printVector(thismom);
1495 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1498 printVector(thismom);
1499 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1505 printMomElement(momA, 0, prec);
1507 printMomElement(momA, 1, prec);
1509 printMomElement(momA, 2, prec);
1511 printMomElement(momA, 3, prec);
1515 printMomElement(momB, 0, prec);
1517 printMomElement(momB, 1, prec);
1519 printMomElement(momB, 2, prec);
1521 printMomElement(momB, 3, prec);
1526 ret =
compare_mom((
double*)momA, (
double*)momB, len);
1528 ret =
compare_mom((
float*)momA, (
float*)momB, len);
1575 static int dim_partitioned[4] = {0,0,0,0};
1586 printf(
"Usage: %s [options]\n", argv[0]);
1587 printf(
"Common options: \n");
1589 printf(
" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1591 printf(
" --prec <double/single/half> # Precision in GPU\n");
1592 printf(
" --prec_sloppy <double/single/half> # Sloppy precision in GPU\n");
1593 printf(
" --recon <8/9/12/13/18> # Link reconstruction type\n");
1594 printf(
" --recon_sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1595 printf(
" --dagger # Set the dagger to 1 (default 0)\n");
1596 printf(
" --sdim <n> # Set space dimention(X/Y/Z) size\n");
1597 printf(
" --xdim <n> # Set X dimension size(default 24)\n");
1598 printf(
" --ydim <n> # Set X dimension size(default 24)\n");
1599 printf(
" --zdim <n> # Set X dimension size(default 24)\n");
1600 printf(
" --tdim <n> # Set T dimension size(default 24)\n");
1601 printf(
" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1602 printf(
" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1603 printf(
" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1604 printf(
" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1605 printf(
" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1606 printf(
" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1607 printf(
" --kernel-pack-t # Set T dimension kernel packing to be true (default false)\n");
1608 printf(
" --dslash_type <type> # Set the dslash type, the following values are valid\n"
1609 " wilson/clover/twisted_mass/twisted_clover/staggered\n"
1610 " /asqtad/domain_wall/domain_wall_4d/mobius\n");
1611 printf(
" --flavor <type> # Set the twisted mass flavor type (minus (default), plus, deg_doublet, nondeg_doublet)\n");
1612 printf(
" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1613 printf(
" --niter <n> # The number of iterations to perform (default 10)\n");
1614 printf(
" --inv_type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1615 printf(
" --precon_type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified))\n");
1616 printf(
" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1617 printf(
" --mass # Mass of Dirac operator (default 0.1)\n");
1618 printf(
" --mass-normalization # Mass normalization (kappa (default) / mass)\n");
1619 printf(
" --matpc # Matrix preconditioning type (even-even, odd_odd, even_even_asym, odd_odd_asym) \n");
1620 printf(
" --tune <true/false> # Whether to autotune or not (default true)\n");
1621 printf(
" --test # Test method (different for each test)\n");
1622 printf(
" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1623 printf(
" --help # Print out this message\n");
1628 char msg[]=
"single";
1630 printf(
"Note: this program is %s GPU build\n", msg);
1640 char msg[]=
"single";
1647 if( strcmp(argv[i],
"--help")== 0){
1651 if( strcmp(argv[i],
"--verify") == 0){
1656 if (strcmp(argv[i+1],
"true") == 0){
1658 }
else if (strcmp(argv[i+1],
"false") == 0){
1661 fprintf(stderr,
"ERROR: invalid verify type\n");
1670 if( strcmp(argv[i],
"--device") == 0){
1674 device = atoi(argv[i+1]);
1675 if (device < 0 || device > 16){
1676 printf(
"ERROR: Invalid CUDA device number (%d)\n",
device);
1684 if( strcmp(argv[i],
"--prec") == 0){
1694 if( strcmp(argv[i],
"--prec_sloppy") == 0){
1704 if( strcmp(argv[i],
"--recon") == 0){
1714 if( strcmp(argv[i],
"--recon_sloppy") == 0){
1724 if( strcmp(argv[i],
"--xdim") == 0){
1728 xdim= atoi(argv[i+1]);
1729 if (xdim < 0 || xdim > 512){
1730 printf(
"ERROR: invalid X dimension (%d)\n",
xdim);
1738 if( strcmp(argv[i],
"--ydim") == 0){
1742 ydim= atoi(argv[i+1]);
1743 if (ydim < 0 || ydim > 512){
1744 printf(
"ERROR: invalid T dimension (%d)\n",
ydim);
1753 if( strcmp(argv[i],
"--zdim") == 0){
1757 zdim= atoi(argv[i+1]);
1758 if (zdim < 0 || zdim > 512){
1759 printf(
"ERROR: invalid T dimension (%d)\n",
zdim);
1767 if( strcmp(argv[i],
"--tdim") == 0){
1771 tdim = atoi(argv[i+1]);
1772 if (tdim < 0 || tdim > 512){
1773 errorQuda(
"Error: invalid t dimension");
1780 if( strcmp(argv[i],
"--sdim") == 0){
1784 int sdim = atoi(argv[i+1]);
1785 if (sdim < 0 || sdim > 512){
1794 if( strcmp(argv[i],
"--Lsdim") == 0){
1798 int Ls = atoi(argv[i+1]);
1799 if (Ls < 0 || Ls > 128){
1808 if( strcmp(argv[i],
"--dagger") == 0){
1814 if( strcmp(argv[i],
"--partition") == 0){
1819 int value = atoi(argv[i+1]);
1820 for(
int j=0; j < 4;j++){
1821 if (value & (1 << j)){
1823 dim_partitioned[j] = 1;
1827 printfQuda(
"WARNING: Ignoring --partition option since this is a single-GPU build.\n");
1834 if( strcmp(argv[i],
"--kernel-pack-t") == 0){
1841 if( strcmp(argv[i],
"--tune") == 0){
1846 if (strcmp(argv[i+1],
"true") == 0){
1848 }
else if (strcmp(argv[i+1],
"false") == 0){
1851 fprintf(stderr,
"ERROR: invalid tuning type\n");
1860 if( strcmp(argv[i],
"--multishift") == 0){
1865 if (strcmp(argv[i+1],
"true") == 0){
1867 }
else if (strcmp(argv[i+1],
"false") == 0){
1870 fprintf(stderr,
"ERROR: invalid multishift boolean\n");
1879 if( strcmp(argv[i],
"--xgridsize") == 0){
1883 int xsize = atoi(argv[i+1]);
1885 errorQuda(
"ERROR: invalid X grid size");
1893 if( strcmp(argv[i],
"--ygridsize") == 0){
1897 int ysize = atoi(argv[i+1]);
1899 errorQuda(
"ERROR: invalid Y grid size");
1907 if( strcmp(argv[i],
"--zgridsize") == 0){
1911 int zsize = atoi(argv[i+1]);
1913 errorQuda(
"ERROR: invalid Z grid size");
1921 if( strcmp(argv[i],
"--tgridsize") == 0){
1925 int tsize = atoi(argv[i+1]);
1927 errorQuda(
"ERROR: invalid T grid size");
1935 if( strcmp(argv[i],
"--dslash_type") == 0){
1945 if( strcmp(argv[i],
"--flavor") == 0){
1955 if( strcmp(argv[i],
"--inv_type") == 0){
1965 if( strcmp(argv[i],
"--precon_type") == 0){
1975 if( strcmp(argv[i],
"--mass") == 0){
1979 mass= atof(argv[i+1]);
1985 if( strcmp(argv[i],
"--mass-normalization") == 0){
1995 if( strcmp(argv[i],
"--matpc") == 0){
2005 if( strcmp(argv[i],
"--load-gauge") == 0){
2015 if( strcmp(argv[i],
"--test") == 0){
2025 if( strcmp(argv[i],
"--niter") == 0){
2029 niter= atoi(argv[i+1]);
2030 if (niter < 1 || niter > 1e6){
2031 printf(
"ERROR: invalid number of iterations (%d)\n",
niter);
2039 if( strcmp(argv[i],
"--version") == 0){
2040 printf(
"This program is linked with QUDA library, version %s,",
2042 printf(
" %s GPU build\n", msg);
2053 static struct timeval startTime;
2056 gettimeofday(&startTime, NULL);
2060 struct timeval endTime;
2061 gettimeofday(&endTime, NULL);
2063 long ds = endTime.tv_sec - startTime.tv_sec;
2064 long dus = endTime.tv_usec - startTime.tv_usec;
2065 return ds + 0.000001*dus;
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
int dimPartitioned(int dim)
enum QudaMassNormalization_s QudaMassNormalization
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
int compareLink(Float **linkA, Float **linkB, int len)
enum QudaPrecision_s QudaPrecision
QudaTwistFlavorType get_flavor_type(char *s)
void dw_setDims(int *X, const int L5)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
void createHwCPU(void *hw, QudaPrecision precision)
void createMomCPU(void *mom, QudaPrecision precision)
__host__ __device__ ValueType sqrt(ValueType x)
int process_command_line_option(int argc, char **argv, int *idx)
void complexDotProduct(Float *a, Float *b, Float *c)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void su3Construct8(Float *mat)
void constructUnitaryGaugeField(Float **res)
cpuColorSpinorField * spinor
int gridsize_from_cmdline[4]
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
void commDimPartitionedSet(int dir)
QudaReconstructType get_recon(char *s)
QudaInverterType precon_type
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
int fullLatticeIndex(int dim[4], int index, int oddBit)
int compare_mom(Float *momA, Float *momB, int len)
double stopwatchReadSeconds()
QudaInverterType get_solver_type(char *s)
__host__ __device__ ValueType sin(ValueType x)
FloatingPoint< float > Float
void setSpinorSiteSize(int n)
QudaDslashType get_dslash_type(char *s)
void complexAddTo(Float *a, Float *b)
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
enum QudaMatPCType_s QudaMatPCType
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
QudaReconstructType link_recon_sloppy
__constant__ double coeff
QudaMatPCType get_matpc_type(char *s)
enum QudaDagType_s QudaDagType
void su3Construct12(Float *mat)
QudaReconstructType reconstruct
void __attribute__((weak)) usage_extra(char **argv)
void complexProduct(Float *a, Float *b, Float *c)
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
QudaTwistFlavorType twist_flavor
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param)
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
QudaDslashType dslash_type
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
int fullLatticeIndex_5d(int i, int oddBit)
cpuColorSpinorField * out
QudaPrecision get_prec(QIO_Reader *infile)
enum QudaReconstructType_s QudaReconstructType
int fullLatticeIndex_4d(int i, int oddBit)
int x4_from_full_index(int i)
QudaMassNormalization get_mass_normalization_type(char *s)
QudaInverterType inv_type
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
QudaMassNormalization normalization
enum QudaDslashType_s QudaDslashType
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
__host__ __device__ ValueType cos(ValueType x)
void usage_extra(char **argv)
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
const char * get_quda_ver_str()
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
QudaPrecision prec_sloppy
void initComms(int argc, char **argv, const int *commDims)
enum QudaInverterType_s QudaInverterType
void complexConjugateProduct(Float *a, Float *b, Float *c)
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
QudaReconstructType link_recon
enum QudaTwistFlavorType_s QudaTwistFlavorType