9 #elif defined(MPI_COMMS)
49 void initComms(
int argc,
char **argv,
const int *commDims)
51 #if defined(QMP_COMMS)
52 QMP_thread_level_t tl;
53 QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
54 #elif defined(MPI_COMMS)
55 MPI_Init(&argc, &argv);
64 #if defined(QMP_COMMS)
65 QMP_finalize_msg_passing();
66 #elif defined(MPI_COMMS)
76 #if defined(QMP_COMMS)
77 rank = QMP_get_node_number();
78 #elif defined(MPI_COMMS)
79 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
88 for (
int d=0; d< 4; d++) {
93 for (
int i=0; i<4; i++) {
100 Vs_x = X[1]*X[2]*X[3];
101 Vs_y = X[0]*X[2]*X[3];
102 Vs_z = X[0]*X[1]*X[3];
103 Vs_t = X[0]*X[1]*X[2];
111 E1=X[0]+4;
E2=X[1]+4;
E3=X[2]+4;
E4=X[3]+4;
126 for (
int d=0; d< 4; d++)
132 for (
int i=0; i<4; i++) {
154 template <
typename Float>
155 static void printVector(
Float *v) {
156 printfQuda(
"{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
162 for (
int s=0;
s<4;
s++) printVector((
double*)spinor+X*24+
s*6);
164 for (
int s=0;
s<4;
s++) printVector((
float*)spinor+X*24+
s*6);
171 for (
int m=0; m<3; m++) printVector((
double*)gauge +(X/2)*
gaugeSiteSize + m*3*2);
173 for (
int m=0; m<3; m++) printVector((
float*)gauge +(X/2)*
gaugeSiteSize + m*3*2);
177 for (
int m = 0; m < 3; m++) printVector((
double*)gauge + (X/2+
Vh)*
gaugeSiteSize + m*3*2);
179 for (
int m = 0; m < 3; m++) printVector((
float*)gauge + (X/2+
Vh)*
gaugeSiteSize + m*3*2);
185 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
186 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
187 int x2 = (Y/
Z[0]) %
Z[1];
189 return (x4+x3+x2+x1) % 2;
193 template <
typename Float>
200 template <
typename Float>
202 a[0] = b[0]*c[0] - b[1]*c[1];
203 a[1] = b[0]*c[1] + b[1]*c[0];
207 template <
typename Float>
209 a[0] = b[0]*c[0] - b[1]*c[1];
210 a[1] = -b[0]*c[1] - b[1]*c[0];
214 template <
typename Float>
216 a[0] = b[0]*c[0] + b[1]*c[1];
217 a[1] = b[0]*c[1] - b[1]*c[0];
221 template <
typename Float>
223 a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
224 a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
228 template <
typename Float>
230 a[0] += b[0]*c[0] + b[1]*c[1];
231 a[1] += b[0]*c[1] - b[1]*c[0];
234 template <
typename Float>
236 a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
237 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
240 template <
typename Float>
252 template <
typename Float>
254 mat[0] = atan2(mat[1], mat[0]);
255 mat[1] = atan2(mat[13], mat[12]);
256 for (
int i=8; i<18; i++) mat[i] = 0.0;
273 template <
typename Float>
275 Float *u = &mat[0*(3*2)];
276 Float *v = &mat[1*(3*2)];
277 Float *w = &mat[2*(3*2)];
278 w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
287 w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
290 template <
typename Float>
294 row_sum += mat[2]*mat[2];
295 row_sum += mat[3]*mat[3];
296 row_sum += mat[4]*mat[4];
297 row_sum += mat[5]*mat[5];
300 Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
305 mat[0] = U00_mag * cos(mat[14]);
306 mat[1] = U00_mag * sin(mat[14]);
308 Float column_sum = 0.0;
309 for (
int i=0; i<2; i++) column_sum += mat[i]*mat[i];
310 for (
int i=6; i<8; i++) column_sum += mat[i]*mat[i];
311 Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
313 mat[12] = U20_mag * cos(mat[15]);
314 mat[13] = U20_mag * sin(mat[15]);
319 Float r_inv2 = 1.0/(u0*row_sum);
352 else su3Reconstruct12((
float*)mat, dir, ga_idx, param);
355 else su3Reconstruct8((
float*)mat, dir, ga_idx, param);
381 template <
typename Float>
382 static int compareFloats(
Float *a,
Float *b,
int len,
double epsilon) {
383 for (
int i = 0; i < len; i++) {
384 double diff = fabs(a[i] - b[i]);
385 if (diff > epsilon) {
386 printfQuda(
"ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
395 else return compareFloats((
float*)a, (
float*)b, len, epsilon);
421 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
441 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
442 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
443 int x2 = (Y/
Z[0]) %
Z[1];
448 x4 = (x4+dx4+
Z[3]) %
Z[3];
449 x3 = (x3+dx3+
Z[2]) %
Z[2];
450 x2 = (x2+dx2+
Z[1]) %
Z[1];
451 x1 = (x1+dx1+
Z[0]) %
Z[0];
453 return (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) +
x1) / 2;
462 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
463 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
464 int x2 = (Y/
Z[0]) %
Z[1];
467 int ghost_x4 = x4+ dx4;
471 x4 = (x4+dx4+
Z[3]) %
Z[3];
472 x3 = (x3+dx3+
Z[2]) %
Z[2];
473 x2 = (x2+dx2+
Z[1]) %
Z[1];
474 x1 = (x1+dx1+
Z[0]) %
Z[0];
476 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
477 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(Z[0]) + x1) / 2;
479 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) +
x1) / 2;
502 int nbr_half_idx =
neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1);
503 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
507 int ret = nbr_half_idx;
509 ret =
Vh + nbr_half_idx;
528 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
529 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
530 int x2 = (Y/
Z[0]) %
Z[1];
532 int ghost_x4 = x4+ dx4;
534 x4 = (x4+dx4+
Z[3]) %
Z[3];
535 x3 = (x3+dx3+
Z[2]) %
Z[2];
536 x2 = (x2+dx2+
Z[1]) %
Z[1];
537 x1 = (x1+dx1+
Z[0]) %
Z[0];
539 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
540 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(Z[0]) + x1) / 2;
542 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) +
x1) / 2;
546 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
565 if (i >=
Vh || i < 0) {printf(
"i out of range in fullLatticeIndex_4d"); exit(-1);}
584 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
599 int boundaryCrossings = i/(
Z[0]/2) + i/(
Z[1]*
Z[0]/2) + i/(
Z[2]*
Z[1]*
Z[0]/2) + i/(
Z[3]*
Z[2]*
Z[1]*
Z[0]/2);
600 return 2*i + (boundaryCrossings + oddBit) % 2;
614 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
619 template <
typename Float>
622 for (
int d = 0; d < 3; d++) {
632 bool last_node_in_t =
true;
637 for (
int j = (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1); j <
Vh; j++) {
639 gauge[3][j*gaugeSiteSize+i] *= -1.0;
640 gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0;
648 int iMax = ( last_node_in_t ? (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1) : Vh );
652 for (
int i = 0; i< iMax; i++) {
653 for (
int m = 0; m < 3; m++) {
654 for (
int n = 0; n < 3; n++) {
655 even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
656 even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
657 odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
658 odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
665 template <
typename Float>
669 int X1h=param->
X[0]/2;
676 for(
int d=0; d<4; d++){
683 for (
int d = 0; d < 3; d++) {
686 for (
int i = 0; i <
Vh; i++) {
689 int i4 = index /(X3*X2*
X1);
690 int i3 = (index - i4*(X3*X2*
X1))/(X2*
X1);
691 int i2 = (index - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
692 int i1 = index - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
702 if ((i4+i1) % 2 == 1){
707 if ( (i4+i1+i2) % 2 == 1){
712 for (
int j=0;j < 6; j++){
713 gauge[d][i*gaugeSiteSize + 12+ j] *= sign;
717 for (
int i = 0; i <
Vh; i++) {
719 int i4 = index /(X3*X2*
X1);
720 int i3 = (index - i4*(X3*X2*
X1))/(X2*
X1);
721 int i2 = (index - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
722 int i1 = index - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
732 if ((i4+i1) % 2 == 1){
737 if ( (i4+i1+i2) % 2 == 1){
742 for (
int j=0;j < 6; j++){
743 gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign;
751 for (
int j = 0; j <
Vh; j++) {
753 if (j >= (X4-3)*X1h*X2*
X3 ){
757 for (
int i = 0; i < 6; i++) {
758 gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign;
759 gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign;
767 template <
typename Float>
769 Float *resOdd[4], *resEven[4];
770 for (
int dir = 0; dir < 4; dir++) {
775 for (
int dir = 0; dir < 4; dir++) {
776 for (
int i = 0; i <
Vh; i++) {
777 for (
int m = 0; m < 3; m++) {
778 for (
int n = 0; n < 3; n++) {
779 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
780 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
781 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0;
782 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0;
788 applyGaugeFieldScaling(res, Vh, param);
792 template <
typename Float>
795 for (
int i=0; i<len; i++) sum +=
norm(a[i]);
796 for (
int i=0; i<len; i++) a[i] /= sqrt(sum);
800 template <
typename Float>
803 for (
int i=0; i<len; i++) dot +=
conj(a[i])*b[i];
807 template <
typename Float>
809 Float *resOdd[4], *resEven[4];
810 for (
int dir = 0; dir < 4; dir++) {
815 for (
int dir = 0; dir < 4; dir++) {
816 for (
int i = 0; i <
Vh; i++) {
817 for (
int m = 1; m < 3; m++) {
818 for (
int n = 0; n < 3; n++) {
819 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
820 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
821 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
822 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
834 Float *w = resEven[
dir]+(i*3+0)*3*2;
835 Float *u = resEven[
dir]+(i*3+1)*3*2;
836 Float *v = resEven[
dir]+(i*3+2)*3*2;
838 for (
int n = 0; n < 6; n++) w[n] = 0.0;
852 for (
int n = 0; n < 6; n++) w[n] = 0.0;
865 applyGaugeFieldScaling(res, Vh, param);
869 for (
int dir = 0; dir < 4; dir++){
870 for (
int i = 0; i <
Vh; i++) {
871 for (
int m = 0; m < 3; m++) {
872 for (
int n = 0; n < 3; n++) {
873 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] =1.0* rand() / (
Float)RAND_MAX;
874 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] =2.0* rand() / (
Float)RAND_MAX;
875 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = 3.0*rand() / (
Float)RAND_MAX;
876 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 4.0*rand() / (
Float)RAND_MAX;
886 template <
typename Float>
889 Float *resOdd[4], *resEven[4];
890 for (
int dir = 0; dir < 4; dir++) {
895 for (
int dir = 0; dir < 4; dir++) {
896 for (
int i = 0; i <
Vh; i++) {
897 for (
int m = 1; m < 3; m++) {
898 for (
int n = 0; n < 3; n++) {
899 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
900 resEven[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
901 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (
Float)RAND_MAX;
902 resOdd[
dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (
Float)RAND_MAX;
914 Float *w = resEven[
dir]+(i*3+0)*3*2;
915 Float *u = resEven[
dir]+(i*3+1)*3*2;
916 Float *v = resEven[
dir]+(i*3+2)*3*2;
918 for (
int n = 0; n < 6; n++) w[n] = 0.0;
932 for (
int n = 0; n < 6; n++) w[n] = 0.0;
949 else constructUnitGaugeField((
float**)gauge, param);
950 }
else if (type == 1) {
952 else constructGaugeField((
float**)gauge, param);
955 else applyGaugeFieldScaling((
float**)gauge, Vh, param);
966 constructUnitGaugeField((
double**)fatlink, param);
967 constructUnitGaugeField((
double**)longlink, param);
969 constructUnitGaugeField((
float**)fatlink, param);
970 constructUnitGaugeField((
float**)longlink, param);
975 constructGaugeField((
double**)fatlink, param);
977 constructGaugeField((
double**)longlink, param);
980 constructGaugeField((
float**)fatlink, param);
982 constructGaugeField((
float**)longlink, param);
988 template <
typename Float>
989 static void constructCloverField(
Float *res,
double norm,
double diag) {
991 Float c = 2.0 * norm / RAND_MAX;
993 for(
int i = 0; i <
V; i++) {
994 for (
int j = 0; j < 72; j++) {
995 res[i*72 + j] = c*rand() -
norm;
997 for (
int j = 0; j< 6; j++) {
998 res[i*72 + j] += diag;
999 res[i*72 + j+36] += diag;
1007 else constructCloverField((
float *)clover, norm, diag);
1022 template <
typename Float>
1025 const int fail_check = 17;
1026 int fail[4][fail_check];
1028 for (
int d=0; d<4; d++)
for (
int i=0; i<fail_check; i++) fail[d][i] = 0;
1029 for (
int d=0; d<4; d++)
for (
int i=0; i<18; i++) iter[d][i] = 0;
1031 for (
int d=0; d<4; d++) {
1032 for (
int eo=0; eo<2; eo++) {
1033 for (
int i=0; i<
Vh; i++) {
1034 int ga_idx = (eo*Vh+i);
1035 for (
int j=0; j<18; j++) {
1036 double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]);
1038 for (
int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[d][f]++;
1039 if (diff > epsilon) iter[d][j]++;
1045 printf(
"Component fails (X, Y, Z, T)\n");
1046 for (
int i=0; i<18; i++) printf(
"%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1048 printf(
"\nDeviation Failures = (X, Y, Z, T)\n");
1049 for (
int f=0; f<fail_check; f++) {
1050 printf(
"%e Failures = (%9d, %9d, %9d, %9d) = (%e, %e, %e, %e)\n", pow(10.0,-(f+1)),
1051 fail[0][f], fail[1][f], fail[2][f], fail[3][f],
1052 fail[0][f]/(
double)(V*18), fail[1][f]/(
double)(V*18), fail[2][f]/(
double)(V*18), fail[3][f]/(
double)(V*18));
1059 checkGauge((
double**)oldG, (
double**)newG, epsilon);
1061 checkGauge((
float**)oldG, (
float**)newG, epsilon);
1080 bool last_node_in_t =
true;
1085 for(
int i=0;i <
V;i++){
1086 for(
int dir =
XUP; dir <=
TUP; dir++){
1100 int i4 = full_idx /(X3*X2*
X1);
1101 int i3 = (full_idx - i4*(X3*X2*
X1))/(X2*
X1);
1102 int i2 = (full_idx - i4*(X3*X2*
X1) - i3*(X2*X1))/X1;
1103 int i1 = full_idx - i4*(X3*X2*
X1) - i3*(X2*X1) - i2*
X1;
1108 if ( (i4 & 1) != 0){
1114 if ( ((i4+i1) & 1) != 0){
1120 if ( ((i4+i1+i2) & 1) != 0){
1126 if (last_node_in_t && i4 == (X4-1)){
1132 printf(
"ERROR: wrong dir(%d)\n", dir);
1139 double* mylink = (
double*)link[dir];
1142 mylink[12] *=
coeff;
1143 mylink[13] *=
coeff;
1144 mylink[14] *=
coeff;
1145 mylink[15] *=
coeff;
1146 mylink[16] *=
coeff;
1147 mylink[17] *=
coeff;
1152 float* mylink = (
float*)link[dir];
1155 mylink[12] *=
coeff;
1156 mylink[13] *=
coeff;
1157 mylink[14] *=
coeff;
1158 mylink[15] *=
coeff;
1159 mylink[16] *=
coeff;
1160 mylink[17] *=
coeff;
1169 for(
int dir= 0;dir < 4;dir++){
1172 float* f = (
float*)link[dir];
1173 if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){
1174 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1178 double* f = (
double*)link[dir];
1179 if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){
1180 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__);
1196 template <
typename Float>
1198 const int fail_check = 16;
1199 int fail[fail_check];
1200 for (
int f=0; f<fail_check; f++) fail[f] = 0;
1203 for (
int i=0; i<18; i++) iter[i] = 0;
1205 for(
int dir=0;dir < 4; dir++){
1206 for (
int i=0; i<len; i++) {
1207 for (
int j=0; j<18; j++) {
1209 double diff = fabs(linkA[dir][is]-linkB[dir][is]);
1210 for (
int f=0; f<fail_check; f++)
1211 if (diff > pow(10.0,-(f+1))) fail[f]++;
1213 if (diff > 1e-3) iter[j]++;
1218 for (
int i=0; i<18; i++)
printfQuda(
"%d fails = %d\n", i, iter[i]);
1220 int accuracy_level = 0;
1221 for(
int f =0; f < fail_check; f++){
1227 for (
int f=0; f<fail_check; f++) {
1228 printfQuda(
"%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], 4*len*18, fail[f] / (
double)(4*len*18));
1231 return accuracy_level;
1235 compare_link(
void **linkA,
void **linkB,
int len,
QudaPrecision precision)
1240 ret =
compareLink((
double**)linkA, (
double**)linkB, len);
1242 ret =
compareLink((
float**)linkA, (
float**)linkB, len);
1254 for(
int i=0; i < 3;i++){
1255 printVector((
double*)link+ X*gaugeSiteSize + i*6);
1260 for(
int i=0;i < 3;i++){
1261 printVector((
float*)link+X*gaugeSiteSize + i*6);
1267 void **linkB,
const char* msgB,
1271 printLinkElement(linkA[0], 0, prec);
1273 printLinkElement(linkA[0], 1, prec);
1275 printLinkElement(linkA[3], len-1, prec);
1279 printLinkElement(linkB[0], 0, prec);
1281 printLinkElement(linkB[0], 1, prec);
1283 printLinkElement(linkB[3], len-1, prec);
1286 int ret = compare_link(linkA, linkB, len, prec);
1297 temp = malloc(4*V*gaugeSiteSize*gSize);
1299 fprintf(stderr,
"Error: malloc failed for temp in function %s\n", __FUNCTION__);
1305 for(
int i=0;i <
V;i++){
1307 for(
int dir=0;dir < 4;dir++){
1308 double* thismom = (
double*)mom;
1310 thismom[ (4*i+
dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1311 if (k==momSiteSize-1) thismom[ (4*i+
dir)*momSiteSize + k ]= 0.0;
1315 for(
int dir=0;dir < 4;dir++){
1316 float* thismom=(
float*)mom;
1318 thismom[ (4*i+
dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX;
1319 if (k==momSiteSize-1) thismom[ (4*i+
dir)*momSiteSize + k ]= 0.0;
1332 for(
int i=0;i <
V;i++){
1334 for(
int dir=0;dir < 4;dir++){
1335 double* thishw = (
double*)hw;
1337 thishw[ (4*i+
dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1341 for(
int dir=0;dir < 4;dir++){
1342 float* thishw=(
float*)hw;
1344 thishw[ (4*i+
dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX;
1354 template <
typename Float>
1356 const int fail_check = 16;
1357 int fail[fail_check];
1358 for (
int f=0; f<fail_check; f++) fail[f] = 0;
1363 for (
int i=0; i<len; i++) {
1365 int is = i*momSiteSize+j;
1366 double diff = fabs(momA[is]-momB[is]);
1367 for (
int f=0; f<fail_check; f++)
1368 if (diff > pow(10.0,-(f+1))) fail[f]++;
1370 if (diff > 1e-3) iter[j]++;
1374 int accuracy_level = 0;
1375 for(
int f =0; f < fail_check; f++){
1377 accuracy_level =f+1;
1383 for (
int f=0; f<fail_check; f++) {
1384 printfQuda(
"%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*momSiteSize, fail[f] / (
double)(len*6));
1387 return accuracy_level;
1395 printVector(thismom);
1396 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1399 printVector(thismom);
1400 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1406 printMomElement(momA, 0, prec);
1408 printMomElement(momA, 1, prec);
1410 printMomElement(momA, 2, prec);
1412 printMomElement(momA, 3, prec);
1416 printMomElement(momB, 0, prec);
1418 printMomElement(momB, 1, prec);
1420 printMomElement(momB, 2, prec);
1422 printMomElement(momB, 3, prec);
1427 ret =
compare_mom((
double*)momA, (
double*)momB, len);
1429 ret =
compare_mom((
float*)momA, (
float*)momB, len);
1467 static int dim_partitioned[4] = {0,0,0,0};
1479 printf(
"Usage: %s [options]\n", argv[0]);
1480 printf(
"Common options: \n");
1482 printf(
" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1484 printf(
" --prec <double/single/half> # Precision in GPU\n");
1485 printf(
" --prec_sloppy <double/single/half> # Sloppy precision in GPU\n");
1486 printf(
" --recon <8/12/18> # Link reconstruction type\n");
1487 printf(
" --recon_sloppy <8/12/18> # Sloppy link reconstruction type\n");
1488 printf(
" --dagger # Set the dagger to 1 (default 0)\n");
1489 printf(
" --sdim <n> # Set space dimention(X/Y/Z) size\n");
1490 printf(
" --xdim <n> # Set X dimension size(default 24)\n");
1491 printf(
" --ydim <n> # Set X dimension size(default 24)\n");
1492 printf(
" --zdim <n> # Set X dimension size(default 24)\n");
1493 printf(
" --tdim <n> # Set T dimension size(default 24)\n");
1494 printf(
" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1495 printf(
" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1496 printf(
" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1497 printf(
" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1498 printf(
" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1499 printf(
" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1500 printf(
" --kernel_pack_t # Set T dimension kernel packing to be true (default false)\n");
1501 printf(
" --dslash_type <type> # Set the dslash type, the following values are valid\n"
1502 " wilson/clover/twisted_mass/asqtad/domain_wall\n");
1503 printf(
" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1504 printf(
" --niter <n> # The number of iterations to perform (default 10)\n");
1505 printf(
" --tune <true/false> # Whether to autotune or not (default true)\n");
1506 printf(
" --test # Test method (different for each test)\n");
1507 printf(
" --help # Print out this message\n");
1512 char msg[]=
"single";
1514 printf(
"Note: this program is %s GPU build\n", msg);
1524 char msg[]=
"single";
1531 if( strcmp(argv[i],
"--help")== 0){
1535 if( strcmp(argv[i],
"--device") == 0){
1539 device = atoi(argv[i+1]);
1540 if (device < 0 || device > 16){
1541 printf(
"ERROR: Invalid CUDA device number (%d)\n",
device);
1549 if( strcmp(argv[i],
"--prec") == 0){
1559 if( strcmp(argv[i],
"--prec_sloppy") == 0){
1569 if( strcmp(argv[i],
"--recon") == 0){
1579 if( strcmp(argv[i],
"--recon_sloppy") == 0){
1589 if( strcmp(argv[i],
"--xdim") == 0){
1593 xdim= atoi(argv[i+1]);
1594 if (xdim < 0 || xdim > 512){
1595 printf(
"ERROR: invalid X dimension (%d)\n",
xdim);
1603 if( strcmp(argv[i],
"--ydim") == 0){
1607 ydim= atoi(argv[i+1]);
1608 if (ydim < 0 || ydim > 512){
1609 printf(
"ERROR: invalid T dimension (%d)\n",
ydim);
1618 if( strcmp(argv[i],
"--zdim") == 0){
1622 zdim= atoi(argv[i+1]);
1623 if (zdim < 0 || zdim > 512){
1624 printf(
"ERROR: invalid T dimension (%d)\n",
zdim);
1632 if( strcmp(argv[i],
"--tdim") == 0){
1636 tdim = atoi(argv[i+1]);
1637 if (tdim < 0 || tdim > 512){
1638 errorQuda(
"Error: invalid t dimension");
1645 if( strcmp(argv[i],
"--sdim") == 0){
1649 int sdim = atoi(argv[i+1]);
1650 if (sdim < 0 || sdim > 512){
1659 if( strcmp(argv[i],
"--Lsdim") == 0){
1663 int Ls = atoi(argv[i+1]);
1664 if (Ls < 0 || Ls > 128){
1673 if( strcmp(argv[i],
"--dagger") == 0){
1679 if( strcmp(argv[i],
"--partition") == 0){
1684 int value = atoi(argv[i+1]);
1685 for(
int j=0; j < 4;j++){
1686 if (value & (1 << j)){
1688 dim_partitioned[j] = 1;
1692 printfQuda(
"WARNING: Ignoring --partition option since this is a single-GPU build.\n");
1699 if( strcmp(argv[i],
"--kernel_pack_t") == 0){
1706 if( strcmp(argv[i],
"--tune") == 0){
1711 if (strcmp(argv[i+1],
"true") == 0){
1713 }
else if (strcmp(argv[i+1],
"false") == 0){
1716 fprintf(stderr,
"ERROR: invalid tuning type\n");
1725 if( strcmp(argv[i],
"--xgridsize") == 0){
1729 int xsize = atoi(argv[i+1]);
1731 errorQuda(
"ERROR: invalid X grid size");
1739 if( strcmp(argv[i],
"--ygridsize") == 0){
1743 int ysize = atoi(argv[i+1]);
1745 errorQuda(
"ERROR: invalid Y grid size");
1753 if( strcmp(argv[i],
"--zgridsize") == 0){
1757 int zsize = atoi(argv[i+1]);
1759 errorQuda(
"ERROR: invalid Z grid size");
1767 if( strcmp(argv[i],
"--tgridsize") == 0){
1771 int tsize = atoi(argv[i+1]);
1773 errorQuda(
"ERROR: invalid T grid size");
1781 if( strcmp(argv[i],
"--dslash_type") == 0){
1791 if( strcmp(argv[i],
"--load-gauge") == 0){
1801 if( strcmp(argv[i],
"--test") == 0){
1811 if( strcmp(argv[i],
"--niter") == 0){
1815 niter= atoi(argv[i+1]);
1816 if (niter < 1 || niter > 1e6){
1817 printf(
"ERROR: invalid number of iterations (%d)\n",
niter);
1825 if( strcmp(argv[i],
"--version") == 0){
1826 printf(
"This program is linked with QUDA library, version %s,",
1828 printf(
" %s GPU build\n", msg);
1839 static struct timeval startTime;
1842 gettimeofday(&startTime, NULL);
1846 struct timeval endTime;
1847 gettimeofday(&endTime, NULL);
1849 long ds = endTime.tv_sec - startTime.tv_sec;
1850 long dus = endTime.tv_usec - startTime.tv_usec;
1851 return ds + 0.000001*dus;