27 #define MAX(a, b) ((a) > (b) ? (a) : (b))
153 }
else if (type == 1) {
172 int construct_type = 0;
220 for (
int d = 0; d < 4; d++) cs_param->
x[d] =
gauge_param->
X[d];
222 if (pc) cs_param->
x[0] /= 2;
242 param.setPrecision(precision);
249 for (
int d = 0; d < 4; d++)
param.x[d] = x[d];
255 void initComms(
int argc,
char **argv, std::array<int, 4> &commDims) {
initComms(argc, argv, commDims.data()); }
257 void initComms(
int argc,
char **argv,
int *
const commDims)
259 if (getenv(
"QUDA_TEST_GRID_SIZE")) {
get_size_from_env(commDims,
"QUDA_TEST_GRID_SIZE"); }
262 #if defined(QMP_COMMS)
263 QMP_thread_level_t tl;
264 QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
268 int map[] = {3, 2, 1, 0};
269 QMP_declare_logical_topology_map(commDims, 4,
map, 4);
271 int map[] = {0, 1, 2, 3};
272 QMP_declare_logical_topology_map(commDims, 4,
map, 4);
274 #elif defined(MPI_COMMS)
282 for (
int d = 0; d < 4; d++) {
295 #if defined(QMP_COMMS)
296 QMP_finalize_msg_passing();
297 #elif defined(MPI_COMMS)
306 #if defined(QMP_COMMS)
307 rank = QMP_get_node_number();
308 #elif defined(MPI_COMMS)
309 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
312 srand(17 * rank + 137);
318 for (
int d = 0; d < 4; d++) {
323 for (
int i = 0; i < 4; i++) {
324 if (i == d)
continue;
330 Vs_x = X[1] * X[2] * X[3];
331 Vs_y = X[0] * X[2] * X[3];
332 Vs_z = X[0] * X[1] * X[3];
333 Vs_t = X[0] * X[1] * X[2];
356 for (
int d = 0; d < 4; d++) {
361 for (
int i = 0; i < 4; i++) {
362 if (i == d)
continue;
390 return (((coordinate[3] *
dim[2] + coordinate[2]) *
dim[1] + coordinate[1]) *
dim[0] + coordinate[0]) >> 1;
394 const int shift[4],
int parity)
397 aux[0] = shrinked_index * 2;
399 for (
int i = 0; i < 3; i++) { aux[i + 1] = aux[i] / shrinked_dim[i]; }
401 coordinate[0] = aux[0] - aux[1] * shrinked_dim[0];
402 coordinate[1] = aux[1] - aux[2] * shrinked_dim[1];
403 coordinate[2] = aux[2] - aux[3] * shrinked_dim[2];
404 coordinate[3] = aux[3];
407 coordinate[0] += (
parity + coordinate[3] + coordinate[2] + coordinate[1]) & 1;
410 for (
int d = 0; d < 4; d++) { coordinate[d] += shift[d]; }
426 int x4 = Y / (
Z[2] *
Z[1] *
Z[0]);
427 int x3 = (Y / (
Z[1] *
Z[0])) %
Z[2];
428 int x2 = (Y /
Z[0]) %
Z[1];
433 x4 = (x4 + dx4 +
Z[3]) %
Z[3];
434 x3 = (x3 + dx3 +
Z[2]) %
Z[2];
435 x2 = (x2 + dx2 +
Z[1]) %
Z[1];
436 x1 = (x1 + dx1 +
Z[0]) %
Z[0];
438 return (x4 * (
Z[2] *
Z[1] *
Z[0]) + x3 * (
Z[1] *
Z[0]) + x2 * (
Z[0]) + x1) / 2;
447 x[3] = fullIndex / (
dim[2] *
dim[1] *
dim[0]);
448 x[2] = (fullIndex / (
dim[1] *
dim[0])) %
dim[2];
449 x[1] = (fullIndex /
dim[0]) %
dim[1];
450 x[0] = fullIndex %
dim[0];
452 for (
int dir = 0; dir < 4; ++dir) x[dir] = (x[dir] + dx[dir] +
dim[dir]) %
dim[dir];
454 return (((x[3] *
dim[2] + x[2]) *
dim[1] + x[1]) *
dim[0] + x[0]) / 2;
462 int x4 = Y / (
Z[2] *
Z[1] *
Z[0]);
463 int x3 = (Y / (
Z[1] *
Z[0])) %
Z[2];
464 int x2 = (Y /
Z[0]) %
Z[1];
467 int ghost_x4 = x4 + dx4;
471 x4 = (x4 + dx4 +
Z[3]) %
Z[3];
472 x3 = (x3 + dx3 +
Z[2]) %
Z[2];
473 x2 = (x2 + dx2 +
Z[1]) %
Z[1];
474 x1 = (x1 + dx1 +
Z[0]) %
Z[0];
477 ret = (x4 * (
Z[2] *
Z[1] *
Z[0]) + x3 * (
Z[1] *
Z[0]) + x2 * (
Z[0]) + x1) / 2;
479 ret = (x3 * (
Z[1] *
Z[0]) + x2 * (
Z[0]) + x1) / 2;
499 int nbr_half_idx =
neighborIndex(half_idx, oddBit, dx4, dx3, dx2, dx1);
500 int oddBitChanged = (dx4 + dx3 + dx2 + dx1) % 2;
501 if (oddBitChanged) { oddBit = 1 - oddBit; }
502 int ret = nbr_half_idx;
503 if (oddBit) { ret =
Vh + nbr_half_idx; }
511 const int halfVolume = volume / 2;
513 int halfIndex = index;
515 if (index >= halfVolume) {
517 halfIndex = index - halfVolume;
522 int oddBitChanged = (dx[0] + dx[1] + dx[2] + dx[3]) % 2;
523 if (oddBitChanged) { oddBit = 1 - oddBit; }
525 return neighborHalfIndex + oddBit * halfVolume;
539 int x4 = Y / (
Z[2] *
Z[1] *
Z[0]);
540 int x3 = (Y / (
Z[1] *
Z[0])) %
Z[2];
541 int x2 = (Y /
Z[0]) %
Z[1];
543 int ghost_x4 = x4 + dx4;
545 x4 = (x4 + dx4 +
Z[3]) %
Z[3];
546 x3 = (x3 + dx3 +
Z[2]) %
Z[2];
547 x2 = (x2 + dx2 +
Z[1]) %
Z[1];
548 x1 = (x1 + dx1 +
Z[0]) %
Z[0];
550 if (ghost_x4 >= 0 && ghost_x4 <
Z[3]) {
551 ret = (x4 * (
Z[2] *
Z[1] *
Z[0]) + x3 * (
Z[1] *
Z[0]) + x2 * (
Z[0]) + x1) / 2;
553 ret = (x3 * (
Z[1] *
Z[0]) + x2 * (
Z[0]) + x1) / 2;
557 int oddBitChanged = (dx4 + dx3 + dx2 + dx1) % 2;
558 if (oddBitChanged) { oddBit = 1 - oddBit; }
560 if (oddBit) { ret +=
Vh; }
594 int za = index / (
dim[0] >> 1);
595 int zb = za /
dim[1];
596 int x2 = za - zb *
dim[1];
597 int x4 = zb /
dim[2];
598 int x3 = zb - x4 *
dim[2];
600 return 2 * index + ((x2 + x3 + x4 + oddBit) & 1);
622 int x2 = za - zb * X2;
624 int x3 = zb - x4 * X3;
625 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
627 int X = 2 * sid + x1odd;
647 char *grid_size_env = getenv(env);
649 std::stringstream grid_list(grid_size_env);
653 while (grid_list >>
dim) {
654 if (i >= 4)
errorQuda(
"Unexpected grid size array length");
656 if (grid_list.peek() ==
',') grid_list.ignore();
664 int rank = coords[0];
671 int rank = coords[3];
678 printfQuda(
"{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
684 int x4 = Y / (
Z[2] *
Z[1] *
Z[0]);
685 int x3 = (Y / (
Z[1] *
Z[0])) %
Z[2];
686 int x2 = (Y /
Z[0]) %
Z[1];
688 return (x4 + x3 + x2 + x1) % 2;
701 a[0] = b[0] * c[0] - b[1] * c[1];
702 a[1] = b[0] * c[1] + b[1] * c[0];
708 a[0] = b[0] * c[0] - b[1] * c[1];
709 a[1] = -b[0] * c[1] - b[1] * c[0];
715 a[0] = b[0] * c[0] + b[1] * c[1];
716 a[1] = b[0] * c[1] - b[1] * c[0];
722 a[0] += sign * (b[0] * c[0] - b[1] * c[1]);
723 a[1] += sign * (b[0] * c[1] + b[1] * c[0]);
729 a[0] += b[0] * c[0] + b[1] * c[1];
730 a[1] += b[0] * c[1] - b[1] * c[0];
735 a[0] += sign * (b[0] * c[0] - b[1] * c[1]);
736 a[1] -= sign * (b[0] * c[1] + b[1] * c[0]);
755 for (
int i = 8; i < 18; i++)
mat[i] = 0.0;
807 row_sum +=
mat[2] *
mat[2];
808 row_sum +=
mat[3] *
mat[3];
809 row_sum +=
mat[4] *
mat[4];
810 row_sum +=
mat[5] *
mat[5];
812 Float U00_mag =
sqrt(1.f / (u0 * u0) - row_sum);
820 Float column_sum = 0.0;
821 for (
int i = 0; i < 2; i++) column_sum +=
mat[i] *
mat[i];
822 for (
int i = 6; i < 8; i++) column_sum +=
mat[i] *
mat[i];
823 Float U20_mag =
sqrt(1.f / (u0 * u0) - column_sum);
831 Float r_inv2 = 1.0 / (u0 * row_sum);
866 su3Reconstruct12((
double *)
mat, dir, ga_idx,
param);
868 su3Reconstruct12((
float *)
mat, dir, ga_idx,
param);
871 su3Reconstruct8((
double *)
mat, dir, ga_idx,
param);
873 su3Reconstruct8((
float *)
mat, dir, ga_idx,
param);
877 template <
typename Float>
static int compareFloats(
Float *a,
Float *b,
int len,
double epsilon)
879 for (
int i = 0; i < len; i++) {
880 double diff = fabs(a[i] - b[i]);
881 if (diff >
epsilon || std::isnan(diff)) {
882 printfQuda(
"ERROR: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]);
892 return compareFloats((
double *)a, (
double *)b, len,
epsilon);
894 return compareFloats((
float *)a, (
float *)b, len,
epsilon);
904 if (i >=
Vh || i < 0) {
905 printf(
"i out of range in fullLatticeIndex_4d");
923 int x2 = za - zb * X2;
925 int x3 = zb - x4 * X3;
926 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
928 int X = 2 * sid + x1odd;
942 int boundaryCrossings
943 = i / (
Z[0] / 2) + i / (
Z[1] *
Z[0] / 2) + i / (
Z[2] *
Z[1] *
Z[0] / 2) + i / (
Z[3] *
Z[2] *
Z[1] *
Z[0] / 2);
944 return 2 * i + (boundaryCrossings + oddBit) % 2;
949 int boundaryCrossings = i / (
Z[0] / 2) + i / (
Z[1] *
Z[0] / 2) + i / (
Z[2] *
Z[1] *
Z[0] / 2);
950 return 2 * i + (boundaryCrossings + oddBit) % 2;
963 int x4 = Y / (
Z[2] *
Z[1] *
Z[0]);
971 for (
int d = 0; d < 3; d++) {
977 for (
int j = (
Z[0] / 2) *
Z[1] *
Z[2] * (
Z[3] - 1); j <
Vh; j++) {
990 Float *even = gauge[dir];
992 for (
int i = 0; i < iMax; i++) {
993 for (
int m = 0; m < 3; m++) {
994 for (
int n = 0; n < 3; n++) {
995 even[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
996 even[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
997 odd[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
998 odd[i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
1008 Float *resOdd[4], *resEven[4];
1009 for (
int dir = 0; dir < 4; dir++) {
1010 resEven[dir] = res[dir];
1014 for (
int dir = 0; dir < 4; dir++) {
1015 for (
int i = 0; i <
Vh; i++) {
1016 for (
int m = 0; m < 3; m++) {
1017 for (
int n = 0; n < 3; n++) {
1018 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
1019 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
1020 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = (m == n) ? 1 : 0;
1021 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 0.0;
1034 template <
typename Float>
static void normalize(
complex<Float> *a,
int len)
1037 for (
int i = 0; i < len; i++)
sum +=
norm(a[i]);
1038 for (
int i = 0; i < len; i++) a[i] /=
sqrt(
sum);
1045 for (
int i = 0; i < len; i++) dot +=
conj(a[i]) * b[i];
1046 for (
int i = 0; i < len; i++) b[i] -= (
complex<Float>)dot * a[i];
1051 Float *resOdd[4], *resEven[4];
1052 for (
int dir = 0; dir < 4; dir++) {
1053 resEven[dir] = res[dir];
1057 for (
int dir = 0; dir < 4; dir++) {
1058 for (
int i = 0; i <
Vh; i++) {
1059 for (
int m = 1; m < 3; m++) {
1060 for (
int n = 0; n < 3; n++) {
1061 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (
Float)RAND_MAX;
1062 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (
Float)RAND_MAX;
1063 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (
Float)RAND_MAX;
1064 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (
Float)RAND_MAX;
1067 normalize((
complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2), 3);
1068 orthogonalize((
complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2),
1070 normalize((
complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1072 normalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2), 3);
1073 orthogonalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2),
1075 normalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1078 Float *w = resEven[dir] + (i * 3 + 0) * 3 * 2;
1079 Float *u = resEven[dir] + (i * 3 + 1) * 3 * 2;
1080 Float *v = resEven[dir] + (i * 3 + 2) * 3 * 2;
1082 for (
int n = 0; n < 6; n++) w[n] = 0.0;
1092 Float *w = resOdd[dir] + (i * 3 + 0) * 3 * 2;
1093 Float *u = resOdd[dir] + (i * 3 + 1) * 3 * 2;
1094 Float *v = resOdd[dir] + (i * 3 + 2) * 3 * 2;
1096 for (
int n = 0; n < 6; n++) w[n] = 0.0;
1112 for (
int dir = 0; dir < 4; dir++) {
1113 for (
int i = 0; i <
Vh; i++) {
1114 for (
int m = 0; m < 3; m++) {
1115 for (
int n = 0; n < 3; n++) {
1116 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = 1.0 * rand() / (
Float)RAND_MAX;
1117 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 2.0 * rand() / (
Float)RAND_MAX;
1118 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = 3.0 * rand() / (
Float)RAND_MAX;
1119 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = 4.0 * rand() / (
Float)RAND_MAX;
1132 Float *resOdd[4], *resEven[4];
1133 for (
int dir = 0; dir < 4; dir++) {
1134 resEven[dir] = res[dir];
1138 for (
int dir = 0; dir < 4; dir++) {
1139 for (
int i = 0; i <
Vh; i++) {
1140 for (
int m = 1; m < 3; m++) {
1141 for (
int n = 0; n < 3; n++) {
1142 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (
Float)RAND_MAX;
1143 resEven[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (
Float)RAND_MAX;
1144 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 0] = rand() / (
Float)RAND_MAX;
1145 resOdd[dir][i * (3 * 3 * 2) + m * (3 * 2) + n * (2) + 1] = rand() / (
Float)RAND_MAX;
1148 normalize((
complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2), 3);
1149 orthogonalize((
complex<Float> *)(resEven[dir] + (i * 3 + 1) * 3 * 2),
1151 normalize((
complex<Float> *)(resEven[dir] + (i * 3 + 2) * 3 * 2), 3);
1153 normalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2), 3);
1154 orthogonalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 1) * 3 * 2),
1156 normalize((
complex<Float> *)(resOdd[dir] + (i * 3 + 2) * 3 * 2), 3);
1159 Float *w = resEven[dir] + (i * 3 + 0) * 3 * 2;
1160 Float *u = resEven[dir] + (i * 3 + 1) * 3 * 2;
1161 Float *v = resEven[dir] + (i * 3 + 2) * 3 * 2;
1163 for (
int n = 0; n < 6; n++) w[n] = 0.0;
1173 Float *w = resOdd[dir] + (i * 3 + 0) * 3 * 2;
1174 Float *u = resOdd[dir] + (i * 3 + 1) * 3 * 2;
1175 Float *v = resOdd[dir] + (i * 3 + 2) * 3 * 2;
1177 for (
int n = 0; n < 6; n++) w[n] = 0.0;
1194 for (
int i = 0; i <
V; i++) {
1195 for (
int j = 0; j < 72; j++) { res[i * 72 + j] = c * rand() -
norm; }
1198 for (
int ch = 0; ch < 2; ch++) {
1199 res[i * 72 + 3 + 36 * ch] = -res[i * 72 + 0 + 36 * ch];
1200 res[i * 72 + 4 + 36 * ch] = -res[i * 72 + 1 + 36 * ch];
1201 res[i * 72 + 5 + 36 * ch] = -res[i * 72 + 2 + 36 * ch];
1202 res[i * 72 + 30 + 36 * ch] = -res[i * 72 + 6 + 36 * ch];
1203 res[i * 72 + 31 + 36 * ch] = -res[i * 72 + 7 + 36 * ch];
1204 res[i * 72 + 32 + 36 * ch] = -res[i * 72 + 8 + 36 * ch];
1205 res[i * 72 + 33 + 36 * ch] = -res[i * 72 + 9 + 36 * ch];
1206 res[i * 72 + 34 + 36 * ch] = -res[i * 72 + 16 + 36 * ch];
1207 res[i * 72 + 35 + 36 * ch] = -res[i * 72 + 17 + 36 * ch];
1210 for (
int j = 0; j < 6; j++) {
1211 res[i * 72 + j] += diag;
1212 res[i * 72 + j + 36] += diag;
1220 const int fail_check = 17;
1221 int fail[4][fail_check];
1223 for (
int d = 0; d < 4; d++)
1224 for (
int i = 0; i < fail_check; i++) fail[d][i] = 0;
1225 for (
int d = 0; d < 4; d++)
1226 for (
int i = 0; i < 18; i++) iter[d][i] = 0;
1228 for (
int d = 0; d < 4; d++) {
1229 for (
int eo = 0; eo < 2; eo++) {
1230 for (
int i = 0; i <
Vh; i++) {
1231 int ga_idx = (eo *
Vh + i);
1232 for (
int j = 0; j < 18; j++) {
1233 double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]);
1235 for (
int f = 0; f < fail_check; f++)
1236 if (diff >
pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++;
1237 if (diff >
epsilon || std::isnan(diff)) iter[d][j]++;
1243 printf(
"Component fails (X, Y, Z, T)\n");
1244 for (
int i = 0; i < 18; i++)
1245 printf(
"%d fails = (%8d, %8d, %8d, %8d)\n", i, iter[0][i], iter[1][i], iter[2][i], iter[3][i]);
1247 printf(
"\nDeviation Failures = (X, Y, Z, T)\n");
1248 for (
int f = 0; f < fail_check; f++) {
1249 printf(
"%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n",
pow(10.0, -(f + 1)), fail[0][f],
1250 fail[1][f], fail[2][f], fail[3][f], fail[0][f] / (
double)(
V * 18), fail[1][f] / (
double)(
V * 18),
1251 fail[2][f] / (
double)(
V * 18), fail[3][f] / (
double)(
V * 18));
1273 for (
int i = 0; i <
V; i++) {
1274 for (
int dir =
XUP; dir <=
TUP; dir++) {
1288 int i4 = full_idx / (X3 * X2 * X1);
1289 int i3 = (full_idx - i4 * (X3 * X2 * X1)) / (X2 * X1);
1290 int i2 = (full_idx - i4 * (X3 * X2 * X1) - i3 * (X2 * X1)) / X1;
1291 int i1 = full_idx - i4 * (X3 * X2 * X1) - i3 * (X2 * X1) - i2 * X1;
1296 if ((i4 & 1) != 0) { coeff *= -1; }
1300 if (((i4 + i1) & 1) != 0) { coeff *= -1; }
1304 if (((i4 + i1 + i2) & 1) != 0) { coeff *= -1; }
1311 default: printf(
"ERROR: wrong dir(%d)\n", dir); exit(1);
1317 double *mylink = (
double *)link[dir];
1320 mylink[12] *= coeff;
1321 mylink[13] *= coeff;
1322 mylink[14] *= coeff;
1323 mylink[15] *= coeff;
1324 mylink[16] *= coeff;
1325 mylink[17] *= coeff;
1330 float *mylink = (
float *)link[dir];
1333 mylink[12] *= coeff;
1334 mylink[13] *= coeff;
1335 mylink[14] *= coeff;
1336 mylink[15] *= coeff;
1337 mylink[16] *= coeff;
1338 mylink[17] *= coeff;
1345 for (
int dir = 0; dir < 4; dir++) {
1348 float *f = (
float *)link[dir];
1349 if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3)) {
1350 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n", i, f[i], __FUNCTION__);
1354 double *f = (
double *)link[dir];
1355 if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)) {
1356 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n", i, f[i], __FUNCTION__);
1369 const int fail_check = 16;
1370 int fail[fail_check];
1371 for (
int f = 0; f < fail_check; f++) fail[f] = 0;
1374 for (
int i = 0; i < 18; i++) iter[i] = 0;
1376 for (
int dir = 0; dir < 4; dir++) {
1377 for (
int i = 0; i < len; i++) {
1378 for (
int j = 0; j < 18; j++) {
1379 int is = i * 18 + j;
1380 double diff = fabs(linkA[dir][is] - linkB[dir][is]);
1381 for (
int f = 0; f < fail_check; f++)
1382 if (diff >
pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
1384 if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
1389 for (
int i = 0; i < 18; i++)
printfQuda(
"%d fails = %d\n", i, iter[i]);
1391 int accuracy_level = 0;
1392 for (
int f = 0; f < fail_check; f++) {
1393 if (fail[f] == 0) { accuracy_level = f; }
1396 for (
int f = 0; f < fail_check; f++) {
1397 printfQuda(
"%e Failures: %d / %d = %e\n",
pow(10.0, -(f + 1)), fail[f], 4 * len * 18,
1398 fail[f] / (
double)(4 * len * 18));
1401 return accuracy_level;
1404 static int compare_link(
void **linkA,
void **linkB,
int len,
QudaPrecision precision)
1409 ret =
compareLink((
double **)linkA, (
double **)linkB, len);
1411 ret =
compareLink((
float **)linkA, (
float **)linkB, len);
1418 static void printLinkElement(
void *link,
int X,
QudaPrecision precision)
1431 printLinkElement(linkA[0], 0,
prec);
1433 printLinkElement(linkA[0], 1,
prec);
1435 printLinkElement(linkA[3], len - 1,
prec);
1439 printLinkElement(linkB[0], 0,
prec);
1441 printLinkElement(linkB[0], 1,
prec);
1443 printLinkElement(linkB[3], len - 1,
prec);
1446 int ret = compare_link(linkA, linkB, len,
prec);
1457 fprintf(stderr,
"Error: malloc failed for temp in function %s\n", __FUNCTION__);
1461 for (
int i = 0; i <
V; i++) {
1463 for (
int dir = 0; dir < 4; dir++) {
1464 double *thismom = (
double *)mom;
1466 thismom[(4 * i + dir) *
mom_site_size + k] = 1.0 * rand() / RAND_MAX;
1471 for (
int dir = 0; dir < 4; dir++) {
1472 float *thismom = (
float *)mom;
1474 thismom[(4 * i + dir) *
mom_site_size + k] = 1.0 * rand() / RAND_MAX;
1487 for (
int i = 0; i <
V; i++) {
1489 for (
int dir = 0; dir < 4; dir++) {
1490 double *thishw = (
double *)hw;
1494 for (
int dir = 0; dir < 4; dir++) {
1495 float *thishw = (
float *)hw;
1506 const int fail_check = 16;
1507 int fail[fail_check];
1508 for (
int f = 0; f < fail_check; f++) fail[f] = 0;
1513 for (
int i = 0; i < len; i++) {
1516 double diff = fabs(momA[is] - momB[is]);
1517 for (
int f = 0; f < fail_check; f++)
1518 if (diff >
pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
1520 if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
1524 int accuracy_level = 0;
1525 for (
int f = 0; f < fail_check; f++) {
1526 if (fail[f] == 0) { accuracy_level = f + 1; }
1531 for (
int f = 0; f < fail_check; f++) {
1532 printfQuda(
"%e Failures: %d / %d = %e\n",
pow(10.0, -(f + 1)), fail[f], len * 9, fail[f] / (
double)(len * 9));
1535 return accuracy_level;
1538 static void printMomElement(
void *mom,
int X,
QudaPrecision precision)
1543 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1547 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1554 printMomElement(momA, 0,
prec);
1556 printMomElement(momA, 1,
prec);
1558 printMomElement(momA, 2,
prec);
1560 printMomElement(momA, 3,
prec);
1564 printMomElement(momB, 0,
prec);
1566 printMomElement(momB, 1,
prec);
1568 printMomElement(momB, 2,
prec);
1570 printMomElement(momB, 3,
prec);
1575 ret =
compare_mom((
double *)momA, (
double *)momB, len);
1577 ret =
compare_mom((
float *)momA, (
float *)momB, len);
1588 double action = 0.0;
1589 for (
int i = 0; i < len; i++) {
1592 for (
int j = 0; j < 6; j++) local += mom[j] * mom[j];
1593 for (
int j = 6; j < 9; j++) local += 0.5 * mom[j] * mom[j];
1603 double action = 0.0;
1605 action = mom_action<double>((
double *)mom, len);
1607 action = mom_action<float>((
float *)mom, len);
1613 static struct timeval startTime;
1619 struct timeval endTime;
1620 gettimeofday(&endTime, NULL);
1622 long ds = endTime.tv_sec - startTime.tv_sec;
1623 long dus = endTime.tv_usec - startTime.tv_usec;
1624 return ds + 0.000001 * dus;
1627 void performanceStats(std::vector<double> &time, std::vector<double> &gflops, std::vector<int> &iter)
1629 auto mean_time = 0.0;
1630 auto mean_time2 = 0.0;
1631 auto mean_gflops = 0.0;
1632 auto mean_gflops2 = 0.0;
1633 auto mean_iter = 0.0;
1634 auto mean_iter2 = 0.0;
1636 for (
int i = 1; i <
Nsrc; i++) {
1637 mean_time += time[i];
1638 mean_time2 += time[i] * time[i];
1639 mean_gflops += gflops[i];
1640 mean_gflops2 += gflops[i] * gflops[i];
1641 mean_iter += iter[i];
1642 mean_iter2 += iter[i] * iter[i];
1645 auto NsrcM1 =
Nsrc - 1;
1647 mean_time /= NsrcM1;
1648 mean_time2 /= NsrcM1;
1649 auto stddev_time = NsrcM1 > 1 ?
sqrt((NsrcM1 / ((
double)NsrcM1 - 1.0)) * (mean_time2 - mean_time * mean_time)) :
1650 std::numeric_limits<double>::infinity();
1651 mean_gflops /= NsrcM1;
1652 mean_gflops2 /= NsrcM1;
1653 auto stddev_gflops = NsrcM1 > 1 ?
sqrt((NsrcM1 / ((
double)NsrcM1 - 1.0)) * (mean_gflops2 - mean_gflops * mean_gflops)) :
1654 std::numeric_limits<double>::infinity();
1656 mean_iter /= NsrcM1;
1657 mean_iter2 /= NsrcM1;
1658 auto stddev_iter = NsrcM1 > 1 ?
sqrt((NsrcM1 / ((
double)NsrcM1 - 1.0)) * (mean_iter2 - mean_iter * mean_iter)) :
1659 std::numeric_limits<double>::infinity();
1661 printfQuda(
"%d solves, mean iteration count %g (stddev = %g), with mean solve time %g (stddev = %g), mean GFLOPS %g "
1662 "(stddev = %g) [excluding first solve]\n",
1663 Nsrc, mean_iter, stddev_iter, mean_time, stddev_time, mean_gflops, stddev_gflops);
QudaGammaBasis gammaBasis
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
QudaFieldOrder fieldOrder
Class declaration to initialize and hold CURAND RNG states.
int comm_dim_partitioned(int dim)
int(* QudaCommsMap)(const int *coords, void *fdata)
void comm_allreduce(double *data)
void commDimPartitionedSet(int dir)
QudaPrecision prec_refinement_sloppy
quda::mgarray< int > num_setup_iter
quda::mgarray< int > mg_eig_poly_deg
quda::mgarray< QudaEigType > mg_eig_type
QudaReconstructType link_recon_sloppy
quda::mgarray< int > mg_eig_check_interval
quda::mgarray< char[256]> mg_vec_outfile
quda::mgarray< double > setup_ca_lambda_max
QudaReconstructType link_recon
quda::mgarray< bool > mg_eig
quda::mgarray< QudaInverterType > coarse_solver
QudaReconstructType link_recon_precondition
quda::mgarray< QudaCABasis > coarse_solver_ca_basis
quda::mgarray< int > n_block_ortho
std::array< int, 4 > grid_partition
quda::mgarray< int > coarse_solver_maxiter
quda::mgarray< char[256]> mg_vec_infile
quda::mgarray< int > nu_post
quda::mgarray< int > nu_pre
quda::mgarray< int > setup_ca_basis_size
quda::mgarray< int > mg_eig_n_kr
quda::mgarray< int > coarse_solver_ca_basis_size
quda::mgarray< double > mg_eig_amin
quda::mgarray< QudaVerbosity > mg_verbosity
quda::mgarray< double > setup_ca_lambda_min
quda::mgarray< int > setup_maxiter
quda::mgarray< int > nvec
quda::mgarray< QudaCABasis > setup_ca_basis
quda::mgarray< QudaSchwarzType > mg_schwarz_type
quda::mgarray< QudaEigSpectrumType > mg_eig_spectrum
quda::mgarray< int > mg_eig_max_restarts
QudaDslashType dslash_type
quda::mgarray< bool > mg_eig_use_dagger
quda::mgarray< double > mu_factor
quda::mgarray< double > coarse_solver_ca_lambda_max
std::array< int, 4 > dim_partitioned
quda::mgarray< int > mg_eig_n_ev
quda::mgarray< double > mg_eig_amax
quda::mgarray< QudaInverterType > setup_inv
quda::mgarray< double > setup_tol
quda::mgarray< double > coarse_solver_ca_lambda_min
quda::mgarray< bool > mg_eig_use_poly_acc
QudaPrecision prec_eigensolver
quda::mgarray< QudaSolveType > coarse_solve_type
quda::mgarray< double > mg_eig_tol
QudaPrecision prec_precondition
quda::mgarray< bool > mg_eig_use_normop
QudaReconstructType link_recon_eigensolver
quda::mgarray< QudaPrecision > mg_eig_save_prec
quda::mgarray< double > smoother_tol
quda::mgarray< QudaSolveType > smoother_solve_type
quda::mgarray< bool > mg_eig_require_convergence
quda::mgarray< double > coarse_solver_tol
QudaPrecision smoother_halo_prec
quda::mgarray< QudaInverterType > smoother_type
quda::mgarray< int > setup_maxiter_refresh
quda::mgarray< int > mg_schwarz_cycle
std::array< int, 4 > gridsize_from_cmdline
QudaPrecision prec_sloppy
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
cpuColorSpinorField * spinor
QudaGaugeParam gauge_param
QudaInvertParam inv_param
enum QudaPrecision_s QudaPrecision
@ QUDA_DOMAIN_WALL_DSLASH
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
@ QUDA_DOMAIN_WALL_4D_DSLASH
@ QUDA_CPU_FIELD_LOCATION
@ QUDA_PARITY_SITE_SUBSET
@ QUDA_RECONSTRUCT_INVALID
enum QudaDslashType_s QudaDslashType
enum QudaSolutionType_s QudaSolutionType
@ QUDA_EVEN_ODD_SITE_ORDER
enum QudaReconstructType_s QudaReconstructType
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
@ QUDA_REFERENCE_FIELD_CREATE
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
void constructHostCloverField(void *clover, void *clover_inv, QudaInvertParam &inv_param)
double mom_action(real *mom_, int len)
size_t host_gauge_data_type_size
int dimPartitioned(int dim)
QudaPrecision & cuda_prec
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
size_t host_spinor_data_type_size
QudaPrecision & cuda_prec_sloppy
void setQudaMgSolveTypes()
void complexProduct(Float *a, Float *b, Float *c)
void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
void createMomCPU(void *mom, QudaPrecision precision)
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
int compareLink(Float **linkA, Float **linkB, int len)
int x4_from_full_index(int i)
void performanceStats(std::vector< double > &time, std::vector< double > &gflops, std::vector< int > &iter)
QudaPrecision & cuda_prec_eigensolver
void setQudaDefaultMgTestParams()
int fullLatticeIndex_5d(int i, int oddBit)
void complexDotProduct(Float *a, Float *b, Float *c)
int compare_mom(Float *momA, Float *momB, int len)
int fullLatticeIndex_4d(int i, int oddBit)
size_t host_clover_data_type_size
void su3Construct8(Float *mat)
QudaPrecision & cuda_prec_precondition
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
const char * __asan_default_options()
Set the default ASAN options. This ensures that QUDA just works when SANITIZE is enabled without requ...
void su3Construct12(Float *mat)
void constructQudaCloverField(void *clover, double norm, double diag, QudaPrecision precision)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
void complexAddTo(Float *a, Float *b)
QudaPrecision & cuda_prec_refinement_sloppy
void dw_setDims(int *X, const int L5)
void complexConjugateProduct(Float *a, Float *b, Float *c)
int lex_rank_from_coords_x(const int *coords, void *fdata)
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision precision, QudaSolutionType sol_type, const int *const x, quda::RNG &rng)
void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc, char **argv)
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
void coordinate_from_shrinked_index(int coordinate[4], int shrinked_index, const int shrinked_dim[4], const int shift[4], int parity)
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
void constructCloverField(Float *res, double norm, double diag)
double stopwatchReadSeconds()
void constructRandomGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type)
void createHwCPU(void *hw, QudaPrecision precision)
int fullLatticeIndex(int dim[4], int index, int oddBit)
void printVector(Float *v)
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
int lex_rank_from_coords_t(const int *coords, void *fdata)
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
void constructQudaGaugeField(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
void constructUnitaryGaugeField(Float **res)
QudaPrecision & cuda_prec_ritz
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
int index_4d_cb_from_coordinate_4d(const int coordinate[4], const int dim[4])
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
void get_size_from_env(int *const dims, const char env[])
void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const QudaInvertParam *inv_param, const QudaGaugeParam *gauge_param)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
bool isPCSolution(QudaSolutionType solution_type)
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
__host__ __device__ ValueType cos(ValueType x)
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
std::map< TuneKey, TuneParam > map
__host__ __device__ ValueType sin(ValueType x)
__host__ __device__ ValueType sqrt(ValueType x)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type)
Generate a random noise spinor. This variant allows the user to manage the RNG state.
FloatingPoint< float > Float
__host__ __device__ T sum(const array< T, s > &a)
_EXTERN_C_ int MPI_Init(int *argc, char ***argv)
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
QudaFieldLocation location
QudaSolutionType solution_type
int compute_clover_inverse
QudaPrecision clover_cpu_prec
QudaDslashType dslash_type
int return_clover_inverse
QudaGammaBasis gamma_basis
QudaSiteSubset siteSubset