9 #elif defined(MPI_COMMS) 55 for (
int i = 1;
i < 4;
i++) {
64 for (
int i = 2;
i >= 0;
i--) {
72 void initComms(
int argc,
char **argv,
const int *commDims)
74 #if defined(QMP_COMMS) 75 QMP_thread_level_t tl;
76 QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
79 QMP_declare_logical_topology(commDims, 4);
81 #elif defined(MPI_COMMS) 84 MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
95 printfQuda(
"Rank order is %s major (%s running fastest)\n",
100 for (
int i=0; i<4; i++) if (comm_dim(i) > 1) partitioned++;
102 errorQuda(
"Use of QIO is not supported with column-major process ordering, use row-major instead (--rank-order row)");
109 #if defined(QMP_COMMS) 110 QMP_finalize_msg_passing();
111 #elif defined(MPI_COMMS) 121 #if defined(QMP_COMMS) 122 rank = QMP_get_node_number();
123 #elif defined(MPI_COMMS) 124 MPI_Comm_rank(MPI_COMM_WORLD, &
rank);
132 for (
int d=0;
d< 4;
d++) {
137 for (
int i=0;
i<4;
i++) {
170 for (
int d=0;
d< 4;
d++)
176 for (
int i=0;
i<4;
i++) {
198 template <
typename Float>
200 printfQuda(
"{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]);
229 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
230 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
231 int x2 = (Y/
Z[0]) %
Z[1];
233 return (x4+x3+x2+x1) % 2;
237 template <
typename Float>
244 template <
typename Float>
246 a[0] =
b[0]*
c[0] -
b[1]*
c[1];
247 a[1] =
b[0]*
c[1] +
b[1]*
c[0];
251 template <
typename Float>
253 a[0] =
b[0]*
c[0] -
b[1]*
c[1];
254 a[1] = -
b[0]*
c[1] -
b[1]*
c[0];
258 template <
typename Float>
260 a[0] =
b[0]*
c[0] +
b[1]*
c[1];
261 a[1] =
b[0]*
c[1] -
b[1]*
c[0];
265 template <
typename Float>
272 template <
typename Float>
274 a[0] +=
b[0]*
c[0] +
b[1]*
c[1];
275 a[1] +=
b[0]*
c[1] -
b[1]*
c[0];
278 template <
typename Float>
284 template <
typename Float>
296 template <
typename Float>
300 for (
int i=8;
i<18;
i++)
mat[
i] = 0.0;
317 template <
typename Float>
319 Float *u = &
mat[0*(3*2)];
320 Float *v = &
mat[1*(3*2)];
321 Float *
w = &
mat[2*(3*2)];
322 w[0] = 0.0;
w[1] = 0.0;
w[2] = 0.0;
w[3] = 0.0;
w[4] = 0.0;
w[5] = 0.0;
331 w[0]*=u0;
w[1]*=u0;
w[2]*=u0;
w[3]*=u0;
w[4]*=u0;
w[5]*=u0;
334 template <
typename Float>
344 Float U00_mag =
sqrt(1.
f/(u0*u0) - row_sum);
352 Float column_sum = 0.0;
353 for (
int i=0;
i<2;
i++) column_sum +=
mat[
i]*
mat[
i];
354 for (
int i=6;
i<8;
i++) column_sum +=
mat[
i]*
mat[
i];
355 Float U20_mag =
sqrt(1.
f/(u0*u0) - column_sum);
363 Float r_inv2 = 1.0/(u0*row_sum);
425 template <
typename Float>
427 for (
int i = 0;
i <
len;
i++) {
429 if (diff > epsilon) {
448 int x3 =
zb - x4*
dim[2];
450 return 2*
index + ((x2 + x3 + x4 + oddBit) & 1);
474 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
476 int X = 2*
sid + x1odd;
494 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
495 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
496 int x2 = (Y/
Z[0]) %
Z[1];
501 x4 = (x4+dx4+
Z[3]) %
Z[3];
502 x3 = (x3+dx3+
Z[2]) %
Z[2];
503 x2 = (x2+dx2+
Z[1]) %
Z[1];
504 x1 = (x1+dx1+
Z[0]) %
Z[0];
506 return (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
517 x[1] = (fullIndex/
dim[0]) %
dim[1];
518 x[0] = fullIndex %
dim[0];
520 for(
int dir=0; dir<4; ++dir)
521 x[dir] = (
x[dir]+dx[dir]+
dim[dir]) %
dim[dir];
523 return (((
x[3]*
dim[2] +
x[2])*
dim[1] +
x[1])*
dim[0] +
x[0])/2;
532 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
533 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
534 int x2 = (Y/
Z[0]) %
Z[1];
537 int ghost_x4 = x4+ dx4;
541 x4 = (x4+dx4+
Z[3]) %
Z[3];
542 x3 = (x3+dx3+
Z[2]) %
Z[2];
543 x2 = (x2+dx2+
Z[1]) %
Z[1];
544 x1 = (x1+dx1+
Z[0]) %
Z[0];
547 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
549 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
573 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
577 int ret = nbr_half_idx;
579 ret =
Vh + nbr_half_idx;
589 const int halfVolume = volume/2;
591 int halfIndex =
index;
593 if(
index >= halfVolume){
595 halfIndex =
index - halfVolume;
600 int oddBitChanged = (dx[0]+dx[1]+dx[2]+dx[3])%2;
605 return neighborHalfIndex + oddBit*halfVolume;
622 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
623 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
624 int x2 = (Y/
Z[0]) %
Z[1];
626 int ghost_x4 = x4+ dx4;
628 x4 = (x4+dx4+
Z[3]) %
Z[3];
629 x3 = (x3+dx3+
Z[2]) %
Z[2];
630 x2 = (x2+dx2+
Z[1]) %
Z[1];
631 x1 = (x1+dx1+
Z[0]) %
Z[0];
633 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
634 ret = (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
636 ret = (x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
640 int oddBitChanged = (dx4+dx3+dx2+dx1)%2;
659 if (
i >=
Vh ||
i < 0) {
printf(
"i out of range in fullLatticeIndex_4d");
exit(-1);}
678 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
680 int X = 2*
sid + x1odd;
693 int boundaryCrossings =
i/(
Z[0]/2) +
i/(
Z[1]*
Z[0]/2) +
i/(
Z[2]*
Z[1]*
Z[0]/2) +
i/(
Z[3]*
Z[2]*
Z[1]*
Z[0]/2);
694 return 2*
i + (boundaryCrossings + oddBit) % 2;
698 int boundaryCrossings =
i/(
Z[0]/2) +
i/(
Z[1]*
Z[0]/2) +
i/(
Z[2]*
Z[1]*
Z[0]/2);
699 return 2*
i + (boundaryCrossings + oddBit) % 2;
713 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
718 template <
typename Float>
721 for (
int d = 0;
d < 3;
d++) {
731 bool last_node_in_t =
true;
736 for (
int j = (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1); j <
Vh; j++) {
747 int iMax = ( last_node_in_t ? (
Z[0]/2)*
Z[1]*
Z[2]*(
Z[3]-1) :
Vh );
749 Float *even = gauge[dir];
751 for (
int i = 0;
i< iMax;
i++) {
752 for (
int m = 0; m < 3; m++) {
753 for (
int n = 0;
n < 3;
n++) {
754 even[
i*(3*3*2) + m*(3*2) +
n*(2) + 0] = (m==
n) ? 1 : 0;
755 even[
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 0.0;
756 odd [
i*(3*3*2) + m*(3*2) +
n*(2) + 0] = (m==
n) ? 1 : 0;
757 odd [
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 0.0;
764 template <
typename Float>
776 for(
int d=0;
d<4;
d++){
784 for (
int d = 0;
d < 3;
d++) {
787 for (
int i = 0;
i <
Vh;
i++) {
790 int i4 =
index /(X3*X2*X1);
791 int i3 = (
index - i4*(X3*X2*X1))/(X2*X1);
792 int i2 = (
index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
793 int i1 =
index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
803 if ((i4+i1) % 2 == 1){
808 if ( (i4+i1+i2) % 2 == 1){
813 for (
int j=0; j < 18; j++) {
818 for (
int i = 0;
i <
Vh;
i++) {
820 int i4 =
index /(X3*X2*X1);
821 int i3 = (
index - i4*(X3*X2*X1))/(X2*X1);
822 int i2 = (
index - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
823 int i1 =
index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
833 if ((i4+i1) % 2 == 1){
838 if ( (i4+i1+i2) % 2 == 1){
843 for (
int j=0; j<18; j++){
852 for (
int j = 0; j <
Vh; j++) {
855 if (j >= (X4-3)*X1h*X2*X3 ){
859 if (j >= (X4-1)*X1h*X2*X3 ){
864 for (
int i=0;
i<18;
i++) {
875 template <
typename Float>
877 Float *resOdd[4], *resEven[4];
878 for (
int dir = 0; dir < 4; dir++) {
879 resEven[dir] = res[dir];
883 for (
int dir = 0; dir < 4; dir++) {
884 for (
int i = 0;
i <
Vh;
i++) {
885 for (
int m = 0; m < 3; m++) {
886 for (
int n = 0;
n < 3;
n++) {
887 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] = (m==
n) ? 1 : 0;
888 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 0.0;
889 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] = (m==
n) ? 1 : 0;
890 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 0.0;
900 template <
typename Float>
908 template <
typename Float>
910 complex<double>
dot = 0.0;
915 template <
typename Float>
917 Float *resOdd[4], *resEven[4];
918 for (
int dir = 0; dir < 4; dir++) {
919 resEven[dir] = res[dir];
923 for (
int dir = 0; dir < 4; dir++) {
924 for (
int i = 0;
i <
Vh;
i++) {
925 for (
int m = 1; m < 3; m++) {
926 for (
int n = 0;
n < 3;
n++) {
927 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] =
rand() / (Float)RAND_MAX;
928 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] =
rand() / (Float)RAND_MAX;
929 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] =
rand() / (Float)RAND_MAX;
930 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] =
rand() / (Float)RAND_MAX;
933 normalize((complex<Float>*)(resEven[dir] + (
i*3+1)*3*2), 3);
934 orthogonalize((complex<Float>*)(resEven[dir] + (
i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (
i*3+2)*3*2), 3);
935 normalize((complex<Float>*)(resEven[dir] + (
i*3 + 2)*3*2), 3);
937 normalize((complex<Float>*)(resOdd[dir] + (
i*3+1)*3*2), 3);
938 orthogonalize((complex<Float>*)(resOdd[dir] + (
i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (
i*3+2)*3*2), 3);
939 normalize((complex<Float>*)(resOdd[dir] + (
i*3 + 2)*3*2), 3);
942 Float *
w = resEven[dir]+(
i*3+0)*3*2;
943 Float *u = resEven[dir]+(
i*3+1)*3*2;
944 Float *v = resEven[dir]+(
i*3+2)*3*2;
946 for (
int n = 0;
n < 6;
n++)
w[
n] = 0.0;
956 Float *
w = resOdd[dir]+(
i*3+0)*3*2;
957 Float *u = resOdd[dir]+(
i*3+1)*3*2;
958 Float *v = resOdd[dir]+(
i*3+2)*3*2;
960 for (
int n = 0;
n < 6;
n++)
w[
n] = 0.0;
977 for (
int dir = 0; dir < 4; dir++){
978 for (
int i = 0;
i <
Vh;
i++) {
979 for (
int m = 0; m < 3; m++) {
980 for (
int n = 0;
n < 3;
n++) {
981 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] =1.0*
rand() / (Float)RAND_MAX;
982 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 2.0*
rand() / (Float)RAND_MAX;
983 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] = 3.0*
rand() / (Float)RAND_MAX;
984 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] = 4.0*
rand() / (Float)RAND_MAX;
994 template <
typename Float>
997 Float *resOdd[4], *resEven[4];
998 for (
int dir = 0; dir < 4; dir++) {
999 resEven[dir] = res[dir];
1003 for (
int dir = 0; dir < 4; dir++) {
1004 for (
int i = 0;
i <
Vh;
i++) {
1005 for (
int m = 1; m < 3; m++) {
1006 for (
int n = 0;
n < 3;
n++) {
1007 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] =
rand() / (Float)RAND_MAX;
1008 resEven[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] =
rand() / (Float)RAND_MAX;
1009 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 0] =
rand() / (Float)RAND_MAX;
1010 resOdd[dir][
i*(3*3*2) + m*(3*2) +
n*(2) + 1] =
rand() / (Float)RAND_MAX;
1013 normalize((complex<Float>*)(resEven[dir] + (
i*3+1)*3*2), 3);
1014 orthogonalize((complex<Float>*)(resEven[dir] + (
i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (
i*3+2)*3*2), 3);
1015 normalize((complex<Float>*)(resEven[dir] + (
i*3 + 2)*3*2), 3);
1017 normalize((complex<Float>*)(resOdd[dir] + (
i*3+1)*3*2), 3);
1018 orthogonalize((complex<Float>*)(resOdd[dir] + (
i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (
i*3+2)*3*2), 3);
1019 normalize((complex<Float>*)(resOdd[dir] + (
i*3 + 2)*3*2), 3);
1022 Float *
w = resEven[dir]+(
i*3+0)*3*2;
1023 Float *u = resEven[dir]+(
i*3+1)*3*2;
1024 Float *v = resEven[dir]+(
i*3+2)*3*2;
1026 for (
int n = 0;
n < 6;
n++)
w[
n] = 0.0;
1036 Float *
w = resOdd[dir]+(
i*3+0)*3*2;
1037 Float *u = resOdd[dir]+(
i*3+1)*3*2;
1038 Float *v = resOdd[dir]+(
i*3+2)*3*2;
1040 for (
int n = 0;
n < 6;
n++)
w[
n] = 0.0;
1058 }
else if (type == 1) {
1099 const double phase = (M_PI *
rand())/RAND_MAX;
1100 const complex<double>
z =
polar(1.0, phase);
1101 for (
int dir=0; dir<4; ++dir) {
1102 for (
int i=0;
i<
V; ++
i) {
1118 for(
int dir=0; dir<4; ++dir){
1119 for(
int i=0;
i<
V; ++
i){
1136 template <
typename Float>
1139 Float
c = 2.0 *
norm / RAND_MAX;
1141 for(
int i = 0;
i <
V;
i++) {
1142 for (
int j = 0; j < 72; j++) {
1147 for (
int ch=0; ch<2; ch++) {
1148 res[
i*72 + 3 + 36*ch] = -res[
i*72 + 0 + 36*ch];
1149 res[
i*72 + 4 + 36*ch] = -res[
i*72 + 1 + 36*ch];
1150 res[
i*72 + 5 + 36*ch] = -res[
i*72 + 2 + 36*ch];
1151 res[
i*72 + 30 + 36*ch] = -res[
i*72 + 6 + 36*ch];
1152 res[
i*72 + 31 + 36*ch] = -res[
i*72 + 7 + 36*ch];
1153 res[
i*72 + 32 + 36*ch] = -res[
i*72 + 8 + 36*ch];
1154 res[
i*72 + 33 + 36*ch] = -res[
i*72 + 9 + 36*ch];
1155 res[
i*72 + 34 + 36*ch] = -res[
i*72 + 16 + 36*ch];
1156 res[
i*72 + 35 + 36*ch] = -res[
i*72 + 17 + 36*ch];
1159 for (
int j = 0; j<6; j++) {
1160 res[
i*72 + j] += diag;
1161 res[
i*72 + j+36] += diag;
1184 template <
typename Float>
1185 static void checkGauge(Float **oldG, Float **newG,
double epsilon) {
1187 const int fail_check = 17;
1188 int fail[4][fail_check];
1190 for (
int d=0;
d<4;
d++)
for (
int i=0;
i<fail_check;
i++) fail[
d][
i] = 0;
1191 for (
int d=0;
d<4;
d++)
for (
int i=0;
i<18;
i++) iter[
d][
i] = 0;
1193 for (
int d=0;
d<4;
d++) {
1194 for (
int eo=0; eo<2; eo++) {
1195 for (
int i=0;
i<
Vh;
i++) {
1197 for (
int j=0; j<18; j++) {
1200 for (
int f=0; f<fail_check; f++) if (diff >
pow(10.0,-(
f+1))) fail[
d][
f]++;
1201 if (diff > epsilon) iter[
d][j]++;
1207 printf(
"Component fails (X, Y, Z, T)\n");
1208 for (
int i=0;
i<18;
i++)
printf(
"%d fails = (%8d, %8d, %8d, %8d)\n",
i, iter[0][
i], iter[1][
i], iter[2][
i], iter[3][
i]);
1210 printf(
"\nDeviation Failures = (X, Y, Z, T)\n");
1211 for (
int f=0;
f<fail_check;
f++) {
1212 printf(
"%e Failures = (%9d, %9d, %9d, %9d) = (%6.5f, %6.5f, %6.5f, %6.5f)\n",
pow(10.0,-(
f+1)),
1213 fail[0][
f], fail[1][
f], fail[2][
f], fail[3][
f],
1214 fail[0][
f]/(
double)(
V*18), fail[1][
f]/(
double)(
V*18), fail[2][
f]/(
double)(
V*18), fail[3][
f]/(
double)(
V*18));
1221 checkGauge((
double**)oldG, (
double**)newG, epsilon);
1223 checkGauge((
float**)oldG, (
float**)newG, epsilon);
1242 bool last_node_in_t =
true;
1247 for(
int i=0;
i <
V;
i++){
1248 for(
int dir =
XUP; dir <=
TUP; dir++){
1263 int i3 = (
full_idx - i4*(X3*X2*X1))/(X2*X1);
1264 int i2 = (
full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1;
1265 int i1 =
full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1;
1270 if ( (i4 & 1) != 0){
1276 if ( ((i4+i1) & 1) != 0){
1282 if ( ((i4+i1+i2) & 1) != 0){
1288 if (last_node_in_t && i4 == (X4-1)){
1294 printf(
"ERROR: wrong dir(%d)\n", dir);
1301 double* mylink = (
double*)link[dir];
1304 mylink[12] *=
coeff;
1305 mylink[13] *=
coeff;
1306 mylink[14] *=
coeff;
1307 mylink[15] *=
coeff;
1308 mylink[16] *=
coeff;
1309 mylink[17] *=
coeff;
1314 float* mylink = (
float*)link[dir];
1317 mylink[12] *=
coeff;
1318 mylink[13] *=
coeff;
1319 mylink[14] *=
coeff;
1320 mylink[15] *=
coeff;
1321 mylink[16] *=
coeff;
1322 mylink[17] *=
coeff;
1331 for(
int dir= 0;dir < 4;dir++){
1334 float*
f = (
float*)link[dir];
1336 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",
i,
f[
i], __FUNCTION__);
1340 double*
f = (
double*)link[dir];
1342 fprintf(stderr,
"ERROR: %dth: bad number(%f) in function %s \n",
i,
f[
i], __FUNCTION__);
1358 template <
typename Float>
1360 const int fail_check = 16;
1361 int fail[fail_check];
1362 for (
int f=0;
f<fail_check;
f++) fail[
f] = 0;
1365 for (
int i=0;
i<18;
i++) iter[
i] = 0;
1367 for(
int dir=0;dir < 4; dir++){
1368 for (
int i=0;
i<
len;
i++) {
1369 for (
int j=0; j<18; j++) {
1371 double diff =
fabs(linkA[dir][is]-linkB[dir][is]);
1372 for (
int f=0;
f<fail_check;
f++)
1373 if (diff >
pow(10.0,-(
f+1))) fail[
f]++;
1375 if (diff > 1
e-3) iter[j]++;
1382 int accuracy_level = 0;
1383 for(
int f =0;
f < fail_check;
f++){
1389 for (
int f=0;
f<fail_check;
f++) {
1390 printfQuda(
"%e Failures: %d / %d = %e\n",
pow(10.0,-(
f+1)), fail[
f], 4*
len*18, fail[
f] / (
double)(4*
len*18));
1393 return accuracy_level;
1416 for(
int i=0;
i < 3;
i++){
1422 for(
int i=0;
i < 3;
i++){
1429 void **linkB,
const char* msgB,
1461 fprintf(stderr,
"Error: malloc failed for temp in function %s\n", __FUNCTION__);
1467 for(
int i=0;
i <
V;
i++){
1469 for(
int dir=0;dir < 4;dir++){
1470 double* thismom = (
double*)mom;
1477 for(
int dir=0;dir < 4;dir++){
1478 float* thismom=(
float*)mom;
1494 for(
int i=0;
i <
V;
i++){
1496 for(
int dir=0;dir < 4;dir++){
1497 double* thishw = (
double*)
hw;
1503 for(
int dir=0;dir < 4;dir++){
1504 float* thishw=(
float*)
hw;
1516 template <
typename Float>
1518 const int fail_check = 16;
1519 int fail[fail_check];
1520 for (
int f=0;
f<fail_check;
f++) fail[
f] = 0;
1525 for (
int i=0;
i<
len;
i++) {
1528 double diff =
fabs(momA[is]-momB[is]);
1529 for (
int f=0;
f<fail_check;
f++)
1530 if (diff >
pow(10.0,-(
f+1))) fail[
f]++;
1532 if (diff > 1
e-3) iter[j]++;
1536 int accuracy_level = 0;
1537 for(
int f =0;
f < fail_check;
f++){
1539 accuracy_level =
f+1;
1545 for (
int f=0;
f<fail_check;
f++) {
1549 return accuracy_level;
1558 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1562 printfQuda(
"(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]);
1695 printf(
"Usage: %s [options]\n", argv[0]);
1696 printf(
"Common options: \n");
1698 printf(
" --device <n> # Set the CUDA device to use (default 0, single GPU only)\n");
1700 printf(
" --prec <double/single/half> # Precision in GPU\n");
1701 printf(
" --prec-sloppy <double/single/half> # Sloppy precision in GPU\n");
1702 printf(
" --prec-precondition <double/single/half> # Preconditioner precision in GPU\n");
1703 printf(
" --prec-ritz <double/single/half> # Eigenvector precision in GPU\n");
1704 printf(
" --recon <8/9/12/13/18> # Link reconstruction type\n");
1705 printf(
" --recon-sloppy <8/9/12/13/18> # Sloppy link reconstruction type\n");
1706 printf(
" --recon-precondition <8/9/12/13/18> # Preconditioner link reconstruction type\n");
1707 printf(
" --dagger # Set the dagger to 1 (default 0)\n");
1708 printf(
" --dim <n> # Set space-time dimension (X Y Z T)\n");
1709 printf(
" --sdim <n> # Set space dimension(X/Y/Z) size\n");
1710 printf(
" --xdim <n> # Set X dimension size(default 24)\n");
1711 printf(
" --ydim <n> # Set X dimension size(default 24)\n");
1712 printf(
" --zdim <n> # Set X dimension size(default 24)\n");
1713 printf(
" --tdim <n> # Set T dimension size(default 24)\n");
1714 printf(
" --Lsdim <n> # Set Ls dimension size(default 16)\n");
1715 printf(
" --gridsize <x y z t> # Set the grid size in all four dimension (default 1 1 1 1)\n");
1716 printf(
" --xgridsize <n> # Set grid size in X dimension (default 1)\n");
1717 printf(
" --ygridsize <n> # Set grid size in Y dimension (default 1)\n");
1718 printf(
" --zgridsize <n> # Set grid size in Z dimension (default 1)\n");
1719 printf(
" --tgridsize <n> # Set grid size in T dimension (default 1)\n");
1720 printf(
" --partition <mask> # Set the communication topology (X=1, Y=2, Z=4, T=8, and combinations of these)\n");
1721 printf(
" --rank-order <col/row> # Set the [t][z][y][x] rank order as either column major (t fastest, default) or row major (x fastest)\n");
1722 printf(
" --kernel-pack-t # Set T dimension kernel packing to be true (default false)\n");
1723 printf(
" --dslash-type <type> # Set the dslash type, the following values are valid\n" 1724 " wilson/clover/twisted-mass/twisted-clover/staggered\n" 1725 " /asqtad/domain-wall/domain-wall-4d/mobius/laplace\n");
1726 printf(
" --flavor <type> # Set the twisted mass flavor type (singlet (default), deg-doublet, nondeg-doublet)\n");
1727 printf(
" --load-gauge file # Load gauge field \"file\" for the test (requires QIO)\n");
1728 printf(
" --niter <n> # The number of iterations to perform (default 10)\n");
1729 printf(
" --ngcrkrylov <n> # The number of inner iterations to use for GCR, BiCGstab-l (default 10)\n");
1730 printf(
" --pipeline <n> # The pipeline length for fused operations in GCR, BiCGstab-l (default 0, no pipelining)\n");
1731 printf(
" --solution-pipeline <n> # The pipeline length for fused solution accumulation (default 0, no pipelining)\n");
1732 printf(
" --inv-type <cg/bicgstab/gcr> # The type of solver to use (default cg)\n");
1733 printf(
" --precon-type <mr/ (unspecified)> # The type of solver to use (default none (=unspecified)).\n" 1734 " For multigrid this sets the smoother type.\n");
1735 printf(
" --multishift <true/false> # Whether to do a multi-shift solver test or not (default false)\n");
1736 printf(
" --mass # Mass of Dirac operator (default 0.1)\n");
1737 printf(
" --mu # Twisted-Mass of Dirac operator (default 0.1)\n");
1738 printf(
" --compute-clover # Compute the clover field or use random numbers (default false)\n");
1739 printf(
" --clover-coeff # Clover coefficient (default 1.0)\n");
1740 printf(
" --anisotropy # Temporal anisotropy factor (default 1.0)\n");
1741 printf(
" --mass-normalization # Mass normalization (kappa (default) / mass / asym-mass)\n");
1742 printf(
" --matpc # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n");
1743 printf(
" --solve-type # The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc) \n");
1744 printf(
" --tol <resid_tol> # Set L2 residual tolerance\n");
1745 printf(
" --tolhq <resid_hq_tol> # Set heavy-quark residual tolerance\n");
1746 printf(
" --test # Test method (different for each test)\n");
1747 printf(
" --verify <true/false> # Verify the GPU results using CPU results (default true)\n");
1748 printf(
" --mg-nvec <level nvec> # Number of null-space vectors to define the multigrid transfer operator on a given level\n");
1749 printf(
" --mg-gpu-prolongate <true/false> # Whether to do the multigrid transfer operators on the GPU (default false)\n");
1750 printf(
" --mg-levels <2+> # The number of multigrid levels to do (default 2)\n");
1751 printf(
" --mg-nu-pre <1-20> # The number of pre-smoother applications to do at each multigrid level (default 2)\n");
1752 printf(
" --mg-nu-post <1-20> # The number of post-smoother applications to do at each multigrid level (default 2)\n");
1753 printf(
" --mg-setup-inv <level inv> # The inverter to use for the setup of multigrid (default bicgstab)\n");
1754 printf(
" --mg-setup-tol # The tolerance to use for the setup of multigrid (default 5e-6)\n");
1755 printf(
" --mg-omega # The over/under relaxation factor for the smoother of multigrid (default 0.85)\n");
1756 printf(
" --mg-smoother # The smoother to use for multigrid (default mr)\n");
1757 printf(
" --mg-block-size <level x y z t> # Set the geometric block size for the each multigrid level's transfer operator (default 4 4 4 4)\n");
1758 printf(
" --mg-mu-factor <level factor> # Set the multiplicative factor for the twisted mass mu parameter on each level (default 1)\n");
1759 printf(
" --mg-generate-nullspace <true/false> # Generate the null-space vector dynamically (default true)\n");
1760 printf(
" --mg-generate-all-levels <true/talse> # true=generate nul space on all levels, false=generate on level 0 and create other levels from that (default true)\n");
1761 printf(
" --mg-load-vec file # Load the vectors \"file\" for the multigrid_test (requires QIO)\n");
1762 printf(
" --mg-save-vec file # Save the generated null-space vectors \"file\" from the multigrid_test (requires QIO)\n");
1763 printf(
" --mg-vebosity <level verb> # The verbosity to use on each level of the multigrid (default silent)\n");
1764 printf(
" --df-nev <nev> # Set number of eigenvectors computed within a single solve cycle (default 8)\n");
1765 printf(
" --df-max-search-dim <dim> # Set the size of eigenvector search space (default 64)\n");
1766 printf(
" --df-deflation-grid <n> # Set maximum number of cycles needed to compute eigenvectors(default 1)\n");
1767 printf(
" --df-eigcg-max-restarts <n> # Set how many iterative refinement cycles will be solved with eigCG within a single physical right hand site solve (default 4)\n");
1768 printf(
" --df-tol-restart <tol> # Set tolerance for the first restart in the initCG solver(default 5e-5)\n");
1769 printf(
" --df-tol-inc <tol> # Set tolerance for the subsequent restarts in the initCG solver (default 1e-2)\n");
1770 printf(
" --df-max-restart-num <n> # Set maximum number of the initCG restarts in the deflation stage (default 3)\n");
1771 printf(
" --df-tol-eigenval <tol> # Set maximum eigenvalue residual norm (default 1e-1)\n");
1774 printf(
" --solver-ext-lib-type <eigen/magma> # Set external library for the solvers (default Eigen library)\n");
1775 printf(
" --df-ext-lib-type <eigen/magma> # Set external library for the deflation methods (default Eigen library)\n");
1776 printf(
" --df-location-ritz <host/cuda> # Set memory location for the ritz vectors (default cuda memory loction)\n");
1777 printf(
" --df-mem-type-ritz <device/pinned/mapped> # Set memory type for the ritz vectors (default device memory type)\n");
1779 printf(
" --nsrc <n> # How many spinors to apply the dslash to simultaneusly (experimental for staggered only)\n");
1781 printf(
" --msrc <n> # Used for testing non-square block blas routines where nsrc defines the other dimension\n");
1782 printf(
" --help # Print out this message\n");
1788 char msg[]=
"single";
1790 printf(
"Note: this program is %s GPU build\n", msg);
1800 char msg[]=
"single";
1807 if(
strcmp(argv[
i],
"--help")== 0){
1811 if(
strcmp(argv[
i],
"--verify") == 0){
1816 if (
strcmp(argv[
i+1],
"true") == 0){
1818 }
else if (
strcmp(argv[
i+1],
"false") == 0){
1821 fprintf(stderr,
"ERROR: invalid verify type\n");
1830 if(
strcmp(argv[
i],
"--device") == 0){
1835 if (device < 0 || device > 16){
1836 printf(
"ERROR: Invalid CUDA device number (%d)\n",
device);
1844 if(
strcmp(argv[
i],
"--prec") == 0){
1854 if(
strcmp(argv[
i],
"--prec-sloppy") == 0){
1864 if(
strcmp(argv[
i],
"--prec-precondition") == 0){
1874 if(
strcmp(argv[
i],
"--prec-ritz") == 0){
1884 if(
strcmp(argv[
i],
"--recon") == 0){
1894 if(
strcmp(argv[
i],
"--recon-sloppy") == 0){
1904 if(
strcmp(argv[
i],
"--recon-precondition") == 0){
1914 if(
strcmp(argv[
i],
"--dim") == 0){
1919 if (xdim < 0 || xdim > 512){
1920 printf(
"ERROR: invalid X dimension (%d)\n",
xdim);
1929 if (ydim < 0 || ydim > 512){
1930 printf(
"ERROR: invalid Y dimension (%d)\n",
ydim);
1939 if (zdim < 0 || zdim > 512){
1940 printf(
"ERROR: invalid Z dimension (%d)\n",
zdim);
1949 if (tdim < 0 || tdim > 512){
1950 printf(
"ERROR: invalid T dimension (%d)\n",
tdim);
1959 if(
strcmp(argv[
i],
"--xdim") == 0){
1964 if (xdim < 0 || xdim > 512){
1965 printf(
"ERROR: invalid X dimension (%d)\n",
xdim);
1973 if(
strcmp(argv[
i],
"--ydim") == 0){
1978 if (ydim < 0 || ydim > 512){
1979 printf(
"ERROR: invalid T dimension (%d)\n",
ydim);
1988 if(
strcmp(argv[
i],
"--zdim") == 0){
1993 if (zdim < 0 || zdim > 512){
1994 printf(
"ERROR: invalid T dimension (%d)\n",
zdim);
2002 if(
strcmp(argv[
i],
"--tdim") == 0){
2007 if (tdim < 0 || tdim > 512){
2008 printf(
"Error: invalid t dimension");
2016 if(
strcmp(argv[
i],
"--sdim") == 0){
2020 int sdim =
atoi(argv[
i+1]);
2021 if (sdim < 0 || sdim > 512){
2022 printf(
"ERROR: invalid S dimension\n");
2031 if(
strcmp(argv[
i],
"--Lsdim") == 0){
2036 if (Ls < 0 || Ls > 128){
2037 printf(
"ERROR: invalid Ls dimension\n");
2046 if(
strcmp(argv[
i],
"--dagger") == 0){
2052 if(
strcmp(argv[
i],
"--partition") == 0){
2058 for(
int j=0; j < 4;j++){
2059 if (
value & (1 << j)){
2065 printfQuda(
"WARNING: Ignoring --partition option since this is a single-GPU build.\n");
2072 if(
strcmp(argv[
i],
"--kernel-pack-t") == 0){
2079 if(
strcmp(argv[
i],
"--multishift") == 0){
2084 if (
strcmp(argv[
i+1],
"true") == 0){
2086 }
else if (
strcmp(argv[
i+1],
"false") == 0){
2089 fprintf(stderr,
"ERROR: invalid multishift boolean\n");
2098 if(
strcmp(argv[
i],
"--gridsize") == 0){
2104 printf(
"ERROR: invalid X grid size");
2112 printf(
"ERROR: invalid Y grid size");
2118 int zsize =
atoi(argv[
i+1]);
2120 printf(
"ERROR: invalid Z grid size");
2126 int tsize =
atoi(argv[
i+1]);
2128 printf(
"ERROR: invalid T grid size");
2138 if(
strcmp(argv[
i],
"--xgridsize") == 0){
2144 printf(
"ERROR: invalid X grid size");
2153 if(
strcmp(argv[
i],
"--ygridsize") == 0){
2159 printf(
"ERROR: invalid Y grid size");
2168 if(
strcmp(argv[
i],
"--zgridsize") == 0){
2172 int zsize =
atoi(argv[
i+1]);
2174 printf(
"ERROR: invalid Z grid size");
2183 if(
strcmp(argv[
i],
"--tgridsize") == 0){
2187 int tsize =
atoi(argv[
i+1]);
2189 printf(
"ERROR: invalid T grid size");
2198 if(
strcmp(argv[
i],
"--rank-order") == 0){
2208 if(
strcmp(argv[
i],
"--dslash-type") == 0){
2218 if(
strcmp(argv[
i],
"--flavor") == 0){
2228 if(
strcmp(argv[
i],
"--inv-type") == 0){
2238 if(
strcmp(argv[
i],
"--precon-type") == 0){
2248 if(
strcmp(argv[
i],
"--mass") == 0){
2258 if(
strcmp(argv[
i],
"--compute-clover") == 0){
2262 if (
strcmp(argv[
i+1],
"true") == 0){
2264 }
else if (
strcmp(argv[
i+1],
"false") == 0){
2267 fprintf(stderr,
"ERROR: invalid compute_clover type\n");
2276 if(
strcmp(argv[
i],
"--clover-coeff") == 0){
2286 if(
strcmp(argv[
i],
"--mu") == 0){
2296 if(
strcmp(argv[
i],
"--anisotropy") == 0){
2306 if(
strcmp(argv[
i],
"--tol") == 0){
2316 if(
strcmp(argv[
i],
"--tolhq") == 0){
2326 if(
strcmp(argv[
i],
"--mass-normalization") == 0){
2336 if(
strcmp(argv[
i],
"--matpc") == 0){
2346 if(
strcmp(argv[
i],
"--solve-type") == 0){
2356 if(
strcmp(argv[
i],
"--load-gauge") == 0){
2366 if(
strcmp(argv[
i],
"--nsrc") == 0){
2371 if (Nsrc < 1 || Nsrc > 128){
2372 printf(
"ERROR: invalid number of sources (Nsrc=%d)\n",
Nsrc);
2380 if(
strcmp(argv[
i],
"--msrc") == 0){
2385 if (Msrc < 1 || Msrc > 128){
2386 printf(
"ERROR: invalid number of sources (Msrc=%d)\n",
Msrc);
2394 if(
strcmp(argv[
i],
"--test") == 0){
2404 if(
strcmp(argv[
i],
"--mg-nvec") == 0){
2410 printf(
"ERROR: invalid multigrid level %d",
level);
2425 if(
strcmp(argv[
i],
"--mg-levels") == 0){
2431 printf(
"ERROR: invalid number of multigrid levels (%d)\n",
mg_levels);
2439 if(
strcmp(argv[
i],
"--mg-nu-pre") == 0){
2444 if (nu_pre < 0 || nu_pre > 20){
2445 printf(
"ERROR: invalid pre-smoother applications value (nu_pre=%d)\n",
nu_pre);
2453 if(
strcmp(argv[
i],
"--mg-nu-post") == 0){
2458 if (nu_post < 0 || nu_post > 20){
2459 printf(
"ERROR: invalid pre-smoother applications value (nu_pist=%d)\n",
nu_post);
2467 if(
strcmp(argv[
i],
"--mg-setup-inv") == 0){
2473 printf(
"ERROR: invalid multigrid level %d",
level);
2484 if(
strcmp(argv[
i],
"--mg-setup-tol") == 0){
2495 if(
strcmp(argv[
i],
"--mg-omega") == 0){
2506 if(
strcmp(argv[
i],
"--mg-verbosity") == 0){
2512 printf(
"ERROR: invalid multigrid level %d",
level);
2523 if(
strcmp(argv[
i],
"--mg-smoother") == 0){
2533 if(
strcmp(argv[
i],
"--mg-block-size") == 0){
2539 printf(
"ERROR: invalid multigrid level %d",
level);
2546 printf(
"ERROR: invalid X block size");
2554 printf(
"ERROR: invalid Y block size");
2560 int zsize =
atoi(argv[
i+1]);
2562 printf(
"ERROR: invalid Z block size");
2568 int tsize =
atoi(argv[
i+1]);
2570 printf(
"ERROR: invalid T block size");
2580 if(
strcmp(argv[
i],
"--mass") == 0){
2590 if(
strcmp(argv[
i],
"--mg-mu-factor") == 0){
2596 printf(
"ERROR: invalid multigrid level %d",
level);
2601 double factor =
atof(argv[
i+1]);
2608 if(
strcmp(argv[
i],
"--mg-generate-nullspace") == 0){
2613 if (
strcmp(argv[
i+1],
"true") == 0){
2615 }
else if (
strcmp(argv[
i+1],
"false") == 0){
2618 fprintf(stderr,
"ERROR: invalid generate nullspace type\n");
2627 if(
strcmp(argv[
i],
"--mg-generate-all-levels") == 0){
2632 if (
strcmp(argv[
i+1],
"true") == 0){
2634 }
else if (
strcmp(argv[
i+1],
"false") == 0){
2637 fprintf(stderr,
"ERROR: invalid value for generate_all_levels type\n");
2646 if(
strcmp(argv[
i],
"--mg-load-vec") == 0){
2656 if(
strcmp(argv[
i],
"--mg-save-vec") == 0){
2666 if(
strcmp(argv[
i],
"--df-nev") == 0){
2677 if(
strcmp(argv[
i],
"--df-max-search-dim") == 0){
2688 if(
strcmp(argv[
i],
"--df-deflation-grid") == 0){
2700 if(
strcmp(argv[
i],
"--df-eigcg-max-restarts") == 0){
2711 if(
strcmp(argv[
i],
"--df-max-restart-num") == 0){
2723 if(
strcmp(argv[
i],
"--df-tol-restart") == 0){
2735 if(
strcmp(argv[
i],
"--df-tol-eigenval") == 0){
2746 if(
strcmp(argv[
i],
"--df-tol-inc") == 0){
2757 if(
strcmp(argv[
i],
"--solver-ext-lib-type") == 0){
2767 if(
strcmp(argv[
i],
"--df-ext-lib-type") == 0){
2777 if(
strcmp(argv[
i],
"--df-location-ritz") == 0){
2787 if(
strcmp(argv[
i],
"--df-mem-type-ritz") == 0){
2797 if(
strcmp(argv[
i],
"--niter") == 0){
2802 if (niter < 1 || niter > 1e6){
2803 printf(
"ERROR: invalid number of iterations (%d)\n",
niter);
2811 if(
strcmp(argv[
i],
"--ngcrkrylov") == 0){
2816 if (gcrNkrylov < 1 || gcrNkrylov > 1e6){
2817 printf(
"ERROR: invalid number of gcrkrylov iterations (%d)\n",
gcrNkrylov);
2825 if(
strcmp(argv[
i],
"--pipeline") == 0){
2830 if (pipeline < 0 || pipeline > 8){
2839 if(
strcmp(argv[
i],
"--solution-pipeline") == 0){
2844 if (solution_accumulator_pipeline < 0 || solution_accumulator_pipeline > 16){
2853 if(
strcmp(argv[
i],
"--version") == 0){
2854 printf(
"This program is linked with QUDA library, version %s,",
2856 printf(
" %s GPU build\n", msg);
2875 gettimeofday(&endTime, NULL);
2879 return ds + 0.000001*dus;
void printGaugeElement(void *gauge, int X, QudaPrecision precision)
QudaPrecision prec_precondition
static void orthogonalize(complex< Float > *a, complex< Float > *b, int len)
int dimPartitioned(int dim)
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
enum QudaMassNormalization_s QudaMassNormalization
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
static void constructUnitGaugeField(Float **res, QudaGaugeParam *param)
int compareLink(Float **linkA, Float **linkB, int len)
enum QudaPrecision_s QudaPrecision
_EXTERN_C_ int MPI_Init(int *argc, char ***argv)
QudaTwistFlavorType get_flavor_type(char *s)
void dw_setDims(int *X, const int L5)
static void printVector(Float *v)
int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign)
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
void createHwCPU(void *hw, QudaPrecision precision)
int nvec[QUDA_MAX_MG_LEVEL]
void createMomCPU(void *mom, QudaPrecision precision)
enum QudaSolveType_s QudaSolveType
void commDimPartitionedSet(int dir)
int process_command_line_option(int argc, char **argv, int *idx)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
void complexDotProduct(Float *a, Float *b, Float *c)
QudaFieldLocation location_ritz
cudaMipmappedArray_const_t unsigned int level
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param)
char * strcpy(char *__dst, const char *__src)
double pow(double, double)
double atan2(double, double)
static struct timeval startTime
void su3Construct8(Float *mat)
void usage_extra(char **argv)
void constructUnitaryGaugeField(Float **res)
int gridsize_from_cmdline[4]
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
int get_rank_order(char *s)
double mu_factor[QUDA_MAX_MG_LEVEL]
static int compare_link(void **linkA, void **linkB, int len, QudaPrecision precision)
QudaVerbosity mg_verbosity[QUDA_MAX_MG_LEVEL]
QudaReconstructType get_recon(char *s)
QudaInverterType precon_type
static void printMomElement(void *mom, int X, QudaPrecision precision)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
void exit(int) __attribute__((noreturn))
QudaFieldLocation get_df_location_ritz(char *s)
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
__darwin_suseconds_t tv_usec
int fullLatticeIndex(int dim[4], int index, int oddBit)
QudaSolveType get_solve_type(char *s)
char * index(const char *, int)
QudaReconstructType link_recon_precondition
else return(__swbuf(_c, _p))
QudaExtLibType get_solve_ext_lib_type(char *s)
int strcmp(const char *__s1, const char *__s2)
int compare_mom(Float *momA, Float *momB, int len)
double stopwatchReadSeconds()
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
QudaInverterType get_solver_type(char *s)
int printf(const char *,...) __attribute__((__format__(__printf__
void srand(unsigned) __attribute__((__availability__(swift
static int dim_partitioned[4]
QudaInverterType setup_inv[QUDA_MAX_MG_LEVEL]
void setSpinorSiteSize(int n)
__host__ __device__ void sum(double &a, double &b)
QudaDslashType get_dslash_type(char *s)
void complexAddTo(Float *a, Float *b)
enum QudaMatPCType_s QudaMatPCType
static int compareFloats(Float *a, Float *b, int len, double epsilon)
void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign)
QudaReconstructType link_recon_sloppy
int int int enum cudaChannelFormatKind f
static void constructGaugeField(Float **res, QudaGaugeParam *param, QudaDslashType dslash_type=QUDA_WILSON_DSLASH)
QudaMatPCType get_matpc_type(char *s)
int(* QudaCommsMap)(const int *coords, void *fdata)
enum QudaDagType_s QudaDagType
void su3Construct12(Float *mat)
static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
QudaReconstructType reconstruct
void __attribute__((weak)) usage_extra(char **argv)
void complexProduct(Float *a, Float *b, Float *c)
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param)
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision)
static void printLinkElement(void *link, int X, QudaPrecision precision)
QudaTwistFlavorType twist_flavor
int rand(void) __attribute__((__availability__(swift
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
QudaMemoryType get_df_mem_type_ritz(char *s)
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
QudaDslashType dslash_type
QudaExtLibType solver_ext_lib
int neighborIndexFullLattice_mg(int i, int dx4, int dx3, int dx2, int dx1)
enum QudaFieldLocation_s QudaFieldLocation
int fullLatticeIndex_5d(int i, int oddBit)
cpuColorSpinorField * out
double atof(const char *)
int solution_accumulator_pipeline
QudaExtLibType deflation_ext_lib
enum QudaReconstructType_s QudaReconstructType
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
int fullLatticeIndex_4d(int i, int oddBit)
int x4_from_full_index(int i)
QudaMassNormalization get_mass_normalization_type(char *s)
static void normalize(complex< Float > *a, int len)
QudaInverterType inv_type
QudaMassNormalization normalization
enum QudaDslashType_s QudaDslashType
void accumulateComplexDotProduct(Float *a, Float *b, Float *c)
static int lex_rank_from_coords_t(const int *coords, void *fdata)
__host__ __device__ complex< ValueType > polar(const ValueType &m, const ValueType &theta=0)
Returns the complex with magnitude m and angle theta in radians.
enum QudaVerbosity_s QudaVerbosity
QudaInverterType smoother_type
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaMemoryType mem_type_ritz
static void constructCloverField(Float *res, double norm, double diag)
QudaVerbosity get_verbosity_type(char *s)
static int lex_rank_from_coords_x(const int *coords, void *fdata)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision)
const char * get_quda_ver_str()
void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param)
QudaPrecision prec_sloppy
void initComms(int argc, char **argv, const int *commDims)
enum QudaInverterType_s QudaInverterType
QudaPrecision get_prec(QIO_Reader *infile)
enum QudaMemoryType_s QudaMemoryType
cpuColorSpinorField * spinor
static void checkGauge(Float **oldG, Float **newG, double epsilon)
int comm_dim_partitioned(int dim)
enum QudaExtLibType_s QudaExtLibType
void complexConjugateProduct(Float *a, Float *b, Float *c)
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
static void dot(sFloat *res, gFloat *a, sFloat *b)
QudaReconstructType link_recon
enum QudaTwistFlavorType_s QudaTwistFlavorType