33 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
34 int x4 = (X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
35 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
36 int x2 = (X/
Z[0]) %
Z[1];
41 xs = (xs+dxs+
Ls) %
Ls;
43 x4 = (x4+dx4+
Z[3]) %
Z[3];
44 x3 = (x3+dx3+
Z[2]) %
Z[2];
45 x2 = (x2+dx2+
Z[1]) %
Z[1];
46 x1 = (x1+dx1+
Z[0]) %
Z[0];
49 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
64 if (i < 0 || i >= (
Z[0]*
Z[1]*
Z[2]*
Z[3]/2))
65 { printf(
"i out of range in neighborIndex_4d\n"); exit(-1); }
70 int x4 = X/(
Z[2]*
Z[1]*
Z[0]);
71 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
72 int x2 = (X/
Z[0]) %
Z[1];
75 x4 = (x4+dx4+
Z[3]) %
Z[3];
76 x3 = (x3+dx3+
Z[2]) %
Z[2];
77 x2 = (x2+dx2+
Z[1]) %
Z[1];
78 x1 = (x1+dx1+
Z[0]) %
Z[0];
80 return (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) +
x1) / 2;
92 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
93 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
94 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
95 int x2 = (Y/
Z[0]) %
Z[1];
98 int ghost_x4 = x4+ dx4;
100 xs = (xs+dxs+
Ls) %
Ls;
101 x4 = (x4+dx4+
Z[3]) %
Z[3];
102 x3 = (x3+dx3+
Z[2]) %
Z[2];
103 x2 = (x2+dx2+
Z[1]) %
Z[1];
104 x1 = (x1+dx1+
Z[0]) %
Z[0];
106 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
107 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
109 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
120 return (Y/(
Z[2]*
Z[1]*
Z[0])) % Z[3];
132 template <
typename Float>
142 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
151 default: j = -1;
break;
153 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
156 return &gaugeField[dir/2][j*(3*3*2)];
163 template <
typename Float>
167 int d = nbr_distance;
170 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
175 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
176 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
177 int x2 = (Y/
Z[0]) %
Z[1];
183 Float* ghostGaugeField;
188 int new_x1 = (x1 - d +
X1 )% X1;
190 ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
191 int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
192 return &ghostGaugeField[offset*(3*3*2)];
194 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
199 int new_x2 = (x2 - d +
X2 )% X2;
201 ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
202 int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
203 return &ghostGaugeField[offset*(3*3*2)];
205 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) / 2;
211 int new_x3 = (x3 - d +
X3 )% X3;
213 ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
214 int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
215 return &ghostGaugeField[offset*(3*3*2)];
217 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) / 2;
222 int new_x4 = (x4 - d +
X4)% X4;
224 ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
225 int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
226 return &ghostGaugeField[offset*(3*3*2)];
228 j = (new_x4*(X3*X2*
X1) + x3*(X2*X1) + x2*(
X1) + x1) / 2;
232 default: j = -1; printf(
"ERROR: wrong dir \n"); exit(1);
234 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
238 return &gaugeField[dir/2][j*(3*3*2)];
242 template <
typename Float>
246 int nb = neighbor_distance;
251 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
252 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
253 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
254 int x2 = (Y/
Z[0]) %
Z[1];
264 int new_x1 = (x1 + nb)% X1;
266 int offset = ((x1 + nb -
X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
269 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
275 int new_x1 = (x1 - nb +
X1)% X1;
277 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
280 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
285 int new_x2 = (x2 + nb)% X2;
287 int offset = (( x2 + nb -
X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
290 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
295 int new_x2 = (x2 - nb +
X2)% X2;
297 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
300 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
305 int new_x3 = (x3 + nb)% X3;
307 int offset = (( x3 + nb -
X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
310 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
315 int new_x3 = (x3 - nb +
X3)% X3;
317 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
320 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
327 if ( (x4 + nb) >=
Z[3])
329 int offset = (x4+nb -
Z[3])*
Vsh_t;
330 return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
339 int offset = ( x4 - nb +nFace)*
Vsh_t;
340 return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
344 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
353 template <
typename Float>
367 default: j = -1;
break;
370 return &spinorField[j*(4*3*2)];
379 {{1,0}, {0,0}, {0,0}, {0,-1}},
380 {{0,0}, {1,0}, {0,-1}, {0,0}},
381 {{0,0}, {0,1}, {1,0}, {0,0}},
382 {{0,1}, {0,0}, {0,0}, {1,0}}
385 {{1,0}, {0,0}, {0,0}, {0,1}},
386 {{0,0}, {1,0}, {0,1}, {0,0}},
387 {{0,0}, {0,-1}, {1,0}, {0,0}},
388 {{0,-1}, {0,0}, {0,0}, {1,0}}
391 {{1,0}, {0,0}, {0,0}, {1,0}},
392 {{0,0}, {1,0}, {-1,0}, {0,0}},
393 {{0,0}, {-1,0}, {1,0}, {0,0}},
394 {{1,0}, {0,0}, {0,0}, {1,0}}
397 {{1,0}, {0,0}, {0,0}, {-1,0}},
398 {{0,0}, {1,0}, {1,0}, {0,0}},
399 {{0,0}, {1,0}, {1,0}, {0,0}},
400 {{-1,0}, {0,0}, {0,0}, {1,0}}
403 {{1,0}, {0,0}, {0,-1}, {0,0}},
404 {{0,0}, {1,0}, {0,0}, {0,1}},
405 {{0,1}, {0,0}, {1,0}, {0,0}},
406 {{0,0}, {0,-1}, {0,0}, {1,0}}
409 {{1,0}, {0,0}, {0,1}, {0,0}},
410 {{0,0}, {1,0}, {0,0}, {0,-1}},
411 {{0,-1}, {0,0}, {1,0}, {0,0}},
412 {{0,0}, {0,1}, {0,0}, {1,0}}
415 {{1,0}, {0,0}, {-1,0}, {0,0}},
416 {{0,0}, {1,0}, {0,0}, {-1,0}},
417 {{-1,0}, {0,0}, {1,0}, {0,0}},
418 {{0,0}, {-1,0}, {0,0}, {1,0}}
421 {{1,0}, {0,0}, {1,0}, {0,0}},
422 {{0,0}, {1,0}, {0,0}, {1,0}},
423 {{1,0}, {0,0}, {1,0}, {0,0}},
424 {{0,0}, {1,0}, {0,0}, {1,0}}
428 {{0,0}, {0,0}, {0,0}, {0,0}},
429 {{0,0}, {0,0}, {0,0}, {0,0}},
430 {{0,0}, {0,0}, {2,0}, {0,0}},
431 {{0,0}, {0,0}, {0,0}, {2,0}}
435 {{2,0}, {0,0}, {0,0}, {0,0}},
436 {{0,0}, {2,0}, {0,0}, {0,0}},
437 {{0,0}, {0,0}, {0,0}, {0,0}},
438 {{0,0}, {0,0}, {0,0}, {0,0}}
444 template <
typename Float>
446 for (
int i=0; i<4*3*2; i++) res[i] = 0.0;
448 for (
int s = 0;
s < 4;
s++) {
449 for (
int t = 0; t < 4; t++) {
453 for (
int m = 0; m < 3; m++) {
454 Float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
455 Float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
456 res[
s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
457 res[
s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
478 template <
typename sFloat,
typename gFloat>
480 int oddBit,
int daggerBit) {
484 for (
int i=0; i<
V5h*4*3*2; i++) res[i] = 0.0;
487 gFloat *gaugeEven[4], *gaugeOdd[4];
491 gaugeEven[
dir] = gaugeFull[
dir];
496 int sp_idx,oddBit_gge;
498 for (
int gge_idx = 0; gge_idx <
Vh; gge_idx++) {
500 sp_idx=gge_idx+Vh*
xs;
506 if ((xs % 2) == 0) oddBit_gge=oddBit;
507 else oddBit_gge= (oddBit+1) % 2;
513 sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
514 int projIdx = 2*(
dir/2)+(
dir+daggerBit)%2;
517 for (
int s = 0;
s < 4;
s++) {
519 su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
521 std::cout <<
"spinor:" << std::endl;
523 std::cout <<
"gauge:" << std::endl;
526 su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
530 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
537 template <
typename sFloat,
typename gFloat>
538 void dslashReference_4d_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor,
int oddBit,
int daggerBit)
544 gFloat *gaugeEven[4], *gaugeOdd[4];
545 gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
549 gaugeEven[
dir] = gaugeFull[
dir];
552 ghostGaugeEven[
dir] = ghostGauge[
dir];
558 for (
int i = 0; i <
Vh; i++)
565 if ((xs % 2) == 0) oddBit_gge=oddBit;
566 else oddBit_gge= (oddBit+1) % 2;
568 gFloat *
gauge =
gaugeLink_mgpu(i,
dir, oddBit_gge, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);
572 int projIdx = 2*(
dir/2)+(
dir+daggerBit)%2;
575 for (
int s = 0;
s < 4;
s++)
577 if (
dir % 2 == 0) su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
578 else su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
580 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
588 template <
typename sFloat>
590 int oddBit,
int daggerBit, sFloat mferm) {
591 for (
int i = 0; i <
V5h; i++) {
597 sFloat projectedSpinor[4*3*2];
598 int projIdx = 2*(
dir/2)+(
dir+daggerBit)%2;
602 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
604 if ( (xs == 0 &&
dir == 9) || (xs ==
Ls-1 &&
dir == 8) ) {
605 ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
607 sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
684 void **ghostGauge = (
void**)cpu.
Ghost();
694 for (
int d=0; d<4; d++) csParam.
x[d] =
Z[d];
711 else errorQuda(
"ERROR: full parity not supported in function %s", __FUNCTION__);
717 void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
718 void** back_nbr_spinor = inField.backGhostFaceBuffer;
722 dslashReference_4d_mgpu((
double*)out, (
double**)gauge, (
double**)ghostGauge, (
double*)in,(
double**)fwd_nbr_spinor, (
double**)back_nbr_spinor, oddBit, daggerBit);
727 (
float**)fwd_nbr_spinor, (
float**)back_nbr_spinor, oddBit, daggerBit);
746 dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
747 dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
751 else xpay((
float*)in, -(
float)kappa, (
float*)out,
V5*spinorSiteSize);
758 dw_mat(tmp, gauge, in, kappa, dagger_bit, precision, gauge_param, mferm);
759 dagger_bit = (dagger_bit == 1) ? 0 : 1;
760 dw_mat(out, gauge, tmp, kappa, dagger_bit, precision, gauge_param, mferm);
770 dw_dslash(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
771 dw_dslash(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
773 dw_dslash(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
774 dw_dslash(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
778 double kappa2 = -kappa*
kappa;
780 else xpay((
float*)in, (
float)kappa2, (
float*)out,
V5h*spinorSiteSize);