34 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
35 int x4 = (X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
36 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
37 int x2 = (X/
Z[0]) %
Z[1];
42 xs = (xs+dxs+
Ls) %
Ls;
44 x4 = (x4+dx4+
Z[3]) %
Z[3];
45 x3 = (x3+dx3+
Z[2]) %
Z[2];
46 x2 = (x2+dx2+
Z[1]) %
Z[1];
47 x1 = (x1+dx1+
Z[0]) %
Z[0];
50 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
59 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
60 int x4 = (X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
61 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
62 int x2 = (X/
Z[0]) %
Z[1];
67 xs = (xs+dxs+
Ls) %
Ls;
69 x4 = (x4+dx4+
Z[3]) %
Z[3];
70 x3 = (x3+dx3+
Z[2]) %
Z[2];
71 x2 = (x2+dx2+
Z[1]) %
Z[1];
72 x1 = (x1+dx1+
Z[0]) %
Z[0];
75 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
90 if (i < 0 || i >= (
Z[0]*
Z[1]*
Z[2]*
Z[3]/2))
91 { printf(
"i out of range in neighborIndex_4d\n"); exit(-1); }
96 int x4 = X/(
Z[2]*
Z[1]*
Z[0]);
97 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
98 int x2 = (X/
Z[0]) %
Z[1];
101 x4 = (x4+dx4+
Z[3]) %
Z[3];
102 x3 = (x3+dx3+
Z[2]) %
Z[2];
103 x2 = (x2+dx2+
Z[1]) %
Z[1];
104 x1 = (x1+dx1+
Z[0]) %
Z[0];
106 return (x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) +
x1) / 2;
118 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
119 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
120 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
121 int x2 = (Y/
Z[0]) %
Z[1];
124 int ghost_x4 = x4+ dx4;
126 xs = (xs+dxs+
Ls) %
Ls;
127 x4 = (x4+dx4+
Z[3]) %
Z[3];
128 x3 = (x3+dx3+
Z[2]) %
Z[2];
129 x2 = (x2+dx2+
Z[1]) %
Z[1];
130 x1 = (x1+dx1+
Z[0]) %
Z[0];
132 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
133 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
135 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
147 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
148 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
149 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
150 int x2 = (Y/
Z[0]) %
Z[1];
153 int ghost_x4 = x4+ dx4;
155 xs = (xs+dxs+
Ls) %
Ls;
156 x4 = (x4+dx4+
Z[3]) %
Z[3];
157 x3 = (x3+dx3+
Z[2]) %
Z[2];
158 x2 = (x2+dx2+
Z[1]) %
Z[1];
159 x1 = (x1+dx1+
Z[0]) %
Z[0];
161 if ( ghost_x4 >= 0 && ghost_x4 <
Z[3]){
162 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
164 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] +
x1) >> 1;
174 return (Y/(
Z[2]*
Z[1]*
Z[0])) % Z[3];
181 return (Y/(
Z[2]*
Z[1]*
Z[0])) % Z[3];
193 template <
typename Float>
203 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
212 default: j = -1;
break;
214 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
217 return &gaugeField[dir/2][j*(3*3*2)];
224 template <
typename Float>
228 int d = nbr_distance;
231 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
236 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
237 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
238 int x2 = (Y/
Z[0]) %
Z[1];
244 Float* ghostGaugeField;
249 int new_x1 = (x1 - d +
X1 )% X1;
251 ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
252 int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
253 return &ghostGaugeField[offset*(3*3*2)];
255 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
260 int new_x2 = (x2 - d +
X2 )% X2;
262 ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
263 int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
264 return &ghostGaugeField[offset*(3*3*2)];
266 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) / 2;
272 int new_x3 = (x3 - d +
X3 )% X3;
274 ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
275 int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
276 return &ghostGaugeField[offset*(3*3*2)];
278 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) / 2;
283 int new_x4 = (x4 - d +
X4)% X4;
285 ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
286 int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
287 return &ghostGaugeField[offset*(3*3*2)];
289 j = (new_x4*(X3*X2*
X1) + x3*(X2*X1) + x2*(
X1) + x1) / 2;
293 default: j = -1; printf(
"ERROR: wrong dir \n"); exit(1);
295 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
299 return &gaugeField[dir/2][j*(3*3*2)];
303 template <
typename Float>
307 int nb = neighbor_distance;
312 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
313 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
314 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
315 int x2 = (Y/
Z[0]) %
Z[1];
325 int new_x1 = (x1 + nb)% X1;
327 int offset = ((x1 + nb -
X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
330 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
336 int new_x1 = (x1 - nb +
X1)% X1;
338 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
341 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
346 int new_x2 = (x2 + nb)% X2;
348 int offset = (( x2 + nb -
X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
351 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
356 int new_x2 = (x2 - nb +
X2)% X2;
358 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
361 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
366 int new_x3 = (x3 + nb)% X3;
368 int offset = (( x3 + nb -
X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
371 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
376 int new_x3 = (x3 - nb +
X3)% X3;
378 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
381 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
388 if ( (x4 + nb) >=
Z[3])
390 int offset = (x4+nb -
Z[3])*
Vsh_t;
391 return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
400 int offset = ( x4 - nb +nFace)*
Vsh_t;
401 return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
405 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
411 template <
typename Float>
415 int nb = neighbor_distance;
420 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
421 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
422 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
423 int x2 = (Y/
Z[0]) %
Z[1];
433 int new_x1 = (x1 + nb)% X1;
435 int offset = ((x1 + nb -
X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
438 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
444 int new_x1 = (x1 - nb +
X1)% X1;
446 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
449 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
454 int new_x2 = (x2 + nb)% X2;
456 int offset = (( x2 + nb -
X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
459 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
464 int new_x2 = (x2 - nb +
X2)% X2;
466 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
469 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 +
x1) >> 1;
474 int new_x3 = (x3 + nb)% X3;
476 int offset = (( x3 + nb -
X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
479 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
484 int new_x3 = (x3 - nb +
X3)% X3;
486 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
489 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 +
x1) >> 1;
496 if ( (x4 + nb) >=
Z[3])
498 int offset = (x4+nb -
Z[3])*
Vsh_t;
499 return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
508 int offset = ( x4 - nb +nFace)*
Vsh_t;
509 return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
513 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
522 template <
typename Float>
536 default: j = -1;
break;
539 return &spinorField[j*(4*3*2)];
542 template <
typename Float>
556 default: j = -1;
break;
559 return &spinorField[j*(4*3*2)];
568 {{1,0}, {0,0}, {0,0}, {0,-1}},
569 {{0,0}, {1,0}, {0,-1}, {0,0}},
570 {{0,0}, {0,1}, {1,0}, {0,0}},
571 {{0,1}, {0,0}, {0,0}, {1,0}}
574 {{1,0}, {0,0}, {0,0}, {0,1}},
575 {{0,0}, {1,0}, {0,1}, {0,0}},
576 {{0,0}, {0,-1}, {1,0}, {0,0}},
577 {{0,-1}, {0,0}, {0,0}, {1,0}}
580 {{1,0}, {0,0}, {0,0}, {1,0}},
581 {{0,0}, {1,0}, {-1,0}, {0,0}},
582 {{0,0}, {-1,0}, {1,0}, {0,0}},
583 {{1,0}, {0,0}, {0,0}, {1,0}}
586 {{1,0}, {0,0}, {0,0}, {-1,0}},
587 {{0,0}, {1,0}, {1,0}, {0,0}},
588 {{0,0}, {1,0}, {1,0}, {0,0}},
589 {{-1,0}, {0,0}, {0,0}, {1,0}}
592 {{1,0}, {0,0}, {0,-1}, {0,0}},
593 {{0,0}, {1,0}, {0,0}, {0,1}},
594 {{0,1}, {0,0}, {1,0}, {0,0}},
595 {{0,0}, {0,-1}, {0,0}, {1,0}}
598 {{1,0}, {0,0}, {0,1}, {0,0}},
599 {{0,0}, {1,0}, {0,0}, {0,-1}},
600 {{0,-1}, {0,0}, {1,0}, {0,0}},
601 {{0,0}, {0,1}, {0,0}, {1,0}}
604 {{1,0}, {0,0}, {-1,0}, {0,0}},
605 {{0,0}, {1,0}, {0,0}, {-1,0}},
606 {{-1,0}, {0,0}, {1,0}, {0,0}},
607 {{0,0}, {-1,0}, {0,0}, {1,0}}
610 {{1,0}, {0,0}, {1,0}, {0,0}},
611 {{0,0}, {1,0}, {0,0}, {1,0}},
612 {{1,0}, {0,0}, {1,0}, {0,0}},
613 {{0,0}, {1,0}, {0,0}, {1,0}}
617 {{0,0}, {0,0}, {0,0}, {0,0}},
618 {{0,0}, {0,0}, {0,0}, {0,0}},
619 {{0,0}, {0,0}, {2,0}, {0,0}},
620 {{0,0}, {0,0}, {0,0}, {2,0}}
624 {{2,0}, {0,0}, {0,0}, {0,0}},
625 {{0,0}, {2,0}, {0,0}, {0,0}},
626 {{0,0}, {0,0}, {0,0}, {0,0}},
627 {{0,0}, {0,0}, {0,0}, {0,0}}
633 template <
typename Float>
635 for (
int i=0; i<4*3*2; i++) res[i] = 0.0;
637 for (
int s = 0;
s < 4;
s++) {
638 for (
int t = 0; t < 4; t++) {
642 for (
int m = 0; m < 3; m++) {
643 Float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
644 Float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
645 res[
s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
646 res[
s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
667 template <
typename sFloat,
typename gFloat>
669 int oddBit,
int daggerBit) {
673 for (
int i=0; i<
V5h*4*3*2; i++) res[i] = 0.0;
676 gFloat *gaugeEven[4], *gaugeOdd[4];
679 for (
int dir = 0; dir < 4; dir++) {
680 gaugeEven[dir] = gaugeFull[dir];
687 for (
int gge_idx = 0; gge_idx <
Vh; gge_idx++) {
688 for (
int dir = 0; dir < 8; dir++) {
689 sp_idx=gge_idx+Vh*
xs;
695 if ((xs % 2) == 0) oddBit_gge=
oddBit;
696 else oddBit_gge= (oddBit+1) % 2;
702 sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
703 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
706 for (
int s = 0;
s < 4;
s++) {
708 su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
710 std::cout <<
"spinor:" << std::endl;
712 std::cout <<
"gauge:" << std::endl;
715 su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
719 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
725 template <
typename sFloat,
typename gFloat>
727 int oddBit,
int daggerBit) {
731 for (
int i=0; i<
V5h*4*3*2; i++) res[i] = 0.0;
734 gFloat *gaugeEven[4], *gaugeOdd[4];
737 for (
int dir = 0; dir < 4; dir++) {
738 gaugeEven[dir] = gaugeFull[dir];
745 for (
int gge_idx = 0; gge_idx <
Vh; gge_idx++) {
746 for (
int dir = 0; dir < 8; dir++) {
747 sp_idx=gge_idx+Vh*
xs;
760 sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
761 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
764 for (
int s = 0;
s < 4;
s++) {
766 su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
768 std::cout <<
"spinor:" << std::endl;
770 std::cout <<
"gauge:" << std::endl;
773 su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
777 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
784 template <
typename sFloat,
typename gFloat>
785 void dslashReference_4d_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor,
int oddBit,
int daggerBit)
791 gFloat *gaugeEven[4], *gaugeOdd[4];
792 gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
794 for (
int dir = 0; dir < 4; dir++)
796 gaugeEven[dir] = gaugeFull[dir];
799 ghostGaugeEven[dir] = ghostGauge[dir];
800 ghostGaugeOdd[dir] = ghostGauge[dir] + (
faceVolume[dir]/2)*gaugeSiteSize;
805 for (
int i = 0; i <
Vh; i++)
808 for (
int dir = 0; dir < 8; dir++)
812 if ((xs % 2) == 0) oddBit_gge=oddBit;
813 else oddBit_gge= (oddBit+1) % 2;
815 gFloat *
gauge =
gaugeLink_mgpu(i, dir, oddBit_gge, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);
819 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
822 for (
int s = 0;
s < 4;
s++)
824 if (dir % 2 == 0) su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
825 else su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
827 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
833 template <
typename sFloat,
typename gFloat>
840 gFloat *gaugeEven[4], *gaugeOdd[4];
841 gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
843 for (
int dir = 0; dir < 4; dir++)
845 gaugeEven[dir] = gaugeFull[dir];
848 ghostGaugeEven[dir] = ghostGauge[dir];
849 ghostGaugeOdd[dir] = ghostGauge[dir] + (
faceVolume[dir]/2)*gaugeSiteSize;
854 for (
int i = 0; i <
Vh; i++)
857 for (
int dir = 0; dir < 8; dir++)
864 gFloat *
gauge =
gaugeLink_mgpu(i, dir, oddBit, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);
869 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
872 for (
int s = 0;
s < 4;
s++)
874 if (dir % 2 == 0) su3Mul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
875 else su3Tmul(&gaugedSpinor[
s*(3*2)], gauge, &projectedSpinor[
s*(3*2)]);
877 sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
885 template <
typename sFloat>
887 int oddBit,
int daggerBit, sFloat mferm) {
888 for (
int i = 0; i <
V5h; i++) {
889 for (
int dir = 8; dir < 10; dir++) {
894 sFloat projectedSpinor[4*3*2];
895 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
899 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
901 if ( (xs == 0 && dir == 9) || (xs ==
Ls-1 && dir == 8) ) {
902 ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
904 sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
910 template <
typename sFloat>
912 int oddBit,
int daggerBit, sFloat mferm) {
913 for (
int i = 0; i <
V5h; i++) {
914 for(
int one_site = 0 ; one_site < 24 ; one_site++)
915 res[i*(4*3*2)+one_site] = 0.0;
916 for (
int dir = 8; dir < 10; dir++) {
921 sFloat projectedSpinor[4*3*2];
922 int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
926 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
928 if ( (xs == 0 && dir == 9) || (xs ==
Ls-1 && dir == 8) ) {
929 ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
931 sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
937 template <
typename sFloat>
939 int oddBit,
int daggerBit, sFloat mferm,
double *
kappa) {
940 double *inv_Ftr = (
double*)malloc(
Ls*
sizeof(sFloat));
941 double *Ftr = (
double*)malloc(
Ls*
sizeof(sFloat));
944 inv_Ftr[
xs] = 1.0/(1.0+
pow(2.0*kappa[
xs], Ls)*mferm);
945 Ftr[
xs] = -2.0*kappa[
xs]*mferm*inv_Ftr[
xs];
946 for (
int i = 0; i <
Vh; i++) {
947 memcpy(&res[24*(i+Vh*xs)], &spinorField[24*(i+Vh*xs)], 24*
sizeof(sFloat));
953 for (
int i = 0; i <
Vh; i++) {
954 ax(&res[12+24*(i+Vh*(Ls-1))],(sFloat)(inv_Ftr[0]), &spinorField[12+24*(i+Vh*(Ls-1))], 12);
958 for(
int xs = 0 ;
xs <= Ls-2 ; ++
xs)
960 for (
int i = 0; i <
Vh; i++) {
961 axpy((sFloat)(2.0*kappa[
xs]), &res[24*(i+Vh*xs)], &res[24*(i+Vh*(xs+1))], 12);
962 axpy((sFloat)Ftr[xs], &res[12+24*(i+Vh*xs)], &res[12+24*(i+Vh*(Ls-1))], 12);
964 for (
int tmp_s = 0 ; tmp_s <
Ls ; tmp_s++)
965 Ftr[tmp_s] *= 2.0*kappa[tmp_s];
969 Ftr[
xs] = -
pow(2.0*kappa[
xs],Ls-1)*mferm*inv_Ftr[
xs];
972 for(
int xs = Ls-2 ;
xs >=0 ; --
xs)
974 for (
int i = 0; i <
Vh; i++) {
975 axpy((sFloat)Ftr[
xs], &res[24*(i+Vh*(Ls-1))], &res[24*(i+Vh*xs)], 12);
976 axpy((sFloat)(2.0*kappa[xs]), &res[12+24*(i+Vh*(xs+1))], &res[12+24*(i+Vh*xs)], 12);
978 for (
int tmp_s = 0 ; tmp_s <
Ls ; tmp_s++)
979 Ftr[tmp_s] /= 2.0*kappa[tmp_s];
982 for (
int i = 0; i <
Vh; i++) {
983 ax(&res[24*(i+Vh*(Ls-1))], (sFloat)(inv_Ftr[Ls-1]), &res[24*(i+Vh*(Ls-1))], 12);
989 for (
int i = 0; i <
Vh; i++) {
990 ax(&res[24*(i+Vh*(Ls-1))],(sFloat)(inv_Ftr[0]), &spinorField[24*(i+Vh*(Ls-1))], 12);
994 for(
int xs = 0 ;
xs <= Ls-2 ; ++
xs)
996 for (
int i = 0; i <
Vh; i++) {
997 axpy((sFloat)Ftr[
xs], &res[24*(i+Vh*xs)], &res[24*(i+Vh*(Ls-1))], 12);
998 axpy((sFloat)(2.0*kappa[xs]), &res[12+24*(i+Vh*xs)], &res[12+24*(i+Vh*(xs+1))], 12);
1000 for (
int tmp_s = 0 ; tmp_s <
Ls ; tmp_s++)
1001 Ftr[tmp_s] *= 2.0*kappa[tmp_s];
1005 Ftr[
xs] = -
pow(2.0*kappa[
xs],Ls-1)*mferm*inv_Ftr[
xs];
1008 for(
int xs = Ls-2 ;
xs >=0 ; --
xs)
1010 for (
int i = 0; i <
Vh; i++) {
1011 axpy((sFloat)(2.0*kappa[
xs]), &res[24*(i+Vh*(xs+1))], &res[24*(i+Vh*xs)], 12);
1012 axpy((sFloat)Ftr[xs], &res[12+24*(i+Vh*(Ls-1))], &res[12+24*(i+Vh*xs)], 12);
1014 for (
int tmp_s = 0 ; tmp_s <
Ls ; tmp_s++)
1015 Ftr[tmp_s] /= 2.0*kappa[tmp_s];
1018 for (
int i = 0; i <
Vh; i++) {
1019 ax(&res[12+24*(i+Vh*(Ls-1))], (sFloat)(inv_Ftr[Ls-1]), &res[12+24*(i+Vh*(Ls-1))], 12);
1097 void **ghostGauge = (
void**)cpu.
Ghost();
1107 for (
int d=0; d<4; d++) csParam.
x[d] =
Z[d];
1125 else errorQuda(
"ERROR: full parity not supported in function %s", __FUNCTION__);
1131 void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
1132 void** back_nbr_spinor = inField.backGhostFaceBuffer;
1136 dslashReference_4d_mgpu((
double*)out, (
double**)gauge, (
double**)ghostGauge, (
double*)in,(
double**)fwd_nbr_spinor, (
double**)back_nbr_spinor, oddBit, daggerBit);
1141 (
float**)fwd_nbr_spinor, (
float**)back_nbr_spinor, oddBit, daggerBit);
1176 void **ghostGauge = (
void**)cpu.
Ghost();
1186 for (
int d=0; d<4; d++) csParam.
x[d] =
Z[d];
1204 else errorQuda(
"ERROR: full parity not supported in function %s", __FUNCTION__);
1210 void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
1211 void** back_nbr_spinor = inField.backGhostFaceBuffer;
1215 dslashReference_4d_4dpc_mgpu((
double*)out, (
double**)gauge, (
double**)ghostGauge, (
double*)in,(
double**)fwd_nbr_spinor, (
double**)back_nbr_spinor, oddBit, daggerBit);
1219 (
float**)fwd_nbr_spinor, (
float**)back_nbr_spinor, oddBit, daggerBit);
1283 void *outEven =
out;
1286 dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
1287 dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
1291 else xpay((
float*)in, -(
float)kappa, (
float*)out,
V5*spinorSiteSize);
1329 dw_mat(tmp, gauge, in, kappa, dagger_bit, precision, gauge_param, mferm);
1330 dagger_bit = (dagger_bit == 1) ? 0 : 1;
1331 dw_mat(out, gauge, tmp, kappa, dagger_bit, precision, gauge_param, mferm);
1341 dw_dslash(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1342 dw_dslash(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
1344 dw_dslash(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1345 dw_dslash(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
1349 double kappa2 = -kappa*
kappa;
1351 else xpay((
float*)in, (
float)kappa2, (
float*)out,
V5h*spinorSiteSize);
1359 double kappa2 = -kappa*
kappa;
1360 double *
kappa5 = (
double*)malloc(
Ls*
sizeof(
double));
1365 double *output = (
double*)out;
1371 dslash_4_4d(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1372 dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1373 dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1375 else xpay((
float*)in, (
float)kappa2, (
float*)tmp,
V5h*spinorSiteSize);
1376 dw_dslash_5_4d(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1378 else xpay((
float*)tmp, -(
float)kappa, (
float*)out,
V5h*spinorSiteSize);
1380 dslash_4_4d(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1381 dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1382 dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1384 else xpay((
float*)in, (
float)kappa2, (
float*)tmp,
V5h*spinorSiteSize);
1385 dw_dslash_5_4d(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1387 else xpay((
float*)tmp, -(
float)kappa, (
float*)out,
V5h*spinorSiteSize);
1393 void mdw_matpc(
void *
out,
void **
gauge,
void *
in,
double *kappa_b,
double *kappa_c,
QudaMatPCType matpc_type,
int dagger_bit,
QudaPrecision precision,
QudaGaugeParam &
gauge_param,
double mferm,
double *b5,
double *c5)
1396 double *
kappa5 = (
double*)malloc(
Ls*
sizeof(
double));
1397 double *kappa2 = (
double*)malloc(
Ls*
sizeof(
double));
1398 double *kappa_mdwf = (
double*)malloc(
Ls*
sizeof(
double));
1401 kappa5[
xs] = 0.5*kappa_b[
xs]/kappa_c[
xs];
1402 kappa2[
xs] = -kappa_b[
xs]*kappa_b[
xs];
1403 kappa_mdwf[
xs] = -kappa5[
xs];
1409 mdw_dslash_4_pre(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1410 dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1411 dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1412 mdw_dslash_4_pre(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1413 dslash_4_4d(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
1414 mdw_dslash_5(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1418 else xpay((
float*)tmp +
Vh*spinorSiteSize*xs, (
float)kappa2[xs], (
float*)out +
Vh*spinorSiteSize*xs,
Vh*spinorSiteSize);
1422 mdw_dslash_4_pre(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1423 dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1424 dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1425 mdw_dslash_4_pre(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1426 dslash_4_4d(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
1427 mdw_dslash_5(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1431 else xpay((
float*)tmp +
Vh*spinorSiteSize*xs, (
float)kappa2[xs], (
float*)out +
Vh*spinorSiteSize*xs,
Vh*spinorSiteSize);
1437 dslash_4_4d(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1438 mdw_dslash_4_pre(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1439 dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1440 dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1441 mdw_dslash_4_pre(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1442 mdw_dslash_5(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1446 else xpay((
float*)tmp +
Vh*spinorSiteSize*xs, (
float)kappa2[xs], (
float*)out +
Vh*spinorSiteSize*xs,
Vh*spinorSiteSize);
1450 dslash_4_4d(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1451 mdw_dslash_4_pre(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1452 dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1453 dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1454 mdw_dslash_4_pre(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1455 mdw_dslash_5(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1459 else xpay((
float*)tmp +
Vh*spinorSiteSize*xs, (
float)kappa2[xs], (
float*)out +
Vh*spinorSiteSize*xs,
Vh*spinorSiteSize);
QudaGaugeParam gauge_param
Float * spinorNeighbor_5d_4dpc(int i, int dir, int oddBit, Float *spinorField)
void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
const void ** Ghost() const
enum QudaPrecision_s QudaPrecision
int neighborIndex_5d_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
void dslashReference_5th_4d(sFloat *res, sFloat *spinorField, int oddBit, int daggerBit, sFloat mferm)
void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5)
void axpby(const Float &a, const Float *x, const Float &b, Float *y, const int N)
void printSpinorElement(void *spinor, int X, QudaPrecision precision)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
void dslashReference_4d_4dpc_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor, int oddBit, int daggerBit)
int x4_5d_mgpu(int i, int oddBit)
void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
cpuColorSpinorField * spinor
QudaSiteSubset siteSubset
void dslashReference_4d_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor, int oddBit, int daggerBit)
void ax(double a, void *x, int len, QudaPrecision precision)
int fullLatticeIndex(int dim[4], int index, int oddBit)
void matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
cudaColorSpinorField * tmp
VOLATILE spinorFloat kappa
QudaFieldOrder fieldOrder
Float * spinorNeighbor_5d_mgpu(int i, int dir, int oddBit, Float *spinorField, Float **fwd_nbr_spinor, Float **back_nbr_spinor, int neighbor_distance, int nFace)
FloatingPoint< float > Float
void exchangeCpuSpinor(quda::cpuColorSpinorField &in, int parity, int dagger)
Float * gaugeLink_sgpu(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd)
enum QudaMatPCType_s QudaMatPCType
QudaGammaBasis gammaBasis
Float * spinorNeighbor_5d_4dpc_mgpu(int i, int dir, int oddBit, Float *spinorField, Float **fwd_nbr_spinor, Float **back_nbr_spinor, int neighbor_distance, int nFace)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
enum QudaParity_s QudaParity
void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
void matpcdagmatpc(void *out, void **gauge, void *in, double kappa, QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm, QudaMatPCType matpc_type)
Float * spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField)
void dslashReference_5th_inv(sFloat *res, sFloat *spinorField, int oddBit, int daggerBit, sFloat mferm, double *kappa)
void axpy(double a, void *x, void *y, int len, QudaPrecision precision)
int fullLatticeIndex_5d(int i, int oddBit)
cpuColorSpinorField * out
void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5)
void dslashReference_4d_4dpc_sgpu(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, int oddBit, int daggerBit)
Main header file for the QUDA library.
void dslashReference_4d_sgpu(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, int oddBit, int daggerBit)
int fullLatticeIndex_4d(int i, int oddBit)
const double projector[10][4][4][2]
int neighborIndex_5d_4dpc(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void matdagmat(void *out, void **gauge, void *in, double kappa, QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
Float * gaugeLink_mgpu(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, Float **ghostGaugeEven, Float **ghostGaugeOdd, int n_ghost_faces, int nbr_distance)
int neighborIndex_4d(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
int neighborIndex_5d_4dpc_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
void multiplySpinorByDiracProjector5(Float *res, int projIdx, Float *spinorIn)
int x4_5d_4dpc_mgpu(int i, int oddBit)
void dslashReference_5th(sFloat *res, sFloat *spinorField, int oddBit, int daggerBit, sFloat mferm)