6 template <
typename Float>
8 for (
int i = 0; i < cnt; i++)
12 template <
typename Float>
14 for (
int i = 0; i < cnt; i++)
18 template <
typename Float>
20 for (
int i = 0; i < cnt; i++)
25 template <
typename Float>
27 for (
int i=0; i<len; i++) y[i] = a*x[i] + y[i];
31 template <
typename Float>
33 for (
int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
37 template <
typename Float>
39 for (
int i=0; i<len; i++) y[i] = a*x[i] - y[i];
42 template <
typename Float>
45 for (
int i=0; i<len; i++) sum += v[i]*v[i];
49 template <
typename Float>
50 static inline void negx(
Float *x,
int len) {
51 for (
int i=0; i<len; i++) x[i] = -x[i];
54 template <
typename sFloat,
typename gFloat>
55 static inline void dot(sFloat* res, gFloat* a, sFloat* b) {
57 for (
int m = 0; m < 3; m++) {
58 sFloat a_re = a[2*m+0];
59 sFloat a_im = a[2*m+1];
60 sFloat b_re = b[2*m+0];
61 sFloat b_im = b[2*m+1];
62 res[0] += a_re * b_re - a_im * b_im;
63 res[1] += a_re * b_im + a_im * b_re;
67 template <
typename Float>
69 for (
int m = 0; m < 3; m++) {
70 for (
int n = 0; n < 3; n++) {
71 res[m*(3*2) + n*(2) + 0] = +
mat[n*(3*2) + m*(2) + 0];
72 res[m*(3*2) + n*(2) + 1] = -
mat[n*(3*2) + m*(2) + 1];
78 template <
typename sFloat,
typename gFloat>
79 static inline void su3Mul(sFloat *res, gFloat *
mat, sFloat *vec) {
80 for (
int n = 0; n < 3; n++) dot(&res[n*(2)], &
mat[n*(3*2)], vec);
83 template <
typename sFloat,
typename gFloat>
84 static inline void su3Tmul(sFloat *res, gFloat *
mat, sFloat *vec) {
86 su3Transpose(matT,
mat);
87 su3Mul(res, matT, vec);
120 template <
typename Float>
121 static inline Float *gaugeLink(
int i,
int dir,
int oddBit,
Float **gaugeEven,
Float **gaugeOdd,
int nbr_distance) {
124 int d = nbr_distance;
127 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
135 default: j = -1;
break;
137 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
140 return &gaugeField[dir/2][j*(3*3*2)];
143 template <
typename Float>
144 static inline Float *spinorNeighbor(
int i,
int dir,
int oddBit,
Float *spinorField,
int neighbor_distance,
148 int nb = neighbor_distance;
158 default: j = -1;
break;
161 return &spinorField[j * site_size];
175 template <QudaPCType type>
int neighborIndex_5d(
int i,
int oddBit,
int dxs,
int dx4,
int dx3,
int dx2,
int dx1)
181 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
182 int x4 = (X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
183 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
184 int x2 = (X/
Z[0]) %
Z[1];
189 xs = (xs+dxs+
Ls) %
Ls;
191 x4 = (x4+dx4+
Z[3]) %
Z[3];
192 x3 = (x3+dx3+
Z[2]) %
Z[2];
193 x2 = (x2+dx2+
Z[1]) %
Z[1];
194 x1 = (x1+dx1+
Z[0]) %
Z[0];
197 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
200 template <QudaPCType type,
typename Float>
203 int nb = neighbor_distance;
206 case 0: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, 0, +nb);
break;
207 case 1: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, 0, -nb);
break;
208 case 2: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, +nb, 0);
break;
209 case 3: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, -nb, 0);
break;
210 case 4: j = neighborIndex_5d<type>(i, oddBit, 0, 0, +nb, 0, 0);
break;
211 case 5: j = neighborIndex_5d<type>(i, oddBit, 0, 0, -nb, 0, 0);
break;
212 case 6: j = neighborIndex_5d<type>(i, oddBit, 0, +nb, 0, 0, 0);
break;
213 case 7: j = neighborIndex_5d<type>(i, oddBit, 0, -nb, 0, 0, 0);
break;
214 case 8: j = neighborIndex_5d<type>(i, oddBit, +nb, 0, 0, 0, 0);
break;
215 case 9: j = neighborIndex_5d<type>(i, oddBit, -nb, 0, 0, 0, 0);
break;
216 default: j = -1;
break;
218 return &spinorField[j * site_size];
222 inline int x4_mg(
int i,
int oddBit)
225 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
229 template <
typename Float>
230 static inline Float *gaugeLink_mg4dir(
int i,
int dir,
int oddBit,
Float **gaugeEven,
Float **gaugeOdd,
231 Float** ghostGaugeEven,
Float** ghostGaugeOdd,
int n_ghost_faces,
int nbr_distance) {
234 int d = nbr_distance;
237 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
242 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
243 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
244 int x2 = (Y/
Z[0]) %
Z[1];
250 Float* ghostGaugeField;
255 int new_x1 = (x1 - d + X1 )% X1;
257 ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
258 int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
259 return &ghostGaugeField[offset*(3*3*2)];
261 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
266 int new_x2 = (x2 - d + X2 )% X2;
268 ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
269 int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
270 return &ghostGaugeField[offset*(3*3*2)];
272 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
278 int new_x3 = (x3 - d + X3 )% X3;
280 ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
281 int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
282 return &ghostGaugeField[offset*(3*3*2)];
284 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
289 int new_x4 = (x4 - d + X4)% X4;
291 ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
292 int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
293 return &ghostGaugeField[offset*(3*3*2)];
295 j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
299 default: j = -1; printf(
"ERROR: wrong dir \n"); exit(1);
301 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
305 return &gaugeField[dir/2][j*(3*3*2)];
308 template <
typename Float>
309 static inline Float *spinorNeighbor_mg4dir(
int i,
int dir,
int oddBit,
Float *spinorField,
Float **fwd_nbr_spinor,
310 Float **back_nbr_spinor,
int neighbor_distance,
int nFace,
int site_size = 24)
313 int nb = neighbor_distance;
315 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
316 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
317 int x2 = (Y/
Z[0]) %
Z[1];
327 int new_x1 = (x1 + nb)% X1;
329 int offset = ( x1 + nb -X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
330 return fwd_nbr_spinor[0] + offset * site_size;
332 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
337 int new_x1 = (x1 - nb + X1)% X1;
339 int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
340 return back_nbr_spinor[0] + offset * site_size;
342 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
347 int new_x2 = (x2 + nb)% X2;
349 int offset = ( x2 + nb -X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
350 return fwd_nbr_spinor[1] + offset * site_size;
352 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
357 int new_x2 = (x2 - nb + X2)% X2;
359 int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
360 return back_nbr_spinor[1] + offset * site_size;
362 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
367 int new_x3 = (x3 + nb)% X3;
369 int offset = ( x3 + nb -X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
370 return fwd_nbr_spinor[2] + offset * site_size;
372 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
377 int new_x3 = (x3 - nb + X3)% X3;
379 int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
380 return back_nbr_spinor[2] + offset * site_size;
382 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
388 int x4 = x4_mg(i, oddBit);
390 int offset = (x4+nb -
Z[3])*
Vsh_t;
391 return &fwd_nbr_spinor[3][(offset + j) * site_size];
398 int x4 = x4_mg(i, oddBit);
400 int offset = ( x4 - nb +nFace)*
Vsh_t;
401 return &back_nbr_spinor[3][(offset + j) * site_size];
405 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
408 return &spinorField[j * site_size];
411 template <QudaPCType type>
int neighborIndex_5d_mgpu(
int i,
int oddBit,
int dxs,
int dx4,
int dx3,
int dx2,
int dx1)
417 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
418 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
419 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
420 int x2 = (Y/
Z[0]) %
Z[1];
422 int ghost_x4 = x4+ dx4;
424 xs = (xs+dxs+
Ls) %
Ls;
425 x4 = (x4+dx4+
Z[3]) %
Z[3];
426 x3 = (x3+dx3+
Z[2]) %
Z[2];
427 x2 = (x2+dx2+
Z[1]) %
Z[1];
428 x1 = (x1+dx1+
Z[0]) %
Z[0];
431 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
433 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
439 template <QudaPCType type>
int x4_5d_mgpu(
int i,
int oddBit)
442 return (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
445 template <QudaPCType type,
typename Float>
446 Float *spinorNeighbor_5d_mgpu(
int i,
int dir,
int oddBit,
Float *spinorField,
Float **fwd_nbr_spinor,
447 Float **back_nbr_spinor,
int neighbor_distance,
int nFace,
int site_size = 24)
450 int nb = neighbor_distance;
453 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
454 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
455 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
456 int x2 = (Y/
Z[0]) %
Z[1];
466 int new_x1 = (x1 + nb)% X1;
468 int offset = ((x1 + nb -X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
469 return fwd_nbr_spinor[0] + offset * site_size;
471 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
476 int new_x1 = (x1 - nb + X1)% X1;
478 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
479 return back_nbr_spinor[0] + offset * site_size;
481 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
486 int new_x2 = (x2 + nb)% X2;
488 int offset = (( x2 + nb -X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
489 return fwd_nbr_spinor[1] + offset * site_size;
491 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
496 int new_x2 = (x2 - nb + X2)% X2;
498 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
499 return back_nbr_spinor[1] + offset * site_size;
501 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
506 int new_x3 = (x3 + nb)% X3;
508 int offset = (( x3 + nb -X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
509 return fwd_nbr_spinor[2] + offset * site_size;
511 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
516 int new_x3 = (x3 - nb + X3)% X3;
518 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
519 return back_nbr_spinor[2] + offset * site_size;
521 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
526 int x4 = x4_5d_mgpu<type>(i, oddBit);
528 int offset = ((x4 + nb -
Z[3])*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
529 return fwd_nbr_spinor[3] + offset * site_size;
531 j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, +nb, 0, 0, 0);
536 int x4 = x4_5d_mgpu<type>(i, oddBit);
538 int offset = (( x4 - nb +nFace)*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
539 return back_nbr_spinor[3] + offset * site_size;
541 j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, -nb, 0, 0, 0);
544 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
547 return &spinorField[j * site_size];
int comm_dim_partitioned(int dim)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
cudaColorSpinorField * tmp
cpuColorSpinorField * spinorOut
QudaGaugeParam gauge_param
QudaInvertParam inv_param
Float * spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance=1, int site_size=24)
void verifyInversion(void *spinorOut, void *spinorIn, void *spinorCheck, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv)
void verifyStaggeredInversion(quda::ColorSpinorField *tmp, quda::ColorSpinorField *ref, quda::ColorSpinorField *in, quda::ColorSpinorField *out, double mass, void *qdp_fatlink[], void *qdp_longlink[], void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift)
int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
void verifyDomainWallTypeInversion(void *spinorOut, void **spinorOutMulti, void *spinorIn, void *spinorCheck, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv)
void verifyWilsonTypeInversion(void *spinorOut, void **spinorOutMulti, void *spinorIn, void *spinorCheck, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv)
int fullLatticeIndex_5d(int i, int oddBit)
int fullLatticeIndex(int dim[4], int index, int oddBit)
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
void ax(double a, ColorSpinorField &x)
double norm2(const ColorSpinorField &a)
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
void axpby(double a, ColorSpinorField &x, double b, ColorSpinorField &y)
FloatingPoint< float > Float
__host__ __device__ T sum(const array< T, s > &a)