7 template <
typename Float>
8 static inline void sum(Float *dst, Float *
a, Float *
b,
int cnt) {
9 for (
int i = 0;
i <
cnt;
i++)
13 template <
typename Float>
14 static inline void sub(Float *dst, Float *
a, Float *
b,
int cnt) {
15 for (
int i = 0;
i <
cnt;
i++)
19 template <
typename Float>
20 static inline void ax(Float *dst, Float
a, Float *
x,
int cnt) {
21 for (
int i = 0;
i <
cnt;
i++)
26 template <
typename Float>
27 static inline void axpy(Float
a, Float *
x, Float *
y,
int len) {
32 template <
typename Float>
33 static inline void axpby(Float
a, Float *
x, Float
b, Float *
y,
int len) {
38 template <
typename Float>
39 static inline void axmy(Float *
x, Float
a, Float *
y,
int len) {
43 template <
typename Float>
50 template <
typename Float>
55 template <
typename sFloat,
typename gFloat>
56 static inline void dot(sFloat* res, gFloat*
a, sFloat*
b) {
58 for (
int m = 0; m < 3; m++) {
59 sFloat
a_re =
a[2*m+0];
60 sFloat
a_im =
a[2*m+1];
61 sFloat b_re =
b[2*m+0];
62 sFloat b_im =
b[2*m+1];
68 template <
typename Float>
70 for (
int m = 0; m < 3; m++) {
71 for (
int n = 0;
n < 3;
n++) {
72 res[m*(3*2) +
n*(2) + 0] = +
mat[
n*(3*2) + m*(2) + 0];
73 res[m*(3*2) +
n*(2) + 1] = -
mat[
n*(3*2) + m*(2) + 1];
79 template <
typename sFloat,
typename gFloat>
80 static inline void su3Mul(sFloat *res, gFloat *
mat, sFloat *vec) {
81 for (
int n = 0;
n < 3;
n++)
dot(&res[
n*(2)], &
mat[
n*(3*2)], vec);
84 template <
typename sFloat,
typename gFloat>
85 static inline void su3Tmul(sFloat *res, gFloat *
mat, sFloat *vec) {
103 template <
typename Float>
104 static inline Float *
gaugeLink(
int i,
int dir,
int oddBit, Float **gaugeEven, Float **gaugeOdd,
int nbr_distance) {
107 int d = nbr_distance;
110 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
118 default: j = -1;
break;
120 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
123 return &gaugeField[dir/2][j*(3*3*2)];
126 template <
typename Float>
127 static inline Float *
spinorNeighbor(
int i,
int dir,
int oddBit, Float *spinorField,
int neighbor_distance)
130 int nb = neighbor_distance;
140 default: j = -1;
break;
157 template<QudaDWFPCType type>
163 int xs =
X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
164 int x4 = (
X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
165 int x3 = (
X/(
Z[1]*
Z[0])) %
Z[2];
166 int x2 = (
X/
Z[0]) %
Z[1];
171 xs = (xs+dxs+
Ls) %
Ls;
173 x4 = (x4+dx4+
Z[3]) %
Z[3];
174 x3 = (x3+dx3+
Z[2]) %
Z[2];
175 x2 = (x2+dx2+
Z[1]) %
Z[1];
176 x1 = (x1+dx1+
Z[0]) %
Z[0];
179 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(
Z[2]*
Z[1]*
Z[0]) + x3*(
Z[1]*
Z[0]) + x2*(
Z[0]) + x1) / 2;
183 template <QudaDWFPCType type,
typename Float>
184 Float *
spinorNeighbor_5d(
int i,
int dir,
int oddBit, Float *spinorField,
int neighbor_distance=1,
int siteSize=24) {
185 int nb = neighbor_distance;
188 case 0: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, 0, 0, +nb);
break;
189 case 1: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, 0, 0, -nb);
break;
190 case 2: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, 0, +nb, 0);
break;
191 case 3: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, 0, -nb, 0);
break;
192 case 4: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, +nb, 0, 0);
break;
193 case 5: j = neighborIndex_5d<type>(
i, oddBit, 0, 0, -nb, 0, 0);
break;
194 case 6: j = neighborIndex_5d<type>(
i, oddBit, 0, +nb, 0, 0, 0);
break;
195 case 7: j = neighborIndex_5d<type>(
i, oddBit, 0, -nb, 0, 0, 0);
break;
196 case 8: j = neighborIndex_5d<type>(
i, oddBit, +nb, 0, 0, 0, 0);
break;
197 case 9: j = neighborIndex_5d<type>(
i, oddBit, -nb, 0, 0, 0, 0);
break;
198 default: j = -1;
break;
200 return &spinorField[j*siteSize];
207 x4_mg(
int i,
int oddBit)
210 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
214 template <
typename Float>
215 static inline Float *gaugeLink_mg4dir(
int i,
int dir,
int oddBit, Float **gaugeEven, Float **gaugeOdd,
216 Float** ghostGaugeEven, Float** ghostGaugeOdd,
int n_ghost_faces,
int nbr_distance) {
219 int d = nbr_distance;
222 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
227 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
228 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
229 int x2 = (Y/
Z[0]) %
Z[1];
235 Float* ghostGaugeField;
240 int new_x1 = (x1 -
d + X1 )% X1;
242 ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
243 int offset = (n_ghost_faces + x1 -
d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
244 return &ghostGaugeField[
offset*(3*3*2)];
246 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
251 int new_x2 = (x2 -
d + X2 )% X2;
253 ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
254 int offset = (n_ghost_faces + x2 -
d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
255 return &ghostGaugeField[
offset*(3*3*2)];
257 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
263 int new_x3 = (x3 -
d + X3 )% X3;
265 ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
266 int offset = (n_ghost_faces + x3 -
d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
267 return &ghostGaugeField[
offset*(3*3*2)];
269 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
274 int new_x4 = (x4 -
d + X4)% X4;
276 ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
277 int offset = (n_ghost_faces + x4 -
d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
278 return &ghostGaugeField[
offset*(3*3*2)];
280 j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
284 default: j = -1;
printf(
"ERROR: wrong dir \n");
exit(1);
286 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
290 return &gaugeField[dir/2][j*(3*3*2)];
293 template <
typename Float>
294 static inline Float *spinorNeighbor_mg4dir(
int i,
int dir,
int oddBit, Float *spinorField, Float** fwd_nbr_spinor,
295 Float** back_nbr_spinor,
int neighbor_distance,
int nFace)
298 int nb = neighbor_distance;
300 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
301 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
302 int x2 = (Y/
Z[0]) %
Z[1];
312 int new_x1 = (x1 + nb)% X1;
314 int offset = ( x1 + nb -X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
317 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
322 int new_x1 = (x1 - nb + X1)% X1;
324 int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
327 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
332 int new_x2 = (x2 + nb)% X2;
334 int offset = ( x2 + nb -X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
337 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
342 int new_x2 = (x2 - nb + X2)% X2;
344 int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
347 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
352 int new_x3 = (x3 + nb)% X3;
354 int offset = ( x3 + nb -X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
357 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
362 int new_x3 = (x3 - nb + X3)% X3;
364 int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
367 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
373 int x4 = x4_mg(
i, oddBit);
383 int x4 = x4_mg(
i, oddBit);
390 default: j = -1;
printf(
"ERROR: wrong dir\n");
exit(1);
396 template<QudaDWFPCType type>
397 int neighborIndex_5d_mgpu(
int i,
int oddBit,
int dxs,
int dx4,
int dx3,
int dx2,
int dx1)
403 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
404 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
405 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
406 int x2 = (Y/
Z[0]) %
Z[1];
408 int ghost_x4 = x4+ dx4;
410 xs = (xs+dxs+
Ls) %
Ls;
411 x4 = (x4+dx4+
Z[3]) %
Z[3];
412 x3 = (x3+dx3+
Z[2]) %
Z[2];
413 x2 = (x2+dx2+
Z[1]) %
Z[1];
414 x1 = (x1+dx1+
Z[0]) %
Z[0];
417 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
419 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
425 template <QudaDWFPCType type>
426 int x4_5d_mgpu(
int i,
int oddBit)
429 return (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
433 template <QudaDWFPCType type,
typename Float>
434 Float *spinorNeighbor_5d_mgpu(
int i,
int dir,
int oddBit, Float *spinorField, Float** fwd_nbr_spinor, Float** back_nbr_spinor,
int neighbor_distance,
int nFace,
int spinorSize = 24)
437 int nb = neighbor_distance;
440 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
441 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
442 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
443 int x2 = (Y/
Z[0]) %
Z[1];
453 int new_x1 = (x1 + nb)% X1;
455 int offset = ((x1 + nb -X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
456 return fwd_nbr_spinor[0] +
offset*spinorSize;
458 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
463 int new_x1 = (x1 - nb + X1)% X1;
465 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
466 return back_nbr_spinor[0] +
offset*spinorSize;
468 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
473 int new_x2 = (x2 + nb)% X2;
475 int offset = (( x2 + nb -X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
476 return fwd_nbr_spinor[1] +
offset*spinorSize;
478 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
483 int new_x2 = (x2 - nb + X2)% X2;
485 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
486 return back_nbr_spinor[1] +
offset*spinorSize;
488 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
493 int new_x3 = (x3 + nb)% X3;
495 int offset = (( x3 + nb -X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
496 return fwd_nbr_spinor[2] +
offset*spinorSize;
498 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
503 int new_x3 = (x3 - nb + X3)% X3;
505 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
506 return back_nbr_spinor[2] +
offset*spinorSize;
508 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
513 int x4 = x4_5d_mgpu<type>(
i, oddBit);
515 int offset = ((x4 + nb -
Z[3])*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
516 return fwd_nbr_spinor[3] +
offset*spinorSize;
518 j = neighborIndex_5d_mgpu<type>(
i, oddBit, 0, +nb, 0, 0, 0);
523 int x4 = x4_5d_mgpu<type>(
i, oddBit);
525 int offset = (( x4 - nb +nFace)*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
526 return back_nbr_spinor[3] +
offset*spinorSize;
528 j = neighborIndex_5d_mgpu<type>(
i, oddBit, 0, -nb, 0, 0, 0);
531 default: j = -1;
printf(
"ERROR: wrong dir\n");
exit(1);
534 return &spinorField[j*(spinorSize)];
540 #endif // _DSLASH_UTIL_H __device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
static void sum(Float *dst, Float *a, Float *b, int cnt)
Float * spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance=1, int siteSize=24)
static double norm2(Float *v, int len)
static void sub(Float *dst, Float *a, Float *b, int cnt)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
static void axmy(Float *x, Float a, Float *y, int len)
void exit(int) __attribute__((noreturn))
static void axpby(Float a, Float *x, Float b, Float *y, int len)
static Float * spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)
int fullLatticeIndex(int dim[4], int index, int oddBit)
int printf(const char *,...) __attribute__((__format__(__printf__
static void ax(Float *dst, Float a, Float *x, int cnt)
int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
static Float * gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance)
int fullLatticeIndex_5d(int i, int oddBit)
static void su3Mul(sFloat *res, gFloat *mat, sFloat *vec)
static void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
static void axpy(Float a, Float *x, Float *y, int len)
int comm_dim_partitioned(int dim)
static void negx(Float *x, int len)
static void su3Transpose(Float *res, Float *mat)
static void dot(sFloat *res, gFloat *a, sFloat *b)