7 template <
typename Float>
8 static inline void sum(Float *dst, Float *a, Float *b,
int cnt) {
9 for (
int i = 0; i < cnt; i++)
13 template <
typename Float>
14 static inline void sub(Float *dst, Float *a, Float *b,
int cnt) {
15 for (
int i = 0; i < cnt; i++)
19 template <
typename Float>
20 static inline void ax(Float *dst, Float a, Float *x,
int cnt) {
21 for (
int i = 0; i < cnt; i++)
26 template <
typename Float>
27 static inline void axpy(Float a, Float *x, Float *y,
int len) {
28 for (
int i=0; i<len; i++) y[i] = a*x[i] + y[i];
32 template <
typename Float>
33 static inline void axpby(Float a, Float *x, Float b, Float *y,
int len) {
34 for (
int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
38 template <
typename Float>
39 static inline void axmy(Float *x, Float a, Float *y,
int len) {
40 for (
int i=0; i<len; i++) y[i] = a*x[i] - y[i];
43 template <
typename Float>
44 static double norm2(Float *v,
int len) {
46 for (
int i=0; i<len; i++) sum += v[i]*v[i];
50 template <
typename Float>
51 static inline void negx(Float *x,
int len) {
52 for (
int i=0; i<len; i++) x[i] = -x[i];
55 template <
typename sFloat,
typename gFloat>
56 static inline void dot(sFloat* res, gFloat* a, sFloat* b) {
58 for (
int m = 0; m < 3; m++) {
59 sFloat a_re = a[2*m+0];
60 sFloat a_im = a[2*m+1];
61 sFloat b_re = b[2*m+0];
62 sFloat b_im = b[2*m+1];
63 res[0] += a_re * b_re - a_im * b_im;
64 res[1] += a_re * b_im + a_im * b_re;
68 template <
typename Float>
70 for (
int m = 0; m < 3; m++) {
71 for (
int n = 0; n < 3; n++) {
72 res[m*(3*2) + n*(2) + 0] = + mat[n*(3*2) + m*(2) + 0];
73 res[m*(3*2) + n*(2) + 1] = - mat[n*(3*2) + m*(2) + 1];
79 template <
typename sFloat,
typename gFloat>
80 static inline void su3Mul(sFloat *res, gFloat *
mat, sFloat *vec) {
81 for (
int n = 0; n < 3; n++)
dot(&res[n*(2)], &mat[n*(3*2)], vec);
84 template <
typename sFloat,
typename gFloat>
85 static inline void su3Tmul(sFloat *res, gFloat *
mat, sFloat *vec) {
103 template <
typename Float>
104 static inline Float *
gaugeLink(
int i,
int dir,
int oddBit, Float **gaugeEven, Float **gaugeOdd,
int nbr_distance) {
107 int d = nbr_distance;
110 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
118 default: j = -1;
break;
120 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
123 return &gaugeField[dir/2][j*(3*3*2)];
126 template <
typename Float>
127 static inline Float *
spinorNeighbor(
int i,
int dir,
int oddBit, Float *spinorField,
int neighbor_distance)
130 int nb = neighbor_distance;
140 default: j = -1;
break;
157 template <QudaPCType type>
int neighborIndex_5d(
int i,
int oddBit,
int dxs,
int dx4,
int dx3,
int dx2,
int dx1)
163 int xs = X/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
164 int x4 = (X/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
165 int x3 = (X/(
Z[1]*
Z[0])) %
Z[2];
166 int x2 = (X/
Z[0]) %
Z[1];
171 xs = (xs+dxs+
Ls) %
Ls;
173 x4 = (x4+dx4+
Z[3]) %
Z[3];
174 x3 = (x3+dx3+
Z[2]) %
Z[2];
175 x2 = (x2+dx2+
Z[1]) %
Z[1];
176 x1 = (x1+dx1+
Z[0]) %
Z[0];
179 return (xs*(
Z[3]*
Z[2]*
Z[1]*
Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
182 template <QudaPCType type,
typename Float>
183 Float *
spinorNeighbor_5d(
int i,
int dir,
int oddBit, Float *spinorField,
int neighbor_distance = 1,
int siteSize = 24)
185 int nb = neighbor_distance;
188 case 0: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, 0, +nb);
break;
189 case 1: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, 0, -nb);
break;
190 case 2: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, +nb, 0);
break;
191 case 3: j = neighborIndex_5d<type>(i, oddBit, 0, 0, 0, -nb, 0);
break;
192 case 4: j = neighborIndex_5d<type>(i, oddBit, 0, 0, +nb, 0, 0);
break;
193 case 5: j = neighborIndex_5d<type>(i, oddBit, 0, 0, -nb, 0, 0);
break;
194 case 6: j = neighborIndex_5d<type>(i, oddBit, 0, +nb, 0, 0, 0);
break;
195 case 7: j = neighborIndex_5d<type>(i, oddBit, 0, -nb, 0, 0, 0);
break;
196 case 8: j = neighborIndex_5d<type>(i, oddBit, +nb, 0, 0, 0, 0);
break;
197 case 9: j = neighborIndex_5d<type>(i, oddBit, -nb, 0, 0, 0, 0);
break;
198 default: j = -1;
break;
200 return &spinorField[j*siteSize];
205 inline int x4_mg(
int i,
int oddBit)
208 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
212 template <
typename Float>
213 static inline Float *gaugeLink_mg4dir(
int i,
int dir,
int oddBit, Float **gaugeEven, Float **gaugeOdd,
214 Float** ghostGaugeEven, Float** ghostGaugeOdd,
int n_ghost_faces,
int nbr_distance) {
217 int d = nbr_distance;
220 gaugeField = (oddBit ? gaugeOdd : gaugeEven);
225 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
226 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
227 int x2 = (Y/
Z[0]) %
Z[1];
233 Float* ghostGaugeField;
238 int new_x1 = (x1 - d +
X1 )% X1;
240 ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
241 int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
242 return &ghostGaugeField[offset*(3*3*2)];
244 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
249 int new_x2 = (x2 - d +
X2 )% X2;
251 ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
252 int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
253 return &ghostGaugeField[offset*(3*3*2)];
255 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
261 int new_x3 = (x3 - d +
X3 )% X3;
263 ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
264 int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
265 return &ghostGaugeField[offset*(3*3*2)];
267 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
272 int new_x4 = (x4 - d +
X4)% X4;
274 ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
275 int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
276 return &ghostGaugeField[offset*(3*3*2)];
278 j = (new_x4*(X3*X2*
X1) + x3*(X2*X1) + x2*(
X1) + x1) / 2;
282 default: j = -1; printf(
"ERROR: wrong dir \n"); exit(1);
284 gaugeField = (oddBit ? gaugeEven : gaugeOdd);
288 return &gaugeField[dir/2][j*(3*3*2)];
291 template <
typename Float>
292 static inline Float *spinorNeighbor_mg4dir(
int i,
int dir,
int oddBit, Float *spinorField, Float** fwd_nbr_spinor,
293 Float** back_nbr_spinor,
int neighbor_distance,
int nFace)
296 int nb = neighbor_distance;
298 int x4 = Y/(
Z[2]*
Z[1]*
Z[0]);
299 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
300 int x2 = (Y/
Z[0]) %
Z[1];
310 int new_x1 = (x1 + nb)% X1;
312 int offset = ( x1 + nb -
X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
315 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
320 int new_x1 = (x1 - nb +
X1)% X1;
322 int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
325 j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
330 int new_x2 = (x2 + nb)% X2;
332 int offset = ( x2 + nb -
X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
335 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
340 int new_x2 = (x2 - nb +
X2)% X2;
342 int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
345 j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
350 int new_x3 = (x3 + nb)% X3;
352 int offset = ( x3 + nb -
X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
355 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
360 int new_x3 = (x3 - nb +
X3)% X3;
362 int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
365 j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
371 int x4 = x4_mg(i, oddBit);
373 int offset = (x4+nb -
Z[3])*
Vsh_t;
381 int x4 = x4_mg(i, oddBit);
383 int offset = ( x4 - nb +nFace)*
Vsh_t;
388 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
394 template <QudaPCType type>
int neighborIndex_5d_mgpu(
int i,
int oddBit,
int dxs,
int dx4,
int dx3,
int dx2,
int dx1)
400 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
401 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
402 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
403 int x2 = (Y/
Z[0]) %
Z[1];
405 int ghost_x4 = x4+ dx4;
407 xs = (xs+dxs+
Ls) %
Ls;
408 x4 = (x4+dx4+
Z[3]) %
Z[3];
409 x3 = (x3+dx3+
Z[2]) %
Z[2];
410 x2 = (x2+dx2+
Z[1]) %
Z[1];
411 x1 = (x1+dx1+
Z[0]) %
Z[0];
414 ret = (xs*
Z[3]*
Z[2]*
Z[1]*
Z[0] + x4*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
416 ret = (xs*
Z[2]*
Z[1]*
Z[0] + x3*
Z[1]*
Z[0] + x2*
Z[0] + x1) >> 1;
422 template <QudaPCType type>
int x4_5d_mgpu(
int i,
int oddBit)
425 return (Y/(
Z[2]*
Z[1]*
Z[0])) % Z[3];
428 template <QudaPCType type,
typename Float>
429 Float *spinorNeighbor_5d_mgpu(
int i,
int dir,
int oddBit, Float *spinorField, Float **fwd_nbr_spinor,
430 Float **back_nbr_spinor,
int neighbor_distance,
int nFace,
int spinorSize = 24)
433 int nb = neighbor_distance;
436 int xs = Y/(
Z[3]*
Z[2]*
Z[1]*
Z[0]);
437 int x4 = (Y/(
Z[2]*
Z[1]*
Z[0])) %
Z[3];
438 int x3 = (Y/(
Z[1]*
Z[0])) %
Z[2];
439 int x2 = (Y/
Z[0]) %
Z[1];
449 int new_x1 = (x1 + nb)% X1;
451 int offset = ((x1 + nb -
X1)*
Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
452 return fwd_nbr_spinor[0] + offset*spinorSize;
454 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
459 int new_x1 = (x1 - nb +
X1)% X1;
461 int offset = (( x1+nFace- nb)*
Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
462 return back_nbr_spinor[0] + offset*spinorSize;
464 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
469 int new_x2 = (x2 + nb)% X2;
471 int offset = (( x2 + nb -
X2)*
Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
472 return fwd_nbr_spinor[1] + offset*spinorSize;
474 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
479 int new_x2 = (x2 - nb +
X2)% X2;
481 int offset = (( x2 + nFace -nb)*
Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
482 return back_nbr_spinor[1] + offset*spinorSize;
484 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
489 int new_x3 = (x3 + nb)% X3;
491 int offset = (( x3 + nb -
X3)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
492 return fwd_nbr_spinor[2] + offset*spinorSize;
494 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
499 int new_x3 = (x3 - nb +
X3)% X3;
501 int offset = (( x3 + nFace -nb)*
Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
502 return back_nbr_spinor[2] + offset*spinorSize;
504 j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
509 int x4 = x4_5d_mgpu<type>(i, oddBit);
511 int offset = ((x4 + nb -
Z[3])*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
512 return fwd_nbr_spinor[3] + offset*spinorSize;
514 j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, +nb, 0, 0, 0);
519 int x4 = x4_5d_mgpu<type>(i, oddBit);
521 int offset = (( x4 - nb +nFace)*
Ls*X3*X2*X1+xs*X3*X2*X1+x3*X2*X1+x2*X1+x1) >> 1;
522 return back_nbr_spinor[3] + offset*spinorSize;
524 j = neighborIndex_5d_mgpu<type>(i, oddBit, 0, -nb, 0, 0, 0);
527 default: j = -1; printf(
"ERROR: wrong dir\n"); exit(1);
530 return &spinorField[j*(spinorSize)];
536 #endif // _DSLASH_UTIL_H __device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
static void sum(Float *dst, Float *a, Float *b, int cnt)
Float * spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance=1, int siteSize=24)
static double norm2(Float *v, int len)
static void sub(Float *dst, Float *a, Float *b, int cnt)
int fullLatticeIndex_5d_4dpc(int i, int oddBit)
static void axmy(Float *x, Float a, Float *y, int len)
static void axpby(Float a, Float *x, Float b, Float *y, int len)
static Float * spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)
int fullLatticeIndex(int dim[4], int index, int oddBit)
static void ax(Float *dst, Float a, Float *x, int cnt)
int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
static Float * gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance)
int fullLatticeIndex_5d(int i, int oddBit)
static void su3Mul(sFloat *res, gFloat *mat, sFloat *vec)
static void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static void axpy(Float a, Float *x, Float *y, int len)
int comm_dim_partitioned(int dim)
static void negx(Float *x, int len)
static void su3Transpose(Float *res, Float *mat)
static void dot(sFloat *res, gFloat *a, sFloat *b)