5 #define SHORT_LENGTH 65536
6 #define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
7 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
9 template <
typename Float>
14 template <
typename Float>
26 template <
typename Float>
28 double2 *r = res + dir*4*
V;
29 r[0].x = atan2(g[1], g[0]);
30 r[0].y = atan2(g[13], g[12]);
31 for (
int j=1; j<4; j++) {
37 template <
typename Float>
39 float4 *r = res + dir*2*
V;
40 r[0].x = atan2(g[1], g[0]);
41 r[0].y = atan2(g[13], g[12]);
50 template <
typename Float>
52 float2 *r = res + dir*4*
V;
53 r[0].x = atan2(g[1], g[0]);
54 r[0].y = atan2(g[13], g[12]);
55 for (
int j=1; j<4; j++) {
61 template <
typename Float>
63 short4 *r = res + dir*2*
V;
74 template <
typename Float>
76 short2 *r = res + dir*4*
V;
79 for (
int j=1; j<4; j++) {
85 template <
typename Float>
87 double2 *r = res + dir*6*
V;
88 for (
int j=0; j<6; j++) {
94 template <
typename Float>
96 float4 *r = res + dir*3*
V;
97 for (
int j=0; j<3; j++) {
105 template <
typename Float>
107 float2 *r = res + dir*6*
V;
108 for (
int j=0; j<6; j++) {
114 template <
typename Float>
116 short4 *r = res + dir*3*
V;
117 for (
int j=0; j<3; j++) {
125 template <
typename Float>
127 short2 *r = res + dir*6*
V;
128 for (
int j=0; j<6; j++) {
134 template <
typename Float>
136 double2 *r = res + dir*9*
V;
137 for (
int j=0; j<9; j++) {
143 template <
typename Float>
145 float4 *r = res + dir*5*
V;
146 for (
int j=0; j<4; j++) {
158 template <
typename Float>
160 float2 *r = res + dir*9*
V;
161 for (
int j=0; j<9; j++) {
167 template <
typename Float>
169 short4 *r = res + dir*5*
V;
170 for (
int j=0; j<4; j++) {
182 template <
typename Float>
185 short2 *r = res + dir*9*
V;
186 for (
int j=0; j<9; j++) {
192 template<
typename Float>
195 short2 *dg = d_gauge + dir*9*
V;
196 for (
int j=0; j<9; j++) {
205 template <
typename Float>
207 a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
208 a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
212 template <
typename Float>
214 a[0] = b[0]*c[0] + b[1]*c[1];
215 a[1] = b[0]*c[1] - b[1]*c[0];
219 template <
typename Float>
221 a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
222 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
226 template <
typename Float>
228 a[0] = b[0]*c[0] - b[1]*c[1];
229 a[1] = -b[0]*c[1] - b[1]*c[0];
234 template <
typename Float>
238 row_sum += mat[2]*mat[2];
239 row_sum += mat[3]*mat[3];
240 row_sum += mat[4]*mat[4];
241 row_sum += mat[5]*mat[5];
242 Float u0 = (dir < 3 ? anisotropy_ : (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
243 Float diff = 1.f/(u0*u0) - row_sum;
244 Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);
249 mat[0] = U00_mag * cos(mat[14]);
250 mat[1] = U00_mag * sin(mat[14]);
252 Float column_sum = 0.0;
253 for (
int i=0; i<2; i++) column_sum += mat[i]*mat[i];
254 for (
int i=6; i<8; i++) column_sum += mat[i]*mat[i];
255 diff = 1.f/(u0*u0) - column_sum;
256 Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);
258 mat[12] = U20_mag * cos(mat[15]);
259 mat[13] = U20_mag * sin(mat[15]);
264 Float r_inv2 = 1.0/(u0*row_sum);
294 template <
typename Float>
296 double2 *dg = d_gauge + dir*4*
V;
297 for (
int j=0; j<4; j++) {
298 h_gauge[2*j+0] = dg[j*
V].x;
299 h_gauge[2*j+1] = dg[j*
V].y;
304 template <
typename Float>
306 float4 *dg = d_gauge + dir*2*
V;
307 h_gauge[0] = dg[0].x;
308 h_gauge[1] = dg[0].y;
309 h_gauge[2] = dg[0].z;
310 h_gauge[3] = dg[0].w;
311 h_gauge[4] = dg[
V].x;
312 h_gauge[5] = dg[
V].y;
313 h_gauge[6] = dg[
V].z;
314 h_gauge[7] = dg[
V].w;
318 template <
typename Float>
320 float2 *dg = d_gauge + dir*4*
V;
321 for (
int j=0; j<4; j++) {
322 h_gauge[2*j+0] = dg[j*
V].x;
323 h_gauge[2*j+1] = dg[j*
V].y;
328 template <
typename Float>
330 short4 *dg = d_gauge + dir*2*
V;
344 template <
typename Float>
346 short2 *dg = d_gauge + dir*4*
V;
347 for (
int j=0; j<4; j++) {
357 template <
typename Float>
359 Float *u = &mat[0*(3*2)];
360 Float *v = &mat[1*(3*2)];
361 Float *w = &mat[2*(3*2)];
362 w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
369 Float u0 = (dir < 3 ? anisotropy_ :
370 (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
371 w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
374 template <
typename Float>
376 double2 *dg = d_gauge + dir*6*
V;
377 for (
int j=0; j<6; j++) {
378 h_gauge[j*2+0] = dg[j*
V].x;
379 h_gauge[j*2+1] = dg[j*
V].y;
384 template <
typename Float>
386 float4 *dg = d_gauge + dir*3*
V;
387 for (
int j=0; j<3; j++) {
388 h_gauge[j*4+0] = dg[j*
V].x;
389 h_gauge[j*4+1] = dg[j*
V].y;
390 h_gauge[j*4+2] = dg[j*
V].z;
391 h_gauge[j*4+3] = dg[j*
V].w;
396 template <
typename Float>
398 float2 *dg = d_gauge + dir*6*
V;
399 for (
int j=0; j<6; j++) {
400 h_gauge[j*2+0] = dg[j*
V].x;
401 h_gauge[j*2+1] = dg[j*
V].y;
406 template <
typename Float>
408 short4 *dg = d_gauge + dir*3*
V;
409 for (
int j=0; j<3; j++) {
418 template <
typename Float>
420 short2 *dg = d_gauge + dir*6*
V;
421 for (
int j=0; j<6; j++) {
428 template <
typename Float>
430 double2 *dg = d_gauge + dir*9*
V;
431 for (
int j=0; j<9; j++) {
432 h_gauge[j*2+0] = dg[j*
V].x;
433 h_gauge[j*2+1] = dg[j*
V].y;
437 template <
typename Float>
439 float4 *dg = d_gauge + dir*5*
V;
440 for (
int j=0; j<4; j++) {
441 h_gauge[j*4+0] = dg[j*
V].x;
442 h_gauge[j*4+1] = dg[j*
V].y;
443 h_gauge[j*4+2] = dg[j*
V].z;
444 h_gauge[j*4+3] = dg[j*
V].w;
446 h_gauge[16] = dg[4*
V].x;
447 h_gauge[17] = dg[4*
V].y;
450 template <
typename Float>
452 float2 *dg = d_gauge + dir*9*
V;
453 for (
int j=0; j<9; j++) {
454 h_gauge[j*2+0] = dg[j*
V].x;
455 h_gauge[j*2+1] = dg[j*
V].y;
459 template <
typename Float>
461 short4 *dg = d_gauge + dir*5*
V;
462 for (
int j=0; j<4; j++) {
473 template <
typename Float>
475 short2 *dg = d_gauge + dir*9*
V;
476 for (
int j=0; j<9; j++) {
490 template <
typename Float,
typename FloatN>
491 static void packQDPGaugeField(
FloatN *d_gauge,
Float **h_gauge,
int oddBit,
496 int nMat = nFace*voxels[
dir];
497 Float *g = h_gauge[
dir] + oddBit*nMat*18;
498 for (
int i = 0; i < nMat; i++)
pack12(d_gauge+offset+i, g+i*18,
dir, Vh+pad);
502 int nMat = nFace*voxels[
dir];
503 Float *g = h_gauge[
dir] + oddBit*nMat*18;
504 for (
int i = 0; i < nMat; i++)
pack8(d_gauge+offset+i, g+i*18,
dir, Vh+pad);
508 int nMat = nFace*voxels[
dir];
509 Float *g = h_gauge[
dir] + oddBit*nMat*18;
512 for (
int i = 0; i < nMat; i++)
515 for (
int i = 0; i < nMat; i++)
pack18(d_gauge+offset+i, g+i*18,
dir, Vh+pad);
523 template <
typename Float,
typename FloatN>
524 static void unpackQDPGaugeField(
Float **h_gauge,
FloatN *d_gauge,
int oddBit,
528 Float *g = h_gauge[
dir] + oddBit*V*18;
529 for (
int i = 0; i <
V; i++)
unpack12(g+i*18, d_gauge+i,
dir, V+pad, i);
533 Float *g = h_gauge[
dir] + oddBit*V*18;
534 for (
int i = 0; i <
V; i++)
unpack8(g+i*18, d_gauge+i,
dir, V+pad, i);
538 Float *g = h_gauge[
dir] + oddBit*V*18;
539 for (
int i = 0; i <
V; i++)
unpack18(g+i*18, d_gauge+i,
dir, V+pad);
545 template <
typename Float,
typename Float2>
546 static void transposeScale(
Float *gT,
Float *g,
const Float2 &a) {
547 for (
int ic=0; ic<3; ic++)
for (
int jc=0; jc<3; jc++)
for (
int r=0; r<2; r++)
548 gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r];
553 template <
typename Float,
typename FloatN>
554 static void packCPSGaugeField(
FloatN *d_gauge,
Float *h_gauge,
int oddBit,
559 Float *g = h_gauge + (oddBit*V*4+
dir)*18;
560 for (
int i = 0; i <
V; i++) {
561 transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
567 Float *g = h_gauge + (oddBit*V*4+
dir)*18;
568 for (
int i = 0; i <
V; i++) {
569 transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
575 Float *g = h_gauge + (oddBit*V*4+
dir)*18;
576 for (
int i = 0; i <
V; i++) {
577 transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
587 template <
typename Float,
typename FloatN>
588 static void unpackCPSGaugeField(
Float *h_gauge,
FloatN *d_gauge,
int oddBit,
593 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
594 for (
int i = 0; i <
V; i++) {
596 transposeScale(hg+4*i*18, gT, anisotropy_);
601 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
602 for (
int i = 0; i <
V; i++) {
604 transposeScale(hg+4*i*18, gT, anisotropy_);
609 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
610 for (
int i = 0; i <
V; i++) {
612 transposeScale(hg+4*i*18, gT, anisotropy_);
622 template <
typename Float,
typename FloatN>
628 for (dir = 0; dir < 4; dir++) {
630 for (i = 0; i <
Vh; i++) {
631 pack12(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
635 for (dir = 0; dir < 4; dir++) {
637 for (i = 0; i <
Vh; i++) {
638 pack8(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
642 for (dir = 0; dir < 4; dir++) {
644 for (i = 0; i <
Vh; i++) {
645 pack18(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
653 template <
typename Float,
typename FloatN>
654 static void unpackMILCGaugeField(
Float *h_gauge,
FloatN *d_gauge,
int oddBit,
658 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
659 for (
int i = 0; i <
V; i++) {
665 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
666 for (
int i = 0; i <
V; i++) {
672 Float *hg = h_gauge + (oddBit*V*4+
dir)*18;
673 for (
int i = 0; i <
V; i++) {
683 template <
typename Float,
typename FloatN>
689 int mu_offset = X_[0]/2 + 2;
690 for (
int i=1; i<4; i++) mu_offset *= (X_[i] + 2);
695 for (dir = 0; dir < 4; dir++) {
697 for (i = 0; i <
Vh; i++) {
698 transposeScale(gT, g+i*18, 1.0);
699 pack12(res+i, gT, dir, Vh+pad);
703 for (dir = 0; dir < 4; dir++) {
706 for (i = 0; i <
Vh; i++) {
707 transposeScale(gT, g+i*18, 1.0);
708 pack8(res+i, gT, dir, Vh+pad);
713 for (dir = 0; dir < 4; dir++) {
716 for (i = 0; i <
Vh; i++) {
717 transposeScale(gT, g+i*18, 1.0);
718 pack18(res+i, gT, dir, Vh+pad);
727 template <
typename Float,
typename FloatN>
728 static void unpackBQCDGaugeField(
Float *h_gauge,
FloatN *d_gauge,
int oddBit,
731 int mu_offset = X_[0]/2 + 2;
732 for (
int i=1; i<4; i++) mu_offset *= (X_[i] + 2);
738 for (
int i = 0; i <
V; i++) {
740 transposeScale(hg+i*18, gT, 1.0);
746 for (
int i = 0; i <
V; i++) {
748 transposeScale(hg+i*18, gT, 1.0);
754 for (
int i = 0; i <
V; i++) {
756 transposeScale(hg+i*18, gT, 1.0);
768 template <
typename Float,
typename Float2>
772 Float2 *r = res + dir*5*
stride;
773 for (
int j=0; j<5; j++) {
779 template <
typename Float,
typename Float2>
783 Float *g = mom + (oddBit*Vh*4 +
dir)*10;
784 for (
int i = 0; i <
Vh; i++) {
790 template <
typename Float,
typename Float2>
794 Float2 *r = res + dir*5*
stride;
795 for (
int j=0; j<5; j++) {
801 template <
typename Float,
typename Float2>
805 Float *m = mom + oddBit*Vh*10*4;
807 for (i = 0; i <
Vh; i++) {
808 for (dir = 0; dir < 4; dir++) {
810 unpack10(thismom, res+i, dir, Vh, pad);