13 #define RETURN_IF_ERR if(err) return;
25 static const int result = 1;
31 static const int result = -1;
35 template<
class T,
class U>
80 typedef std::complex<float>
Type;
86 typedef std::complex<float>
Type;
92 typedef std::complex<float>
Type;
98 typedef std::complex<double>
Type;
104 typedef std::complex<double>
Type;
110 typedef std::complex<double>
Type;
116 typedef std::complex<double>
Type;
122 typedef std::complex<double>
Type;
128 typedef std::complex<double>
Type;
131 template<
int N,
class T>
146 template<
int N,
class T>
149 for(
int i=0; i<N; ++i){
150 for(
int j=0; j<N; ++j){
151 data[i][j] =
static_cast<T
>(0);
156 template<
int N,
class T>
159 for(
int i=0; i<N; ++i){
160 for(
int j=0; j<N; ++j){
161 data[i][j] = mat.
data[i][j];
166 template<
int N,
class T>
172 template<
int N,
class T>
178 template<
int N,
class T>
181 for(
int i=0; i<N; ++i){
182 for(
int j=0; j<N; ++j){
183 data[i][j] += mat.
data[i][j];
189 template<
int N,
class T>
192 for(
int i=0; i<N; ++i){
193 for(
int j=0; j<N; ++j){
194 data[i][j] -= mat.
data[i][j];
200 template<
int N,
class T>
208 template<
int N,
class T>
216 template<
int N,
class T>
220 for(
int i=0; i<N; ++i){
221 for(
int j=0; j<N; ++j){
222 result(i,j) =
static_cast<T
>(0);
223 for(
int k=0; k<N; ++k){
224 result(i,j) += a(i,k)*b(k,j);
231 template<
int N,
class T>
235 for(
int i=0; i<N; ++i){
236 for(
int j=0; j<N; ++j){
243 template<
int N,
class T>
247 for(
int i=0; i<N; ++i){
248 for(
int j=0; j<N; ++j){
249 result(i,j) =
mat(j,i);
255 template<
int N,
class T,
class U>
261 for(
int i=0; i<N; ++i){
262 for(
int j=0; j<N; ++j){
263 result(i,j) = scalar*
mat(i,j);
269 template<
int N,
class T,
class U>
275 template<
int N,
class T>
280 for(
int i=0; i<N; ++i){
281 id(i,i) =
static_cast<T
>(1);
287 template<
int N,
class T>
297 template<
int N,
class T>
298 std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)
300 for(
int i=0; i<N; ++i){
301 for(
int j=0; j<N; ++j){
304 if(i<N-1) os << std::endl;
315 const int half_volume;
321 void loadMatrixFromField(
const Real*
const field,
int oddBit,
int dir,
int half_lattice_index,
Matrix<3, std::complex<Real> >*
const mat)
const;
330 Real getData(
const Real*
const field,
int idx,
int dir,
int oddBit,
int offset,
int hfv)
const;
331 void addData(Real*
const field,
int idx,
int dir,
int oddBit,
int offset, Real,
int hfv)
const;
332 int half_idx_conversion_ex2normal(
int half_lattice_index,
const int* dim,
int oddBit)
const ;
333 int half_idx_conversion_normal2ex(
int half_lattice_index,
const int* dim,
int oddBit)
const ;
351 int sid = half_lattice_index_ex;
359 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
362 int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
381 int sid = half_lattice_index;
389 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
392 int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;
401 return field[(4*hfv*oddBit +4*idx +
dir)*18+offset];
403 return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
410 field[(4*hfv*oddBit +4*idx +
dir)*18+offset] += v;
412 ((Real**)field)[
dir][(hfv*oddBit+
idx)*18 +offset] += v;
421 int half_lattice_index,
422 Matrix<3, std::complex<Real> >*
const mat
432 for(
int i=0; i<3; ++i){
433 for(
int j=0; j<3; ++j){
434 (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
435 (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
445 int half_lattice_index,
446 Matrix<3, std::complex<Real> >*
const mat
457 for(
int i=0; i<3; ++i){
458 for(
int j=0; j<3; ++j){
459 (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
460 (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
469 int half_lattice_index,
470 Real*
const field)
const
479 for(
int i=0; i<3; ++i){
480 for(
int j=0; j<3; ++j){
481 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).real();
482 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).imag();
491 int half_lattice_index,
493 Real*
const field)
const
500 Real*
const local_field = field + (oddBit*hfv + half_lattice_index)*18;
504 for(
int i=0; i<3; ++i){
505 for(
int j=0; j<3; ++j){
506 local_field[offset++] += coeff*
mat(i,j).real();
507 local_field[offset++] += coeff*
mat(i,j).imag();
518 int half_lattice_index,
520 Real*
const field)
const
531 for(
int i=0; i<3; ++i){
532 for(
int j=0; j<3; ++j){
534 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).real(), hfv);
537 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).imag(), hfv);
548 int half_lattice_index,
550 Real*
const field)
const
552 Real*
const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
553 mom_field[0] = (
mat(0,1).real() -
mat(1,0).real())*0.5*coeff;
554 mom_field[1] = (
mat(0,1).imag() +
mat(1,0).imag())*0.5*coeff;
556 mom_field[2] = (
mat(0,2).real() -
mat(2,0).real())*0.5*coeff;
557 mom_field[3] = (
mat(0,2).imag() +
mat(2,0).imag())*0.5*coeff;
559 mom_field[4] = (
mat(1,2).real() -
mat(2,1).real())*0.5*coeff;
560 mom_field[5] = (
mat(1,2).imag() +
mat(2,1).imag())*0.5*coeff;
562 const Real temp = (
mat(0,0).imag() +
mat(1,1).imag() +
mat(2,2).imag())*0.3333333333333333333;
563 mom_field[6] = (
mat(0,0).imag() - temp)*coeff;
564 mom_field[7] = (
mat(1,1).imag() - temp)*coeff;
565 mom_field[8] = (
mat(2,2).imag() - temp)*coeff;
581 void getCoordsFromHalfIndex(
int half_index,
int coord[4]);
582 void getCoordsFromFullIndex(
int full_index,
int coord[4]);
583 void cache(
int half_lattice_index);
589 int getFullFromHalfIndex(
int half_lattice_index);
590 int getNeighborFromFullIndex(
int full_lattice_index,
int dir,
int* err=NULL);
596 for(
int dir=0; dir<4; ++
dir){
597 local_dim[
dir] = dim[
dir];
598 volume *= local_dim[
dir];
608 int E1 = local_dim[0]+4;
609 int E2 = local_dim[1]+4;
610 int E3 = local_dim[2]+4;
614 int z1 = half_lattice_index/
E1h;
615 int x1h = half_lattice_index - z1*
E1h;
617 coord[1] = z1 - z2*
E2;
619 coord[2] = z2 - coord[3]*
E3;
620 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
621 coord[0] = 2*x1h +
x1odd;
623 int half_dim_0 = local_dim[0]/2;
624 int z1 = half_lattice_index/half_dim_0;
625 int x1h = half_lattice_index - z1*half_dim_0;
626 int z2 = z1/local_dim[1];
627 coord[1] = z1 - z2*local_dim[1];
628 coord[3] = z2/local_dim[2];
629 coord[2] = z2 - coord[3]*local_dim[2];
630 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
631 coord[0] = 2*x1h +
x1odd;
640 int D1=local_dim[0]+4;
641 int D2=local_dim[1]+4;
642 int D3=local_dim[2]+4;
653 int z1 = full_lattice_index/
D1;
654 coord[0] = full_lattice_index - z1*
D1;
656 coord[1] = z1 - z2*
D2;
658 coord[2] = z2 - coord[3]*
D3;
668 half_index = half_lattice_index;
669 getCoordsFromHalfIndex(half_lattice_index, full_coord);
670 int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;
671 full_index = 2*half_lattice_index +
x1odd;
678 if(half_index != half_lattice_index) cache(half_lattice_index);
690 getCoordsFromFullIndex(full_lattice_index, coord);
692 int E1 = local_dim[0] + 4;
693 int E2 = local_dim[1] + 4;
694 int E3 = local_dim[2] + 4;
695 int E4 = local_dim[3] + 4;
698 neighbor_index = full_lattice_index + 1;
699 if(err && (coord[0] == E1-1) ) *err = 1;
702 neighbor_index = full_lattice_index +
E1;
703 if(err && (coord[1] == E2-1) ) *err = 1;
706 neighbor_index = full_lattice_index + E2*
E1;
707 if(err && (coord[2] == E3-1) ) *err = 1;
710 neighbor_index = full_lattice_index + E3*E2*
E1;
711 if(err && (coord[3] == E4-1) ) *err = 1;
714 neighbor_index = full_lattice_index - 1;
715 if(err && (coord[0] == 0) ) *err = 1;
718 neighbor_index = full_lattice_index -
E1;
719 if(err && (coord[1] == 0) ) *err = 1;
722 neighbor_index = full_lattice_index - E2*
E1;
723 if(err && (coord[2] == 0) ) *err = 1;
726 neighbor_index = full_lattice_index - E3*E2*
E1;
727 if(err && (coord[3] == 0) ) *err = 1;
730 errorQuda(
"Neighbor index could not be determined\n");
738 neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
741 neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
744 neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
747 neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
750 neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
753 neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
756 neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
759 neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
762 errorQuda(
"Neighbor index could not be determined\n");
768 return neighbor_index;
778 template<
class Real,
int oddBit>
780 int half_lattice_index,
781 const Real*
const oprod,
791 int idx = half_lattice_index;
801 const Real*
const oprod,
806 for(
int dir=0; dir<4; ++
dir) volume *= dim[dir];
807 const int half_volume = volume/2;
809 for(
int site=0; site<half_volume; ++site){
810 computeOneLinkSite<Real,0>(dim, site,
817 for(
int site=0; site<half_volume; ++site){
818 computeOneLinkSite<Real,1>(dim, site,
835 template<
class Real,
int oddBit>
838 const Real*
const oprod,
839 const Real*
const Qprev,
840 const Real*
const link,
851 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
861 point_d = new_mem_idx >> 1;
864 point_c = new_mem_idx >> 1;
867 point_b = new_mem_idx >> 1;
869 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
870 bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
871 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
897 colorMatY =
conj(colorMatY);
903 colorMatW = (!mu_positive) ? bc_link*colorMatY :
conj(bc_link)*colorMatY;
906 colorMatY = (sig_positive) ? ab_link*colorMatW :
conj(ab_link)*colorMatW;
913 ad_link =
conj(ad_link);
918 if(sig_positive) colorMatY = colorMatW*
ad_link;
921 if(Qmu || sig_positive){
926 if(sig_positive) colorMatY = colorMatW*colorMatX;
929 if(sig_positive) ls.
addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);
937 const Real*
const oprod,
938 const Real*
const Qprev,
939 const Real*
const link,
950 for(
int dir=0; dir<4; ++
dir) volume *= dim[dir];
952 const int loop_count =
Vh_ex;
954 const int loop_count = volume/2;
960 for(
int site=0; site<loop_count; ++site){
961 computeMiddleLinkSite<Real, 0>(site, dim,
968 for(
int site=0; site<loop_count; ++site){
969 computeMiddleLinkSite<Real,1>(site, dim,
981 template<
class Real,
int oddBit>
984 const Real*
const P3,
985 const Real*
const Qprod,
986 const Real*
const link,
996 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
1005 point_d = new_mem_idx >> 1;
1006 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1019 ad_link_nbr_idx = half_lattice_index;
1022 colorMatW = (mu_positive) ? ad_link*colorMatY :
conj(ad_link)*colorMatY;
1027 Real
mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1032 colorMatW = colorMatY*colorMatX;
1033 if(!oddBit){ mycoeff = -
mycoeff; }
1036 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1037 if(oddBit){ mycoeff = -
mycoeff; }
1044 if(!oddBit){ mycoeff = -
mycoeff; }
1047 if(oddBit){ mycoeff = -
mycoeff; }
1048 colorMatW =
conj(colorMatY);
1058 template<
class Real>
1060 const Real*
const P3,
1061 const Real*
const Qprod,
1062 const Real*
const link,
1066 Real*
const newOprod
1071 for(
int dir=0; dir<4; ++
dir) volume *= dim[dir];
1073 const int loop_count =
Vh_ex;
1075 const int loop_count = volume/2;
1079 for(
int site=0; site<loop_count; ++site){
1080 computeSideLinkSite<Real,0>(site, dim,
1084 ls, shortP, newOprod);
1087 for(
int site=0; site<loop_count; ++site){
1088 computeSideLinkSite<Real,1>(site, dim,
1092 ls, shortP, newOprod);
1102 template<
class Real,
int oddBit>
1105 const Real*
const oprod,
1106 const Real*
const Qprev,
1107 const Real*
const link,
1112 Real*
const newOprod)
1115 const bool mu_positive = (
GOES_FORWARDS(mu)) ?
true :
false;
1116 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
1129 point_d = new_mem_idx >> 1;
1132 point_c = new_mem_idx >> 1;
1135 point_b = new_mem_idx >> 1;
1136 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1140 Real
mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1148 colorMatZ =
conj(bc_link)*colorMatY;
1152 colorMatY = colorMatX*
ad_link;
1153 colorMatW = colorMatZ*colorMatY;
1162 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
1163 colorMatW = colorMatY*colorMatX;
1165 colorMatW = ad_link*colorMatY;
1174 if(sig_positive) colorMatW = colorMatX*
conj(ad_link);
1175 colorMatZ = bc_link*colorMatY;
1177 colorMatY = colorMatZ*colorMatW;
1187 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
1188 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1191 colorMatW =
conj(ad_link)*colorMatY;
1199 template<
class Real>
1201 const Real*
const oprod,
1202 const Real*
const Qprev,
1203 const Real*
const link,
1207 Real*
const newOprod)
1210 for(
int dir=0; dir<4; ++
dir) volume *= dim[dir];
1212 const int loop_count =
Vh_ex;
1214 const int loop_count = volume/2;
1218 for(
int site=0; site<loop_count; ++site){
1220 computeAllLinkSite<Real,0>(site, dim,
1228 for(
int site=0; site<loop_count; ++site){
1229 computeAllLinkSite<Real, 1>(site, dim,
1240 #define Pmu tempmat[0]
1241 #define P3 tempmat[1]
1242 #define P5 tempmat[2]
1243 #define Pnumu tempmat[3]
1244 #define Qmu tempmat[4]
1245 #define Qnumu tempmat[5]
1247 template<
class Real>
1259 template<
class Real>
1267 Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage,
coeff;
1269 OneLink = staple_coeff.
one;
1270 ThreeSt = staple_coeff.
three;
1271 FiveSt = staple_coeff.
five;
1272 SevenSt = staple_coeff.
seven;
1273 Lepage = staple_coeff.
lepage;
1285 for(
int mu=0;
mu<8; ++
mu){
1288 computeMiddleLinkField<Real>(dim,
1294 for(
int nu=0; nu<8; ++nu){
1296 || nu==sig || nu==
OPP_DIR(sig) )
continue;
1298 computeMiddleLinkField<Real>(dim,
1305 for(
int rho=0; rho<8; ++rho){
1306 if( rho == sig || rho ==
OPP_DIR(sig)
1307 || rho == mu || rho ==
OPP_DIR(mu)
1308 || rho == nu || rho ==
OPP_DIR(nu) )
1313 if(FiveSt != 0)coeff = SevenSt/FiveSt;
else coeff = 0;
1314 computeAllLinkField<Real>(dim,
1322 if(ThreeSt != 0)coeff = FiveSt/ThreeSt;
else coeff = 0;
1323 computeSideLinkField<Real>(dim,
1333 if(staple_coeff.
lepage != 0.){
1334 computeMiddleLinkField<Real>(dim,
1340 if(ThreeSt != 0)coeff = Lepage/ThreeSt;
else coeff = 0;
1341 computeSideLinkField<Real>(dim,
1348 computeSideLinkField<Real>(dim,
1350 sig,
mu, ThreeSt, 0.,
1374 for(
int dir=0; dir<4; ++
dir) volume *= param.
X[dir];
1384 for(
int i=0; i<6; ++i) tempmat[i] = malloc(len*18*
sizeof(
double));
1386 for(
int i=0; i<6; ++i) tempmat[i] = malloc(len*18*
sizeof(
float));
1390 act_path_coeff.
one = path_coeff[0];
1391 act_path_coeff.
naik = path_coeff[1];
1392 act_path_coeff.
three = path_coeff[2];
1393 act_path_coeff.
five = path_coeff[3];
1394 act_path_coeff.
seven = path_coeff[4];
1395 act_path_coeff.
lepage = path_coeff[5];
1398 doHisqStaplesForceCPU<double>(param.
X,
1407 doHisqStaplesForceCPU<float>(param.
X,
1418 for(
int i=0; i<6; ++i){
1426 template<
class Real,
int oddBit>
1429 const Real*
const oprod,
1430 const Real*
const link,
1446 int idx = half_lattice_index;
1453 point_d = new_mem_idx >> 1;
1456 point_e = new_mem_idx >> 1;
1459 point_b = new_mem_idx >> 1;
1462 point_a = new_mem_idx >> 1;
1473 colorMatV = de_link*ef_link*colorMatZ
1474 - de_link*colorMatY*bc_link
1482 template<
class Real>
1484 const Real*
const oprod,
1485 const Real*
const link,
1490 for(
int dir=0; dir<4; ++
dir) volume *= dim[dir];
1491 const int half_volume = volume/2;
1494 for(
int site=0; site<half_volume; ++site){
1495 computeLongLinkSite<Real,0>(site,
1504 for(
int site=0; site<half_volume; ++site){
1505 computeLongLinkSite<Real,1>(site,
1524 computeLongLinkField<float>(param.
X,
1530 computeLongLinkField<double>(param.
X,
1534 (
double*)newOprod->
Gauge_p());
1543 template<
class Real,
int oddBit>
1546 const Real*
const oprod,
1547 const Real*
const link,
1557 int idx = half_lattice_index_ex;
1559 int idx = half_lattice_index;
1564 const Real
coeff = (oddBit) ? -1 : 1;
1565 colorMatY = linkW*colorMatX;
1571 template <
class Real>
1573 const Real*
const oprod,
1574 const Real*
const link,
1578 int volume = dim[0]*dim[1]*dim[2]*dim[3];
1579 const int half_volume = volume/2;
1583 for(
int site=0; site<half_volume; ++site){
1584 completeForceSite<Real,0>(site,
1592 for(
int site=0; site<half_volume; ++site){
1593 completeForceSite<Real,1>(site,
1611 completeForceField<float>(param.
X,
1617 completeForceField<double>(param.
X,