// Early-exit helper. Expands to `if(err) return;`, so it requires a variable
// named `err` in scope and can only be used inside a function returning void.
// The expansion already ends in its own semicolon.
#define RETURN_IF_ERR if(err) return;
// Maps a direction code to its counterpart `7 - dir`
// (0<->7, 1<->6, 2<->5, 3<->4). Applying it twice yields the input again.
// No range check is performed on `dir`.
static int OPP_DIR(int dir)
{
  return 7 - dir;
}
24 static const int result = 1;
30 static const int result = -1;
34 template<
class T,
class U>
79 typedef std::complex<float>
Type;
85 typedef std::complex<float>
Type;
91 typedef std::complex<float>
Type;
97 typedef std::complex<double>
Type;
103 typedef std::complex<double>
Type;
109 typedef std::complex<double>
Type;
115 typedef std::complex<double>
Type;
121 typedef std::complex<double>
Type;
127 typedef std::complex<double>
Type;
130 template<
int N,
class T>
145 template<
int N,
class T>
148 for(
int i=0; i<N; ++i){
149 for(
int j=0; j<N; ++j){
150 data[i][j] =
static_cast<T
>(0);
155 template<
int N,
class T>
158 for(
int i=0; i<N; ++i){
159 for(
int j=0; j<N; ++j){
160 data[i][j] = mat.
data[i][j];
165 template<
int N,
class T>
171 template<
int N,
class T>
177 template<
int N,
class T>
180 for(
int i=0; i<N; ++i){
181 for(
int j=0; j<N; ++j){
182 data[i][j] += mat.
data[i][j];
188 template<
int N,
class T>
191 for(
int i=0; i<N; ++i){
192 for(
int j=0; j<N; ++j){
193 data[i][j] -= mat.
data[i][j];
199 template<
int N,
class T>
207 template<
int N,
class T>
215 template<
int N,
class T>
219 for(
int i=0; i<N; ++i){
220 for(
int j=0; j<N; ++j){
221 result(i,j) =
static_cast<T
>(0);
222 for(
int k=0; k<N; ++k){
223 result(i,j) += a(i,k)*b(k,j);
230 template<
int N,
class T>
234 for(
int i=0; i<N; ++i){
235 for(
int j=0; j<N; ++j){
242 template<
int N,
class T>
246 for(
int i=0; i<N; ++i){
247 for(
int j=0; j<N; ++j){
248 result(i,j) =
mat(j,i);
254 template<
int N,
class T,
class U>
260 for(
int i=0; i<N; ++i){
261 for(
int j=0; j<N; ++j){
262 result(i,j) = scalar*
mat(i,j);
268 template<
int N,
class T,
class U>
274 template<
int N,
class T>
279 for(
int i=0; i<N; ++i){
280 id(i,i) =
static_cast<T
>(1);
286 template<
int N,
class T>
296 template<
int N,
class T>
297 std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)
299 for(
int i=0; i<N; ++i){
300 for(
int j=0; j<N; ++j){
303 if(i<N-1) os << std::endl;
314 const int half_volume;
328 void storeMatrixToMomentumField(
const Matrix<3, std::complex<Real> >&
mat,
int oddBit,
int dir,
int half_lattice_index, Real
coeff, Real*
const)
const;
329 Real getData(
const Real*
const field,
int idx,
int dir,
int oddBit,
int offset,
int hfv)
const;
330 void addData(Real*
const field,
int idx,
int dir,
int oddBit,
int offset, Real,
int hfv)
const;
331 int half_idx_conversion_ex2normal(
int half_lattice_index,
const int*
dim,
int oddBit)
const ;
332 int half_idx_conversion_normal2ex(
int half_lattice_index,
const int*
dim,
int oddBit)
const ;
350 int sid = half_lattice_index_ex;
361 int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
380 int sid = half_lattice_index;
391 int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;
400 return field[(4*hfv*oddBit +4*idx + dir)*18+offset];
402 return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
409 field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;
411 ((Real**)field)[dir][(hfv*oddBit+
idx)*18 +offset] += v;
420 int half_lattice_index,
421 Matrix<3, std::complex<Real> >*
const mat
431 for(
int i=0; i<3; ++i){
432 for(
int j=0; j<3; ++j){
433 (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
434 (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
444 int half_lattice_index,
445 Matrix<3, std::complex<Real> >*
const mat
456 for(
int i=0; i<3; ++i){
457 for(
int j=0; j<3; ++j){
458 (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
459 (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
468 int half_lattice_index,
469 Real*
const field)
const
478 for(
int i=0; i<3; ++i){
479 for(
int j=0; j<3; ++j){
480 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).real();
481 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).imag();
490 int half_lattice_index,
492 Real*
const field)
const
499 Real*
const local_field = field + (oddBit*hfv + half_lattice_index)*18;
503 for(
int i=0; i<3; ++i){
504 for(
int j=0; j<3; ++j){
505 local_field[offset++] += coeff*
mat(i,j).real();
506 local_field[offset++] += coeff*
mat(i,j).imag();
517 int half_lattice_index,
519 Real*
const field)
const
530 for(
int i=0; i<3; ++i){
531 for(
int j=0; j<3; ++j){
533 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).real(), hfv);
536 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).imag(), hfv);
547 int half_lattice_index,
549 Real*
const field)
const
551 Real*
const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
552 mom_field[0] = (
mat(0,1).real() -
mat(1,0).real())*0.5*coeff;
553 mom_field[1] = (
mat(0,1).imag() +
mat(1,0).imag())*0.5*coeff;
555 mom_field[2] = (
mat(0,2).real() -
mat(2,0).real())*0.5*coeff;
556 mom_field[3] = (
mat(0,2).imag() +
mat(2,0).imag())*0.5*coeff;
558 mom_field[4] = (
mat(1,2).real() -
mat(2,1).real())*0.5*coeff;
559 mom_field[5] = (
mat(1,2).imag() +
mat(2,1).imag())*0.5*coeff;
561 const Real temp = (
mat(0,0).imag() +
mat(1,1).imag() +
mat(2,2).imag())*0.3333333333333333333;
562 mom_field[6] = (
mat(0,0).imag() - temp)*coeff;
563 mom_field[7] = (
mat(1,1).imag() - temp)*coeff;
564 mom_field[8] = (
mat(2,2).imag() - temp)*coeff;
580 void getCoordsFromHalfIndex(
int half_index,
int coord[4]);
581 void getCoordsFromFullIndex(
int full_index,
int coord[4]);
582 void cache(
int half_lattice_index);
588 int getFullFromHalfIndex(
int half_lattice_index);
589 int getNeighborFromFullIndex(
int full_lattice_index,
int dir,
int* err=NULL);
595 for(
int dir=0; dir<4; ++dir){
596 local_dim[dir] = dim[dir];
597 volume *= local_dim[dir];
607 int E1 = local_dim[0]+4;
608 int E2 = local_dim[1]+4;
609 int E3 = local_dim[2]+4;
613 int z1 = half_lattice_index/
E1h;
614 int x1h = half_lattice_index - z1*
E1h;
616 coord[1] = z1 - z2*
E2;
618 coord[2] = z2 - coord[3]*
E3;
619 int x1odd = (coord[1] + coord[2] + coord[3] +
oddBit) & 1;
620 coord[0] = 2*x1h +
x1odd;
622 int half_dim_0 = local_dim[0]/2;
623 int z1 = half_lattice_index/half_dim_0;
624 int x1h = half_lattice_index - z1*half_dim_0;
625 int z2 = z1/local_dim[1];
626 coord[1] = z1 - z2*local_dim[1];
627 coord[3] = z2/local_dim[2];
628 coord[2] = z2 - coord[3]*local_dim[2];
629 int x1odd = (coord[1] + coord[2] + coord[3] +
oddBit) & 1;
630 coord[0] = 2*x1h +
x1odd;
639 int D1=local_dim[0]+4;
640 int D2=local_dim[1]+4;
641 int D3=local_dim[2]+4;
652 int z1 = full_lattice_index/
D1;
653 coord[0] = full_lattice_index - z1*
D1;
655 coord[1] = z1 - z2*
D2;
657 coord[2] = z2 - coord[3]*
D3;
667 half_index = half_lattice_index;
668 getCoordsFromHalfIndex(half_lattice_index, full_coord);
669 int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] +
oddBit) & 1;
670 full_index = 2*half_lattice_index +
x1odd;
677 if(half_index != half_lattice_index) cache(half_lattice_index);
689 getCoordsFromFullIndex(full_lattice_index, coord);
691 int E1 = local_dim[0] + 4;
692 int E2 = local_dim[1] + 4;
693 int E3 = local_dim[2] + 4;
694 int E4 = local_dim[3] + 4;
697 neighbor_index = full_lattice_index + 1;
698 if(err && (coord[0] == E1-1) ) *err = 1;
701 neighbor_index = full_lattice_index +
E1;
702 if(err && (coord[1] == E2-1) ) *err = 1;
705 neighbor_index = full_lattice_index + E2*
E1;
706 if(err && (coord[2] == E3-1) ) *err = 1;
709 neighbor_index = full_lattice_index + E3*E2*
E1;
710 if(err && (coord[3] == E4-1) ) *err = 1;
713 neighbor_index = full_lattice_index - 1;
714 if(err && (coord[0] == 0) ) *err = 1;
717 neighbor_index = full_lattice_index -
E1;
718 if(err && (coord[1] == 0) ) *err = 1;
721 neighbor_index = full_lattice_index - E2*
E1;
722 if(err && (coord[2] == 0) ) *err = 1;
725 neighbor_index = full_lattice_index - E3*E2*
E1;
726 if(err && (coord[3] == 0) ) *err = 1;
729 errorQuda(
"Neighbor index could not be determined\n");
737 neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
740 neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
743 neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
746 neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
749 neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
752 neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
755 neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
758 neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
761 errorQuda(
"Neighbor index could not be determined\n");
767 return neighbor_index;
777 template<
class Real,
int oddBit>
779 int half_lattice_index,
780 const Real*
const oprod,
790 int idx = half_lattice_index;
800 const Real*
const oprod,
805 for(
int dir=0; dir<4; ++dir) volume *= dim[dir];
806 const int half_volume = volume/2;
808 for(
int site=0; site<half_volume; ++site){
809 computeOneLinkSite<Real,0>(
dim, site,
816 for(
int site=0; site<half_volume; ++site){
817 computeOneLinkSite<Real,1>(
dim, site,
834 template<
class Real,
int oddBit>
837 const Real*
const oprod,
838 const Real*
const Qprev,
839 const Real*
const link,
850 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
860 point_d = new_mem_idx >> 1;
863 point_c = new_mem_idx >> 1;
866 point_b = new_mem_idx >> 1;
868 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
869 bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
870 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
896 colorMatY =
conj(colorMatY);
902 colorMatW = (!mu_positive) ? bc_link*colorMatY :
conj(bc_link)*colorMatY;
905 colorMatY = (sig_positive) ? ab_link*colorMatW :
conj(ab_link)*colorMatW;
912 ad_link =
conj(ad_link);
917 if(sig_positive) colorMatY = colorMatW*ad_link;
920 if(Qmu || sig_positive){
922 colorMatX= colorMatY*ad_link;
925 if(sig_positive) colorMatY = colorMatW*colorMatX;
936 const Real*
const oprod,
937 const Real*
const Qprev,
938 const Real*
const link,
949 for(
int dir=0; dir<4; ++dir) volume *= dim[dir];
951 const int loop_count =
Vh_ex;
953 const int loop_count = volume/2;
959 for(
int site=0; site<loop_count; ++site){
960 computeMiddleLinkSite<Real, 0>(site,
dim,
967 for(
int site=0; site<loop_count; ++site){
968 computeMiddleLinkSite<Real,1>(site,
dim,
980 template<
class Real,
int oddBit>
983 const Real*
const P3,
984 const Real*
const Qprod,
985 const Real*
const link,
995 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
1004 point_d = new_mem_idx >> 1;
1005 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1018 ad_link_nbr_idx = half_lattice_index;
1021 colorMatW = (mu_positive) ? ad_link*colorMatY :
conj(ad_link)*colorMatY;
1026 Real
mycoeff = ( (sig_positive &&
oddBit) || (!sig_positive && !
oddBit) ) ? coeff : -coeff;
1031 colorMatW = colorMatY*colorMatX;
1035 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1047 colorMatW =
conj(colorMatY);
1057 template<
class Real>
1059 const Real*
const P3,
1060 const Real*
const Qprod,
1061 const Real*
const link,
1065 Real*
const newOprod
1070 for(
int dir=0; dir<4; ++dir) volume *= dim[dir];
1072 const int loop_count =
Vh_ex;
1074 const int loop_count = volume/2;
1078 for(
int site=0; site<loop_count; ++site){
1079 computeSideLinkSite<Real,0>(site,
dim,
1083 ls, shortP, newOprod);
1086 for(
int site=0; site<loop_count; ++site){
1087 computeSideLinkSite<Real,1>(site,
dim,
1091 ls, shortP, newOprod);
1101 template<
class Real,
int oddBit>
1104 const Real*
const oprod,
1105 const Real*
const Qprev,
1106 const Real*
const link,
1111 Real*
const newOprod)
1114 const bool mu_positive = (
GOES_FORWARDS(mu)) ?
true :
false;
1115 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
1128 point_d = new_mem_idx >> 1;
1131 point_c = new_mem_idx >> 1;
1134 point_b = new_mem_idx >> 1;
1135 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1139 Real
mycoeff = ( (sig_positive &&
oddBit) || (!sig_positive && !
oddBit) ) ? coeff : -coeff;
1147 colorMatZ =
conj(bc_link)*colorMatY;
1151 colorMatY = colorMatX*ad_link;
1152 colorMatW = colorMatZ*colorMatY;
1161 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
1162 colorMatW = colorMatY*colorMatX;
1164 colorMatW = ad_link*colorMatY;
1173 if(sig_positive) colorMatW = colorMatX*
conj(ad_link);
1174 colorMatZ = bc_link*colorMatY;
1176 colorMatY = colorMatZ*colorMatW;
1186 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
1187 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1190 colorMatW =
conj(ad_link)*colorMatY;
1198 template<
class Real>
1200 const Real*
const oprod,
1201 const Real*
const Qprev,
1202 const Real*
const link,
1206 Real*
const newOprod)
1209 for(
int dir=0; dir<4; ++dir) volume *= dim[dir];
1211 const int loop_count =
Vh_ex;
1213 const int loop_count = volume/2;
1217 for(
int site=0; site<loop_count; ++site){
1219 computeAllLinkSite<Real,0>(site,
dim,
1227 for(
int site=0; site<loop_count; ++site){
1228 computeAllLinkSite<Real, 1>(site,
dim,
// Named aliases for the six entries of `tempmat`.
// These are plain textual substitutions: from here on, every use of one of
// these identifiers is rewritten to the matching `tempmat[i]` element, so an
// indexable object called `tempmat` must be in scope wherever they appear.
#define Pmu tempmat[0]
#define P3 tempmat[1]
#define P5 tempmat[2]
#define Pnumu tempmat[3]
#define Qmu tempmat[4]
#define Qnumu tempmat[5]
1246 template<
class Real>
1258 template<
class Real>
1266 Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage,
coeff;
1268 OneLink = staple_coeff.
one;
1269 ThreeSt = staple_coeff.
three;
1270 FiveSt = staple_coeff.
five;
1271 SevenSt = staple_coeff.
seven;
1272 Lepage = staple_coeff.
lepage;
1284 for(
int mu=0;
mu<8; ++
mu){
1287 computeMiddleLinkField<Real>(
dim,
1293 for(
int nu=0; nu<8; ++nu){
1295 || nu==sig || nu==
OPP_DIR(sig) )
continue;
1297 computeMiddleLinkField<Real>(
dim,
1304 for(
int rho=0; rho<8; ++rho){
1305 if( rho == sig || rho ==
OPP_DIR(sig)
1306 || rho == mu || rho ==
OPP_DIR(mu)
1307 || rho == nu || rho ==
OPP_DIR(nu) )
1312 if(FiveSt != 0)coeff = SevenSt/FiveSt;
else coeff = 0;
1313 computeAllLinkField<Real>(
dim,
1321 if(ThreeSt != 0)coeff = FiveSt/ThreeSt;
else coeff = 0;
1322 computeSideLinkField<Real>(
dim,
1332 if(staple_coeff.
lepage != 0.){
1333 computeMiddleLinkField<Real>(
dim,
1339 if(ThreeSt != 0)coeff = Lepage/ThreeSt;
else coeff = 0;
1340 computeSideLinkField<Real>(
dim,
1347 computeSideLinkField<Real>(
dim,
1349 sig,
mu, ThreeSt, 0.,
1373 for(
int dir=0; dir<4; ++dir) volume *= param.
X[dir];
1383 for(
int i=0; i<6; ++i) tempmat[i] = malloc(len*18*
sizeof(
double));
1385 for(
int i=0; i<6; ++i) tempmat[i] = malloc(len*18*
sizeof(
float));
1389 act_path_coeff.
one = path_coeff[0];
1390 act_path_coeff.
naik = path_coeff[1];
1391 act_path_coeff.
three = path_coeff[2];
1392 act_path_coeff.
five = path_coeff[3];
1393 act_path_coeff.
seven = path_coeff[4];
1394 act_path_coeff.
lepage = path_coeff[5];
1397 doHisqStaplesForceCPU<double>(param.
X,
1406 doHisqStaplesForceCPU<float>(param.
X,
1417 for(
int i=0; i<6; ++i){
1425 template<
class Real,
int oddBit>
1428 const Real*
const oprod,
1429 const Real*
const link,
1445 int idx = half_lattice_index;
1452 point_d = new_mem_idx >> 1;
1455 point_e = new_mem_idx >> 1;
1458 point_b = new_mem_idx >> 1;
1461 point_a = new_mem_idx >> 1;
1472 colorMatV = de_link*ef_link*colorMatZ
1473 - de_link*colorMatY*bc_link
1474 + colorMatX*ab_link*bc_link;
1481 template<
class Real>
1483 const Real*
const oprod,
1484 const Real*
const link,
1489 for(
int dir=0; dir<4; ++dir) volume *= dim[dir];
1490 const int half_volume = volume/2;
1493 for(
int site=0; site<half_volume; ++site){
1494 computeLongLinkSite<Real,0>(site,
1503 for(
int site=0; site<half_volume; ++site){
1504 computeLongLinkSite<Real,1>(site,
1523 computeLongLinkField<float>(param.
X,
1529 computeLongLinkField<double>(param.
X,
1533 (
double*)newOprod->
Gauge_p());
1542 template<
class Real,
int oddBit>
1545 const Real*
const oprod,
1546 const Real*
const link,
1556 int idx = half_lattice_index_ex;
1558 int idx = half_lattice_index;
1564 colorMatY = linkW*colorMatX;
1570 template <
class Real>
1572 const Real*
const oprod,
1573 const Real*
const link,
1577 int volume = dim[0]*dim[1]*dim[2]*dim[3];
1578 const int half_volume = volume/2;
1582 for(
int site=0; site<half_volume; ++site){
1583 completeForceSite<Real,0>(site,
1591 for(
int site=0; site<half_volume; ++site){
1592 completeForceSite<Real,1>(site,
1610 completeForceField<float>(param.
X,
1616 completeForceField<double>(param.
X,
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
void completeForceSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, const LoadStore< Real > &ls, Real *const mom)
__device__ __host__ T const & operator()(int i, int j) const
void addMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real coeff, Real *const) const
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
Matrix< N, T > operator()() const
Matrix & operator-=(const Matrix< N, T > &mat)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
int getFullFromHalfIndex(int half_lattice_index)
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
int getNeighborFromFullIndex(int full_lattice_index, int dir, int *err=NULL)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const
void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)
void computeSideLinkSite(int half_lattice_index, const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
__host__ __device__ complex< ValueType > operator-(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
void computeMiddleLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, const LoadStore< Real > &ls, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
void storeMatrixToMomentumField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const
__constant__ double coeff
void computeOneLinkSite(const int dim[4], int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const
Matrix< N, T > operator()() const
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Matrix< 3, std::complex< Real > > Type
Main header file for the QUDA library.
Matrix & operator+=(const Matrix< N, T > &mat)
void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix< 3, std::complex< Real > > *const mat) const
Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
Matrix< N, T > transpose(const Matrix< N, std::complex< T > > &mat)
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
#define GOES_FORWARDS(dir)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, Real *const output)
__host__ __device__ ValueType conj(ValueType x)
RealTypeId< RealA >::Type mycoeff
__host__ __device__ complex< ValueType > operator+(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
void computeAllLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
Locator(const int dim[4])
void hisqCompleteForceCPU(const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *mom)
void storeMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real *const field) const
void doHisqStaplesForceCPU(const int dim[4], PathCoefficients< double > staple_coeff, Real *oprod, Real *link, Real **tempmat, Real *newOprod)
int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const