QUDA: quda/tests/hisq_force_reference.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <math.h>
00004 #include <string.h>
00005 
00006 #include "quda.h"
00007 #include "test_util.h"
00008 #include "misc.h"
00009 #include "hisq_force_reference.h"
00010 
00011 extern int Z[4];
00012 extern int V;
00013 extern int Vh;
00014 
00015 
00016 #define CADD(a,b,c) { (c).real = (a).real + (b).real;   \
00017         (c).imag = (a).imag + (b).imag; }
00018 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \
00019                       (c).imag = (a).real*(b).imag + (a).imag*(b).real; }
00020 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; }
00021 
00022 /* c = a* * b */
00023 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
00024                         (c).imag = (a).real*(b).imag - (a).imag*(b).real; }
00025 
00026 /* c = a * b* */
00027 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
00028                         (c).imag = (a).imag*(b).real - (a).real*(b).imag; }
00029 
00030 #define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; }
00031 
00032 typedef struct {   
00033     float real;    
00034     float imag; 
00035 } fcomplex;  
00036 
00037 /* specific for double complex */
00038 typedef struct {
00039     double real;
00040     double imag;
00041 } dcomplex;
00042 
00043 typedef struct { fcomplex e[3][3]; } fsu3_matrix;
00044 typedef struct { fcomplex c[3]; } fsu3_vector;
00045 typedef struct { dcomplex e[3][3]; } dsu3_matrix;
00046 typedef struct { dcomplex c[3]; } dsu3_vector;
00047 
00048 typedef struct { 
00049     fcomplex m01,m02,m12; 
00050     float m00im,m11im,m22im; 
00051     float space; 
00052 } fanti_hermitmat;
00053 
00054 typedef struct { 
00055     dcomplex m01,m02,m12; 
00056     double m00im,m11im,m22im; 
00057     double space; 
00058 } danti_hermitmat;
00059 
00060 typedef struct { fsu3_vector h[2]; } fhalf_wilson_vector;
00061 typedef struct { dsu3_vector h[2]; } dhalf_wilson_vector;
00062 
00063 
00064 template<typename su3_matrix>
00065 static void  
00066 su3_adjoint( su3_matrix *a, su3_matrix *b )
00067 {
00068     int i,j;
00069     for(i=0;i<3;i++)for(j=0;j<3;j++){
00070             CONJG( a->e[j][i], b->e[i][j] );
00071         }
00072 }
00073 
00074 
00075 template<typename su3_matrix, typename anti_hermitmat>
00076 static void
00077 make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) 
00078 {
00079     
00080     typeof(ah3->m00im) temp =
00081         (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333333333333;
00082     ah3->m00im = m3->e[0][0].imag - temp;
00083     ah3->m11im = m3->e[1][1].imag - temp;
00084     ah3->m22im = m3->e[2][2].imag - temp;
00085     ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5;
00086     ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5;
00087     ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5;
00088     ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5;
00089     ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5;
00090     ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5;
00091     
00092 }
00093 
00094 template <typename anti_hermitmat, typename su3_matrix>
00095 static void
00096 uncompress_anti_hermitian(anti_hermitmat *mat_antihermit,
00097                           su3_matrix *mat_su3 )
00098 {
00099     typeof(mat_antihermit->m00im) temp1;
00100     mat_su3->e[0][0].imag=mat_antihermit->m00im;
00101     mat_su3->e[0][0].real=0.;
00102     mat_su3->e[1][1].imag=mat_antihermit->m11im;
00103     mat_su3->e[1][1].real=0.;
00104     mat_su3->e[2][2].imag=mat_antihermit->m22im;
00105     mat_su3->e[2][2].real=0.;
00106     mat_su3->e[0][1].imag=mat_antihermit->m01.imag;
00107     temp1=mat_antihermit->m01.real;
00108     mat_su3->e[0][1].real=temp1;
00109     mat_su3->e[1][0].real= -temp1;
00110     mat_su3->e[1][0].imag=mat_antihermit->m01.imag;
00111     mat_su3->e[0][2].imag=mat_antihermit->m02.imag;
00112     temp1=mat_antihermit->m02.real;
00113     mat_su3->e[0][2].real=temp1;
00114     mat_su3->e[2][0].real= -temp1;
00115     mat_su3->e[2][0].imag=mat_antihermit->m02.imag;
00116     mat_su3->e[1][2].imag=mat_antihermit->m12.imag;
00117     temp1=mat_antihermit->m12.real;
00118     mat_su3->e[1][2].real=temp1;
00119     mat_su3->e[2][1].real= -temp1;
00120     mat_su3->e[2][1].imag=mat_antihermit->m12.imag;    
00121 }
00122 
00123 template <typename su3_matrix, typename Float>
00124 static void
00125 scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c)
00126 {    
00127     int i,j;
00128     for(i=0;i<3;i++){
00129         for(j=0;j<3;j++){           
00130             c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real;
00131             c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag;
00132         }
00133     }
00134 }
00135 
00136 template <typename su3_matrix, typename Float>
00137 static void
00138 scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c)
00139 {
00140     int i,j;
00141     for(i=0;i<3;i++){
00142         for(j=0;j<3;j++){           
00143             c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real;
00144             c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag;
00145         }
00146     }    
00147 }
00148 
00149 
00150 template<typename su3_matrix, typename su3_vector>
00151 static void
00152 mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c  )
00153 {
00154     int i,j;
00155     typeof(a->e[0][0]) x,y;
00156     for(i=0;i<3;i++){
00157         x.real=x.imag=0.0;
00158         for(j=0;j<3;j++){
00159             CMUL( a->e[i][j] , b->c[j] , y );
00160             CSUM( x , y );
00161         }
00162         c->c[i]=x;
00163     }
00164 }
00165 template<typename su3_matrix, typename su3_vector>
00166 static void
00167 mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c )
00168 {
00169     int i,j;
00170     typeof(a->e[0][0]) x,y,z;
00171     for(i=0;i<3;i++){
00172         x.real=x.imag=0.0;
00173         for(j=0;j<3;j++){
00174             CONJG( a->e[j][i], z );
00175             CMUL( z , b->c[j], y );
00176             CSUM( x , y );
00177         }
00178         c->c[i] = x;
00179     }
00180 }
00181 
00182 template<typename su3_vector, typename su3_matrix>
00183 static void
00184 su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c )
00185 {
00186     int i,j;
00187     for(i=0;i<3;i++)for(j=0;j<3;j++){
00188             CMUL_J( a->c[i], b->c[j], c->e[i][j] );
00189         }
00190 }
00191 
00192 template<typename su3_vector, typename Real>
00193 static void 
00194 scalar_mult_add_su3_vector(su3_vector *a, su3_vector *b, Real s,
00195                            su3_vector *c)
00196 {    
00197     int i;
00198     for(i=0;i<3;i++){
00199         c->c[i].real = a->c[i].real + s*b->c[i].real;
00200         c->c[i].imag = a->c[i].imag + s*b->c[i].imag;
00201     }    
00202 }
00203 
00204 
00205 
00206 template < typename su3_matrix>
00207 static void
00208 print_su3_matrix(su3_matrix *a)
00209 {
00210     int i, j;
00211     for(i=0;i < 3; i++){
00212         for(j=0;j < 3;j++){
00213             printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag);
00214         }
00215         printf("\n");
00216     }
00217     
00218 }
00219 
00220 
00221 
00222 // Add a matrix multiplication function
00223 template<typename su3_matrix>
00224 static void
00225 matrix_mult_nn(su3_matrix* a, su3_matrix* b, su3_matrix* c){
00226   // c = a*b
00227   typeof(c->e[0][0]) x;
00228   for(int i=0; i<3; i++){
00229     for(int j=0; j<3; j++){     
00230       c->e[i][j].real = 0.;
00231       c->e[i][j].imag = 0.;
00232       for(int k=0; k<3; k++){   
00233         CMUL(a->e[i][k],b->e[k][j],x);
00234         c->e[i][j].real += x.real;
00235         c->e[i][j].imag += x.imag;
00236       }         
00237     }   
00238   }
00239   return;
00240 }
00241 
00242 
00243 template<typename su3_matrix>
00244 static void
00245 matrix_mult_an(su3_matrix* a, su3_matrix* b, su3_matrix* c){
00246   // c = (a^{\dagger})*b
00247   typeof(c->e[0][0]) x;
00248   for(int i=0; i<3; i++){
00249     for(int j=0; j<3; j++){     
00250       c->e[i][j].real = 0.;
00251       c->e[i][j].imag = 0.;
00252       for(int k=0; k<3; k++){   
00253         CMULJ_(a->e[k][i],b->e[k][j],x);
00254         c->e[i][j].real += x.real;
00255         c->e[i][j].imag += x.imag;
00256       }         
00257     }   
00258   }
00259   return;
00260 }
00261 
00262 
00263 
00264 template<typename su3_matrix>
00265 static void
00266 matrix_mult_na(su3_matrix* a, su3_matrix* b, su3_matrix* c){
00267   // c = a*b^{\dagger}
00268   typeof(c->e[0][0]) x;
00269   for(int i=0; i<3; i++){
00270     for(int j=0; j<3; j++){
00271       c->e[i][j].real = 0.; c->e[i][j].imag = 0.;
00272       for(int k=0; k<3; k++){
00273         CMUL_J(a->e[i][k],b->e[j][k],x);
00274         c->e[i][j].real += x.real;
00275         c->e[i][j].imag += x.imag;
00276       }
00277     }
00278   }
00279   return;
00280 }
00281 
00282 
00283 
00284 
00285 template <typename su3_matrix, typename anti_hermitmat, typename Float>
00286 static void
00287 update_mom(anti_hermitmat* momentum, int dir, su3_matrix* sitelink,
00288            su3_matrix* staple, Float eb3)
00289 {
00290     int i;
00291     for(i=0;i <V; i++){
00292         su3_matrix tmat1;
00293         su3_matrix tmat2;
00294         su3_matrix tmat3;
00295 
00296         su3_matrix* lnk = sitelink + 4*i+dir;
00297         su3_matrix* stp = staple + i;
00298         anti_hermitmat* mom = momentum + 4*i+dir;
00299         
00300         mult_su3_na(lnk, stp, &tmat1);
00301         uncompress_anti_hermitian(mom, &tmat2);
00302         
00303         scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3);
00304         make_anti_hermitian(&tmat3, mom);
00305         
00306     }
00307     
00308 }
00309 
00310 
00311 template<typename half_wilson_vector, typename su3_matrix>
00312 static void 
00313 u_shift_hw(half_wilson_vector *src, half_wilson_vector *dest, int dir, su3_matrix* sitelink ) 
00314 {
00315     int i ;
00316     int dx[4];
00317     
00318     dx[3]=dx[2]=dx[1]=dx[0]=0;
00319     
00320     if(GOES_FORWARDS(dir)){     
00321         dx[dir]=1;      
00322         for(i=0;i < V; i++){
00323             int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00324             half_wilson_vector* hw = src + nbr_idx;
00325             su3_matrix* link = sitelink + i*4 + dir;
00326             mult_su3_mat_vec(link, &hw->h[0], &dest[i].h[0]);
00327             mult_su3_mat_vec(link, &hw->h[1], &dest[i].h[1]);       
00328         }       
00329     }else{
00330         dx[OPP_DIR(dir)]=-1;
00331         for(i=0;i < V; i++){
00332             int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00333             half_wilson_vector* hw = src + nbr_idx;
00334             su3_matrix* link = sitelink + nbr_idx*4 + OPP_DIR(dir);
00335             mult_adj_su3_mat_vec(link, &hw->h[0], &dest[i].h[0]);
00336             mult_adj_su3_mat_vec(link, &hw->h[1], &dest[i].h[1]);          
00337         }       
00338     }
00339 }
00340 
00341 
00342 
00343 template<typename half_wilson_vector, typename su3_matrix>
00344 static void 
00345 shifted_outer_prod(half_wilson_vector *src, su3_matrix* dest, int dir)
00346 {
00347     
00348     int i;
00349     int dx[4];
00350     
00351     dx[3]=dx[2]=dx[1]=dx[0]=0;
00352     
00353     if(GOES_FORWARDS(dir)){     
00354         dx[dir]=1;      
00355     }else{ dx[OPP_DIR(dir)]=-1; }
00356 
00357     for(i=0;i < V; i++){
00358       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00359       half_wilson_vector* hw = src + nbr_idx;
00360       su3_projector( &src[i].h[0], &(hw->h[0]), &dest[i]);
00361     }   
00362 }
00363 
00364 
00365 template<typename half_wilson_vector, typename su3_matrix>
00366 static void 
00367 forward_shifted_outer_prod(half_wilson_vector *src, su3_matrix* dest, int dir)
00368 {
00369 
00370   int i;
00371   int dx[4];
00372     
00373   dx[3]=dx[2]=dx[1]=dx[0]=0;
00374     
00375   if(GOES_FORWARDS(dir)){       
00376     dx[dir]=1;  
00377   }else{ dx[OPP_DIR(dir)]=-1; }
00378 
00379   for(i=0;i < V; i++){
00380     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00381     half_wilson_vector* hw = src + nbr_idx;
00382     //su3_projector( &src[i].h[0], &(hw->h[0]), &dest[i]);
00383     su3_projector( &(hw->h[0]), &src[i].h[0], &dest[i]);
00384   }     
00385 
00386   return;
00387 }
00388 
00389 
00390 
00391 template <typename half_wilson_vector, typename su3_matrix>
00392 static void
00393 computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix* dest)
00394 {
00395   int dx[4];
00396   for(int i=0; i<V; ++i){
00397     for(int dir=0; dir<4; ++dir){
00398       dx[3]=dx[2]=dx[1]=dx[0]=0;
00399       dx[dir] = 1;
00400       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00401       half_wilson_vector* hw = src + nbr_idx;
00402       su3_projector( &(hw->h[0]), &src[i].h[0], &dest[i*4 + dir]);
00403     } // dir
00404   } // i
00405   return;
00406 }
00407 
00408 
00409 template <typename half_wilson_vector, typename su3_matrix>
00410 static void
00411 computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix* dest, size_t nhops)
00412 {
00413   int dx[4];
00414   for(int i=0; i<V; ++i){
00415     for(int dir=0; dir<4; ++dir){
00416       dx[3]=dx[2]=dx[1]=dx[0]=0;
00417       dx[dir] = nhops;
00418       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00419       half_wilson_vector* hw = src + nbr_idx;
00420       su3_projector( &(hw->h[0]), &src[i].h[0], &dest[i*4 + dir]);
00421     } // dir
00422   } // i
00423   return;
00424 }
00425 
00426 
00427 
00428 
00429 void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision)
00430 {
00431   if(precision == QUDA_SINGLE_PRECISION){
00432     computeLinkOrderedOuterProduct((fhalf_wilson_vector*)src,(fsu3_matrix*)dst);
00433   }else{
00434     computeLinkOrderedOuterProduct((dhalf_wilson_vector*)src,(dsu3_matrix*)dst);
00435   }
00436   return;
00437 }
00438 
00439 void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, size_t nhops)
00440 {
00441   if(precision == QUDA_SINGLE_PRECISION){
00442     computeLinkOrderedOuterProduct((fhalf_wilson_vector*)src,(fsu3_matrix*)dst, nhops);
00443   }else{
00444     computeLinkOrderedOuterProduct((dhalf_wilson_vector*)src,(dsu3_matrix*)dst, nhops);
00445   }
00446   return;
00447 }
00448 
00449 
00450 
00451 template<typename half_wilson_vector, typename su3_matrix> 
00452 static void shiftedOuterProduct(half_wilson_vector *src, su3_matrix* dest){
00453   for(int dir=0; dir<4; dir++){
00454     shifted_outer_prod(src, &dest[dir*V], OPP_DIR(dir));
00455   }
00456 }
00457 
00458 template<typename half_wilson_vector, typename su3_matrix> 
00459 static void forwardShiftedOuterProduct(half_wilson_vector *src, su3_matrix* dest){
00460   for(int dir=0; dir<4; dir++){
00461     forward_shifted_outer_prod(src, &dest[dir*V], dir);
00462   }
00463 }
00464 
00465 
00466 void computeHisqOuterProduct(void* src, void* dest, QudaPrecision precision){
00467   if(precision == QUDA_SINGLE_PRECISION){
00468     forwardShiftedOuterProduct((fhalf_wilson_vector*)src,(fsu3_matrix*)dest);   
00469   }else{
00470     forwardShiftedOuterProduct((dhalf_wilson_vector*)src,(dsu3_matrix*)dest);   
00471   }
00472 }
00473 
00474 
00475 // Note that the hisq routines do not involve half-wilson vectors, 
00476 // but instead require colour matrix shifts
00477 
00478 
00479 template<typename su3_matrix> 
00480 static void 
00481 u_shift_mat(su3_matrix *src, su3_matrix *dest, int dir, su3_matrix* sitelink)
00482 {
00483   int i;
00484   int dx[4];
00485   dx[3]=dx[2]=dx[1]=dx[0]=0;
00486 
00487   if(GOES_FORWARDS(dir)){
00488     dx[dir]=1;
00489     for(i=0; i<V; i++){
00490       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00491       su3_matrix* mat = src+nbr_idx; // No need for a factor of 4 here, the colour matrices do not have a Lorentz index
00492       su3_matrix* link = sitelink + i*4 + dir;
00493       matrix_mult_nn(link, mat, &dest[i]);      
00494     }   
00495   }else{
00496     dx[OPP_DIR(dir)]=-1;
00497     for(i=0; i<V; i++){
00498       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00499       su3_matrix* mat = src+nbr_idx; // No need for a factor of 4 here, the colour matrices do not have a Lorentz index
00500       su3_matrix* link = sitelink + nbr_idx*4 + OPP_DIR(dir);
00501       matrix_mult_an(link, mat, &dest[i]);
00502     }
00503   }
00504   return;
00505 }
00506 
00507 
00508 
00509 
00510 
00511 
00512 
00513 template <typename half_wilson_vector,
00514           typename anti_hermitmat, typename Real>
00515 static void 
00516 add_3f_force_to_mom(half_wilson_vector *back, half_wilson_vector *forw, 
00517                     int dir, Real coeff[2], anti_hermitmat* momentum) 
00518 {
00519     Real my_coeff[2] ;
00520     Real tmp_coeff[2] ;
00521     int mydir;
00522     int i;
00523     
00524     if(GOES_BACKWARDS(dir)){
00525         mydir = OPP_DIR(dir);
00526         my_coeff[0] = -coeff[0]; 
00527         my_coeff[1] = -coeff[1];
00528     }else{ 
00529         mydir = dir; 
00530         my_coeff[0] = coeff[0]; 
00531         my_coeff[1] = coeff[1]; 
00532     }
00533     
00534     for(i=0;i < V;i++){
00535         if (i < Vh){
00536             tmp_coeff[0] = my_coeff[0];
00537             tmp_coeff[1] = my_coeff[1];
00538         }else{
00539             tmp_coeff[0] = -my_coeff[0];
00540             tmp_coeff[1] = -my_coeff[1] ;       
00541         }
00542         
00543         if (sizeof(Real) == sizeof(float)){
00544             fsu3_matrix tmat;
00545             fsu3_matrix mom_matrix;
00546             anti_hermitmat* mom = momentum+ 4* i + mydir;
00547             uncompress_anti_hermitian(mom, &mom_matrix);
00548             su3_projector( &back[i].h[0], &forw[i].h[0], &tmat);
00549             scalar_mult_add_su3_matrix(&mom_matrix, &tmat,  tmp_coeff[0], &mom_matrix );
00550             make_anti_hermitian(&mom_matrix, mom);
00551         }else{
00552             dsu3_matrix tmat;
00553             dsu3_matrix mom_matrix;
00554             anti_hermitmat* mom = momentum+ 4* i + mydir;
00555             uncompress_anti_hermitian(mom, &mom_matrix);
00556             su3_projector( &back[i].h[0], &forw[i].h[0], &tmat);
00557             scalar_mult_add_su3_matrix(&mom_matrix, &tmat,  tmp_coeff[0], &mom_matrix );
00558             make_anti_hermitian(&mom_matrix, mom);
00559         }
00560     }    
00561 }
00562 
00563 
00564 template<typename Real, typename half_wilson_vector, typename anti_hermitmat>
00565   static void 
00566 side_link_3f_force(int mu, int nu, Real coeff[2], half_wilson_vector *Path   , 
00567     half_wilson_vector *Path_nu, half_wilson_vector *Path_mu, 
00568     half_wilson_vector *Path_numu, anti_hermitmat* mom) 
00569 {
00570   Real m_coeff[2] ;
00571 
00572   m_coeff[0] = -coeff[0] ;
00573   m_coeff[1] = -coeff[1] ;
00574 
00575   if(GOES_FORWARDS(mu)){
00576     if(GOES_FORWARDS(nu)){
00577       add_3f_force_to_mom(Path_numu, Path, mu, coeff, mom) ;
00578     }else{
00579       add_3f_force_to_mom(Path,Path_numu,OPP_DIR(mu),m_coeff, mom);
00580     }
00581   }
00582   else{
00583     if(GOES_FORWARDS(nu))
00584       add_3f_force_to_mom(Path_nu, Path_mu, mu, m_coeff, mom);
00585     else
00586       add_3f_force_to_mom(Path_mu, Path_nu, OPP_DIR(mu), coeff, mom) ;
00587   }
00588 }
00589 
00590 
00591 template<typename Real, typename su3_matrix, typename anti_hermitmat>
00592   static void
00593 add_force_to_momentum(su3_matrix *back, su3_matrix *forw,
00594     int dir, Real coeff, anti_hermitmat* momentum)
00595 {
00596   Real my_coeff;
00597   Real tmp_coeff;
00598   int mydir;
00599   int i;
00600 
00601   if(GOES_BACKWARDS(dir)){
00602     mydir = OPP_DIR(dir);
00603     my_coeff = -coeff;
00604   }else{
00605     mydir = dir;
00606     my_coeff = coeff;   
00607   }
00608 
00609 
00610   for(i=0; i<V; i++){
00611     if(i<Vh){ tmp_coeff = my_coeff; }
00612     else{ tmp_coeff = -my_coeff; }
00613 
00614 
00615     su3_matrix tmat;
00616     su3_matrix mom_matrix;
00617     anti_hermitmat* mom = momentum + 4*i + mydir;
00618     uncompress_anti_hermitian(mom, &mom_matrix);
00619     matrix_mult_na(&back[i], &forw[i], &tmat);
00620     scalar_mult_add_su3_matrix(&mom_matrix, &tmat, tmp_coeff, &mom_matrix);  
00621 
00622     make_anti_hermitian(&mom_matrix, mom);      
00623   }
00624   return;
00625 }
00626 
00627 
00628 template<typename Real, typename su3_matrix, typename anti_hermitmat>
00629 static void 
00630 side_link_force(int mu, int nu, Real coeff, su3_matrix *Path,
00631                 su3_matrix *Path_nu, su3_matrix *Path_mu,
00632                 su3_matrix *Path_numu, anti_hermitmat* mom)
00633 {
00634   Real m_coeff;
00635   m_coeff = -coeff;
00636 
00637   if(GOES_FORWARDS(mu)){
00638     if(GOES_FORWARDS(nu)){
00639       add_force_to_momentum(Path_numu, Path, mu, coeff, mom); 
00640       // In the example below:
00641       // add_force_to_momentum(P7rho, Qnumu, rho, coeff, mom)
00642     }else{
00643       add_force_to_momentum(Path, Path_numu, OPP_DIR(mu), m_coeff, mom);        
00644       // add_force_to_momentum(Qnumu, P7rho, -Qnumu, rho, m_coeff, mom)
00645     }   
00646   }
00647   else { // if (GOES_BACKWARDS(mu))
00648     if(GOES_FORWARDS(nu)){
00649       add_force_to_momentum(Path_nu, Path_mu, mu, m_coeff, mom);
00650       // add_force_to_momentum(P7, Qrhonumu, rho, m_coeff, mom) 
00651     }else{
00652       add_force_to_momentum(Path_mu, Path_nu, OPP_DIR(mu), coeff, mom); 
00653       // add_force_to_momentum(Qrhonumu, P7, rho, coeff, mom)
00654     }
00655   }
00656   return; 
00657 }
00658 
00659 
00660 
00661 
00662 #define Pmu          tempmat[0]
00663 #define Pnumu        tempmat[1]
00664 #define Prhonumu     tempmat[2]
00665 #define P7           tempmat[3]
00666 #define P7rho        tempmat[4]
00667 #define P5           tempmat[5]
00668 #define P3           tempmat[6]
00669 #define P5nu         tempmat[3]
00670 #define P3mu         tempmat[3]
00671 #define Popmu        tempmat[4]
00672 #define Pmumumu      tempmat[4]
00673 
00674 #define Qmu          tempmat[7]
00675 #define Qnumu        tempmat[8]
00676 #define Qrhonumu     tempmat[2] // same as Prhonumu
00677 
00678 
00679 
00680 
00681 
00682 template<typename su3_matrix>
00683 static void set_identity_matrix(su3_matrix* mat)
00684 {
00685   for(int i=0; i<3; i++){
00686     for(int j=0; j<3; j++){
00687       mat->e[i][j].real=0;
00688       mat->e[i][j].imag=0;              
00689     }
00690     mat->e[i][i].real=1;                
00691   }
00692 } 
00693 
00694 
00695 template<typename su3_matrix> 
00696 static void set_identity(su3_matrix* matrices, int num_dirs){
00697   for(int i=0; i<V*num_dirs; i++){
00698     set_identity_matrix(&matrices[i]);  
00699   }
00700 }
00701 
00702 
00703 template <typename Real, typename su3_matrix, typename anti_hermitmat>
00704 void do_color_matrix_hisq_force_reference(Real eps, Real weight, 
00705                            su3_matrix* temp_xx, Real* act_path_coeff,
00706                            su3_matrix* sitelink, anti_hermitmat* mom)
00707 {
00708   int i;
00709   int mu, nu, rho, sig;
00710   Real coeff;
00711   Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
00712 //  Real Naik;
00713   Real mLepage, mFiveSt, mThreeSt, mSevenSt;    
00714 
00715   su3_matrix* tempmat[9];
00716   int sites_on_node = V;
00717   //int DirectLinks[8] ;
00718  
00719   Real ferm_epsilon;
00720   ferm_epsilon = 2.0*weight*eps;
00721   OneLink = act_path_coeff[0]*ferm_epsilon ; 
00722 //  Naik    = act_path_coeff[1]*ferm_epsilon ; mNaik    = -Naik;
00723   ThreeSt = act_path_coeff[2]*ferm_epsilon ; mThreeSt = -ThreeSt;
00724   FiveSt  = act_path_coeff[3]*ferm_epsilon ; mFiveSt  = -FiveSt;
00725   SevenSt = act_path_coeff[4]*ferm_epsilon ; mSevenSt = -SevenSt;
00726   Lepage  = act_path_coeff[5]*ferm_epsilon ; mLepage  = -Lepage;
00727 
00728 //  for(mu=0;mu<8;mu++){
00729 //    DirectLinks[mu] = 0 ;
00730 //  }
00731 
00732   for(mu=0; mu<9; mu++){        
00733     tempmat[mu] = (su3_matrix *)malloc( sites_on_node*sizeof(su3_matrix) );  
00734   }
00735 
00736 
00737   su3_matrix* id;
00738   id = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) ); 
00739   //su3_matrix* id4;
00740   //id4 = (su3_matrix *)malloc(sites_on_node*4*sizeof(su3_matrix) ); 
00741 
00742  // su3_matrix* temp_mat;
00743 //  temp_mat = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) ); 
00744 
00745 
00746   //  su3_matrix* temp_xx;
00747   //  temp_xx = (su3_matrix *)malloc(sites_on_node*8*sizeof(su3_matrix) );
00748   //  shiftedOuterProduct(temp_x, temp_xx);
00749 
00750   // initialise id so that it is the identity matrix on each lattice site
00751   set_identity(id,1);
00752 
00753   printf("Calling modified hisq\n");
00754   for(sig=0; sig < 8; sig++){
00755     // One-link term - don't have the savings here that we get when working with the 
00756     // half-wilson vectors
00757     if(GOES_FORWARDS(sig)){
00758       u_shift_mat(&temp_xx[OPP_DIR(sig)*V], Pmu, sig, sitelink);
00759       add_force_to_momentum(Pmu, id, sig, OneLink, mom); // I could optimise functions which 
00760       // involve id
00761     }
00762 
00763     for(mu = 0; mu < 8; mu++){
00764       if ( (mu == sig) || (mu == OPP_DIR(sig))){
00765         continue;
00766       }
00767       //     3 link path 
00768       //         sig
00769       //    A  _______
00770       //      |       |
00771       //  mu /|\     \|/
00772       //      |       |
00773       //
00774       //
00775       u_shift_mat(&temp_xx[OPP_DIR(sig)*V], Pmu, OPP_DIR(mu), sitelink); // temp_xx[sig] stores |X(x)><X(x-sig)|
00776       u_shift_mat(id, Qmu, OPP_DIR(mu), sitelink); // This returns the path less the outer-product of quark fields at the end 
00777                                                    // Qmu = U[mu]       
00778 
00779 
00780       u_shift_mat(Pmu, P3, sig, sitelink); // P3 is U[sig](X)U[-mu](X+sig) temp_xx
00781       if (GOES_FORWARDS(sig)){
00782         // add contribution from middle link
00783         add_force_to_momentum(P3, Qmu, sig, mThreeSt, mom); 
00784         // matrix_mult_na(P3[x],Qmu[x],tmp);
00785         // mom[sig][x] += mThreeSt*tmp; 
00786       }
00787       for(nu=0; nu < 8; nu++){
00788         if (nu == sig || nu == OPP_DIR(sig)
00789             || nu == mu || nu == OPP_DIR(mu)){
00790           continue;
00791         }
00792 
00793         // 5 link path
00794         //
00795         //        sig
00796         //    A ________
00797         //     |        |
00798         //    /|\      \|/
00799         //     |        |
00800         //      \        \
00801         //       \        \
00802 
00803         u_shift_mat(Pmu, Pnumu, OPP_DIR(nu), sitelink);
00804         u_shift_mat(Qmu, Qnumu, OPP_DIR(nu), sitelink);
00805 
00806         u_shift_mat(Pnumu, P5, sig, sitelink);
00807         if (GOES_FORWARDS(sig)){
00808           add_force_to_momentum(P5, Qnumu, sig, FiveSt, mom);
00809         } // seems to do what I think it should
00810         for(rho =0; rho < 8; rho++){
00811           if (rho == sig || rho == OPP_DIR(sig)
00812               || rho == mu || rho == OPP_DIR(mu)
00813               || rho == nu || rho == OPP_DIR(nu)){
00814             continue;
00815           }
00816 
00817           //7 link path
00818           u_shift_mat(Pnumu, Prhonumu, OPP_DIR(rho), sitelink);
00819           u_shift_mat(Prhonumu, P7, sig, sitelink);
00820 
00821           // Prhonumu = tempmat[2] is not needed again
00822           // => store Qrhonumu in the same memory
00823           u_shift_mat(Qnumu, Qrhonumu, OPP_DIR(rho), sitelink);
00824 
00825 
00826           if(GOES_FORWARDS(sig)){
00827             add_force_to_momentum(P7, Qrhonumu, sig, mSevenSt, mom) ;   
00828           }
00829              u_shift_mat(P7, P7rho, rho, sitelink);
00830              side_link_force(rho, sig, SevenSt, Qnumu, P7, Qrhonumu, P7rho, mom);                   
00831              if(FiveSt != 0)coeff = SevenSt/FiveSt ; else coeff = 0;
00832              for(i=0; i<V; i++){
00833              scalar_mult_add_su3_matrix(&P5[i], &P7rho[i], coeff, &P5[i]);
00834              } // end loop over volume
00835         } // end loop over rho  
00836 
00837 
00838            u_shift_mat(P5, P5nu, nu, sitelink); 
00839            side_link_force(nu,sig,mFiveSt,Qmu,P5,Qnumu,P5nu,mom); // I believe this should do what I want it to
00840         // check this!
00841         if(ThreeSt != 0)coeff   = FiveSt/ThreeSt; else coeff = 0;
00842 
00843         for(i=0; i<V; i++){
00844         scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
00845         } // end loop over volume
00846       } // end loop over nu
00847 
00848       // Lepage term 
00849       u_shift_mat(Pmu, Pnumu, OPP_DIR(mu), sitelink);
00850       u_shift_mat(Qmu, Qnumu, OPP_DIR(mu), sitelink);
00851 
00852       u_shift_mat(Pnumu, P5, sig, sitelink);
00853       if(GOES_FORWARDS(sig)){
00854       add_force_to_momentum(P5, Qnumu, sig, Lepage, mom); 
00855       }
00856 
00857       u_shift_mat(P5, P5nu, mu, sitelink);
00858       side_link_force(mu, sig, mLepage, Qmu, P5, Qnumu, P5nu, mom);
00859 
00860 
00861       if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
00862 
00863       for(i=0; i<V; i++){
00864       scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
00865       }
00866 
00867       if(GOES_FORWARDS(mu)){
00868       u_shift_mat(P3, P3mu, mu, sitelink);
00869       }
00870       side_link_force(mu, sig, ThreeSt, id, P3, Qmu, P3mu, mom);
00871     } // end loop over mu
00872   } // end loop over sig
00873 } // modified hisq_force_reference
00874 
00875 
00876 
00877 
00878 
00879 
00880 // This version of the test routine uses 
00881 // half-wilson vectors instead of color matrices.
00882 template <typename Real, typename su3_matrix, typename anti_hermitmat, typename half_wilson_vector>
00883 void do_halfwilson_hisq_force_reference(Real eps, Real weight, 
00884                            half_wilson_vector* temp_x, Real* act_path_coeff,
00885                            su3_matrix* sitelink, anti_hermitmat* mom)
00886 {
00887   int i;
00888   int mu, nu, rho, sig;
00889   Real coeff;
00890   Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
00891   //Real Naik;
00892   Real mLepage, mFiveSt, mThreeSt, mSevenSt;    
00893 
00894   su3_matrix* tempmat[9];
00895   int sites_on_node = V;
00896 //  int DirectLinks[8] ;
00897  
00898   Real ferm_epsilon;
00899   ferm_epsilon = 2.0*weight*eps;
00900   OneLink = act_path_coeff[0]*ferm_epsilon ; 
00901 //  Naik    = act_path_coeff[1]*ferm_epsilon ; 
00902   ThreeSt = act_path_coeff[2]*ferm_epsilon ; mThreeSt = -ThreeSt;
00903   FiveSt  = act_path_coeff[3]*ferm_epsilon ; mFiveSt  = -FiveSt;
00904   SevenSt = act_path_coeff[4]*ferm_epsilon ; mSevenSt = -SevenSt;
00905   Lepage  = act_path_coeff[5]*ferm_epsilon ; mLepage  = -Lepage;
00906    
00907  
00908 //  for(mu=0;mu<8;mu++){
00909 //    DirectLinks[mu] = 0 ;
00910 //  }
00911 
00912   for(mu=0; mu<9; mu++){        
00913     tempmat[mu] = (su3_matrix *)malloc( sites_on_node*sizeof(su3_matrix) );  
00914   }
00915 
00916 
00917   su3_matrix* id;
00918   id = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) ); 
00919   su3_matrix* id4;
00920   id4 = (su3_matrix *)malloc(sites_on_node*4*sizeof(su3_matrix) ); 
00921 
00922   su3_matrix* temp_mat;
00923   temp_mat = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) ); 
00924 
00925   // initialise id so that it is the identity matrix on each lattice site
00926   set_identity(id,1);
00927 
00928   printf("Calling hisq reference routine\n");
00929   for(sig=0; sig < 8; sig++){
00930     shifted_outer_prod(temp_x, temp_mat, OPP_DIR(sig));
00931 
00932     // One-link term - don't have the savings here that we get when working with the 
00933     // half-wilson vectors
00934     if(GOES_FORWARDS(sig)){
00935       u_shift_mat(temp_mat, Pmu, sig, sitelink);
00936        add_force_to_momentum(Pmu, id, sig, OneLink, mom); 
00937     }
00938 
00939     for(mu = 0; mu < 8; mu++){
00940       if ( (mu == sig) || (mu == OPP_DIR(sig))){
00941                 continue;
00942       }
00943       // 3 link path 
00944       //         sig
00945       //    A  _______
00946       //      |       |
00947       //  mu /|\     \|/
00948       //      |       |
00949       //
00950       //
00951       u_shift_mat(temp_mat, Pmu, OPP_DIR(mu), sitelink); // temp_xx[sig] stores |X(x)><X(x-sig)|
00952       u_shift_mat(id, Qmu, OPP_DIR(mu), sitelink);       // This returns the path less the outer-product of quark fields at the end 
00953                                                          // Qmu = U[mu] 
00954         
00955 
00956       u_shift_mat(Pmu, P3, sig, sitelink); // P3 is U[sig](X)U[-mu](X+sig) temp_xx
00957       if (GOES_FORWARDS(sig)){
00958           // add contribution from middle link
00959         add_force_to_momentum(P3, Qmu, sig, mThreeSt, mom); // matrix_mult_na(P3[x],Qmu[x],tmp);
00960                                                             // mom[sig][x] += mThreeSt*tmp; 
00961       }
00962       for(nu=0; nu < 8; nu++){
00963         if (nu == sig || nu == OPP_DIR(sig)
00964          || nu == mu || nu == OPP_DIR(mu)){
00965          continue;
00966         }
00967         
00968         // 5 link path
00969         //
00970         //        sig
00971         //    A ________
00972         //     |        |
00973         //    /|\      \|/
00974         //     |        |
00975         //      \        \
00976         //       \        \
00977 
00978         u_shift_mat(Pmu, Pnumu, OPP_DIR(nu), sitelink);
00979         u_shift_mat(Qmu, Qnumu, OPP_DIR(nu), sitelink);
00980 
00981         u_shift_mat(Pnumu, P5, sig, sitelink);
00982         if (GOES_FORWARDS(sig)){
00983           add_force_to_momentum(P5, Qnumu, sig, FiveSt, mom);
00984         } // seems to do what I think it should
00985 
00986         for(rho =0; rho < 8; rho++){
00987           if (rho == sig || rho == OPP_DIR(sig)
00988             || rho == mu || rho == OPP_DIR(mu)
00989             || rho == nu || rho == OPP_DIR(nu)){
00990                continue;
00991            }
00992                     
00993            //7 link path
00994           u_shift_mat(Pnumu, Prhonumu, OPP_DIR(rho), sitelink);
00995           u_shift_mat(Prhonumu, P7, sig, sitelink);
00996           
00997           // Prhonumu = tempmat[2] is not needed again
00998           // => store Qrhonumu in the same memory
00999           u_shift_mat(Qnumu, Qrhonumu, OPP_DIR(rho), sitelink);
01000         
01001 
01002           if(GOES_FORWARDS(sig)){
01003             add_force_to_momentum(P7, Qrhonumu, sig, mSevenSt, mom) ;   
01004           }
01005 
01006           u_shift_mat(P7, P7rho, rho, sitelink);
01007           side_link_force(rho, sig, SevenSt, Qnumu, P7, Qrhonumu, P7rho, mom);              
01008           if(FiveSt != 0)coeff = SevenSt/FiveSt ; else coeff = 0;
01009           for(i=0; i<V; i++){
01010             scalar_mult_add_su3_matrix(&P5[i], &P7rho[i], coeff, &P5[i]);
01011           } // end loop over volume
01012         } // end loop over rho  
01013 
01014 
01015         u_shift_mat(P5, P5nu, nu, sitelink);    
01016         side_link_force(nu,sig,mFiveSt,Qmu,P5,Qnumu,P5nu,mom); // I believe this should do what I want it to
01017                                                                // check this!
01018         if(ThreeSt != 0)coeff   = FiveSt/ThreeSt; else coeff = 0;
01019         
01020         for(i=0; i<V; i++){
01021           scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
01022         } // end loop over volume
01023       } // end loop over nu
01024 
01025       // Lepage term 
01026       u_shift_mat(Pmu, Pnumu, OPP_DIR(mu), sitelink);
01027       u_shift_mat(Qmu, Qnumu, OPP_DIR(mu), sitelink);
01028         
01029       u_shift_mat(Pnumu, P5, sig, sitelink);
01030       if(GOES_FORWARDS(sig)){
01031         add_force_to_momentum(P5, Qnumu, sig, Lepage, mom); 
01032       }
01033 
01034       u_shift_mat(P5, P5nu, mu, sitelink);
01035       side_link_force(mu, sig, mLepage, Qmu, P5, Qnumu, P5nu, mom);
01036 
01037 
01038       if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
01039 
01040       for(i=0; i<V; i++){
01041         scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
01042       }
01043    
01044       if(GOES_FORWARDS(mu)){
01045         u_shift_mat(P3, P3mu, mu, sitelink);
01046       }
01047       side_link_force(mu, sig, ThreeSt, id, P3, Qmu, P3mu, mom);
01048     } // end loop over mu
01049   } // end loop over sig
01050 
01051   for(mu=0; mu<9; mu++){        
01052     free(tempmat[mu]);
01053   }
01054   
01055   free(id);
01056   free(id4);
01057   free(temp_mat);
01058 }
01059 
01060 #undef Pmu
01061 #undef Pnumu
01062 #undef Prhonumu
01063 #undef P3
01064 #undef P3mu
01065 #undef P5
01066 #undef P5nu
01067 #undef P7
01068 #undef P7rho
01069 #undef P7rhonu
01070 
01071 #undef Popmu
01072 #undef Pmumumu
01073 
01074 #undef Qmu
01075 #undef Qnumu
01076 #undef Qrhonumu
01077 
01078 
01079 
01080 static
01081 void set_identity(fsu3_matrix *sitelink){
01082   int tot = V*4;
01083   for(int i=0; i<tot; i++){ // loop over sites and directions
01084    for(int a=0; a<3; a++){
01085      for(int b=0; b<3; b++){
01086         sitelink[i].e[a][b].real = sitelink[i].e[a][b].imag = 0.;
01087      }
01088    }
01089    for(int a=0; a<3; a++){ // set the diagonal elements to unity
01090      sitelink[i].e[a][a].real = 1.;
01091    }
01092   }
01093   return;
01094 }
01095 
01096 
01097 
01098 
01099 void halfwilson_hisq_force_reference(float eps, float weight, 
01100                           void* act_path_coeff, void* temp_x,
01101                           void* sitelink, void* mom)
01102 {
01103  do_halfwilson_hisq_force_reference((float)eps, (float)weight,
01104                           (fhalf_wilson_vector*) temp_x, (float*)act_path_coeff,
01105                           (fsu3_matrix*)sitelink, (fanti_hermitmat*)mom);
01106  return;
01107 }
01108 
01109 
01110 
01111 void halfwilson_hisq_force_reference(double eps, double weight, 
01112                           void* act_path_coeff, void* temp_x,
01113                           void* sitelink, void* mom)
01114 {
01115  do_halfwilson_hisq_force_reference((double)eps, (double)weight,
01116                           (dhalf_wilson_vector*) temp_x, (double*)act_path_coeff,
01117                           (dsu3_matrix*)sitelink, (danti_hermitmat*)mom);
01118  return;
01119 }
01120 
01121 
01122 
01123 void color_matrix_hisq_force_reference(float eps, float weight, 
01124                           void* act_path_coeff, void* temp_xx,
01125                           void* sitelink, void* mom)
01126 {
01127  do_color_matrix_hisq_force_reference((float)eps, (float)weight,
01128                           (fsu3_matrix*) temp_xx, (float*)act_path_coeff,
01129                           (fsu3_matrix*)sitelink, (fanti_hermitmat*)mom);
01130   return;
01131 }