|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 00005 #include <quda.h> 00006 #include <test_util.h> 00007 #include "llfat_reference.h" 00008 #include "misc.h" 00009 #include <string.h> 00010 00011 #define XUP 0 00012 #define YUP 1 00013 #define ZUP 2 00014 #define TUP 3 00015 00016 typedef struct { 00017 float real; 00018 float imag; 00019 } fcomplex; 00020 00021 /* specific for double complex */ 00022 typedef struct { 00023 double real; 00024 double imag; 00025 } dcomplex; 00026 00027 typedef struct { fcomplex e[3][3]; } fsu3_matrix; 00028 typedef struct { fcomplex c[3]; } fsu3_vector; 00029 typedef struct { dcomplex e[3][3]; } dsu3_matrix; 00030 typedef struct { dcomplex c[3]; } dsu3_vector; 00031 00032 00033 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \ 00034 (c).imag = (a).imag + (b).imag; } 00035 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ 00036 (c).imag = (a).real*(b).imag + (a).imag*(b).real; } 00037 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } 00038 00039 /* c = a* * b */ 00040 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00041 (c).imag = (a).real*(b).imag - (a).imag*(b).real; } 00042 00043 /* c = a * b* */ 00044 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00045 (c).imag = (a).imag*(b).real - (a).real*(b).imag; } 00046 00047 extern int Z[4]; 00048 extern int V; 00049 extern int Vh; 00050 00051 template<typename su3_matrix, typename Real> 00052 void 00053 llfat_scalar_mult_su3_matrix( su3_matrix *a, Real s, su3_matrix *b ) 00054 { 00055 00056 int i,j; 00057 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00058 b->e[i][j].real = s*a->e[i][j].real; 00059 b->e[i][j].imag = s*a->e[i][j].imag; 00060 } 00061 00062 return; 00063 } 00064 00065 template<typename su3_matrix, typename Real> 00066 void 00067 llfat_scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Real s, su3_matrix *c) 00068 { 00069 int i,j; 00070 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00071 c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; 00072 c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; 00073 } 00074 00075 } 00076 00077 template <typename su3_matrix> 00078 void 00079 llfat_mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00080 { 00081 int i,j,k; 00082 typeof(a->e[0][0]) x,y; 00083 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00084 x.real=x.imag=0.0; 00085 for(k=0;k<3;k++){ 00086 CMUL_J( a->e[i][k] , b->e[j][k] , y ); 00087 CSUM( x , y ); 00088 } 00089 c->e[i][j] = x; 00090 } 00091 } 00092 00093 template <typename su3_matrix> 00094 void 00095 llfat_mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00096 { 00097 int i,j,k; 00098 typeof(a->e[0][0]) x,y; 00099 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00100 x.real=x.imag=0.0; 00101 for(k=0;k<3;k++){ 00102 CMUL( a->e[i][k] , b->e[k][j] , y ); 00103 CSUM( x , y ); 00104 } 00105 c->e[i][j] = x; 00106 } 00107 } 00108 00109 template<typename su3_matrix> 00110 void 00111 llfat_mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00112 { 00113 int i,j,k; 00114 typeof(a->e[0][0]) x,y; 00115 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00116 x.real=x.imag=0.0; 00117 for(k=0;k<3;k++){ 00118 CMULJ_( a->e[k][i] , b->e[k][j], y ); 00119 CSUM( x , y ); 00120 } 00121 c->e[i][j] = x; 00122 } 00123 } 00124 00125 00126 00127 00128 00129 template<typename su3_matrix> 00130 void 00131 llfat_add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00132 { 00133 int i,j; 00134 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00135 CADD( a->e[i][j], b->e[i][j], c->e[i][j] ); 00136 } 00137 } 00138 00139 00140 00141 template<typename su3_matrix, typename Real> 00142 void 00143 llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, 00144 su3_matrix* mulink, su3_matrix* sitelink, su3_matrix* fatlink, Real coef, 00145 int use_staple) 00146 { 00147 su3_matrix tmat1,tmat2; 00148 int i ; 00149 su3_matrix *fat1; 00150 00151 /* Upper staple */ 00152 /* Computes the staple : 00153 * mu (B) 00154 * +-------+ 00155 * nu | | 00156 * (A) | |(C) 00157 * X X 00158 * 00159 * Where the mu link can be any su3_matrix. The result is saved in staple. 00160 * if staple==NULL then the result is not saved. 00161 * It also adds the computed staple to the fatlink[mu] with weight coef. 00162 */ 00163 00164 int dx[4]; 00165 00166 /* upper staple */ 00167 00168 for(i=0;i < V;i++){ 00169 00170 fat1 = fatlink + 4*i + mu; 00171 su3_matrix* A = sitelink + 4*i + nu; 00172 00173 memset(dx, 0, sizeof(dx)); 00174 dx[nu] =1; 00175 int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00176 su3_matrix* B; 00177 if (use_staple){ 00178 B = mulink + nbr_idx; 00179 }else{ 00180 B = mulink + 4*nbr_idx + mu; 00181 } 00182 00183 memset(dx, 0, sizeof(dx)); 00184 dx[mu] =1; 00185 nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]); 00186 su3_matrix* C = sitelink + 4*nbr_idx + nu; 00187 00188 llfat_mult_su3_nn( A, B,&tmat1); 00189 00190 if(staple!=NULL){/* Save the staple */ 00191 llfat_mult_su3_na( &tmat1, C, &staple[i]); 00192 } else{ /* No need to save the staple. Add it to the fatlinks */ 00193 llfat_mult_su3_na( &tmat1, C, &tmat2); 00194 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00195 } 00196 } 00197 /***************lower staple**************** 00198 * 00199 * X X 00200 * nu | | 00201 * (A) | |(C) 00202 * +-------+ 00203 * mu (B) 00204 * 00205 *********************************************/ 00206 00207 for(i=0;i < V;i++){ 00208 00209 fat1 = fatlink + 4*i + mu; 00210 memset(dx, 0, sizeof(dx)); 00211 dx[nu] = -1; 00212 int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00213 if (nbr_idx >= V || nbr_idx <0){ 00214 fprintf(stderr, "ERROR: invliad nbr_idx(%d), line=%d\n", nbr_idx, __LINE__); 00215 exit(1); 00216 } 00217 su3_matrix* A = sitelink + 4*nbr_idx + nu; 00218 00219 su3_matrix* B; 00220 if (use_staple){ 00221 B = mulink + nbr_idx; 00222 }else{ 00223 B = mulink + 4*nbr_idx + mu; 00224 } 00225 00226 memset(dx, 0, sizeof(dx)); 00227 dx[mu] = 1; 00228 nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]); 00229 su3_matrix* C = sitelink + 4*nbr_idx + nu; 00230 00231 llfat_mult_su3_an( A, B,&tmat1); 00232 llfat_mult_su3_nn( &tmat1, C,&tmat2); 00233 00234 if(staple!=NULL){/* Save the staple */ 00235 llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]); 00236 llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1); 00237 00238 } else{ /* No need to save the staple. Add it to the fatlinks */ 00239 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00240 } 00241 } 00242 00243 } /* compute_gen_staple_site */ 00244 00245 00246 00247 /* Optimized fattening code for the Asq and Asqtad actions. 00248 * I assume that: 00249 * path 0 is the one link 00250 * path 2 the 3-staple 00251 * path 3 the 5-staple 00252 * path 4 the 7-staple 00253 * path 5 the Lapage term. 00254 * Path 1 is the Naik term 00255 * 00256 */ 00257 template <typename su3_matrix, typename Float> 00258 void llfat_cpu(su3_matrix* fatlink, su3_matrix* sitelink, Float* act_path_coeff) 00259 { 00260 00261 su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00262 if(staple == NULL){ 00263 fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__); 00264 exit(1); 00265 } 00266 00267 su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00268 if(tempmat1 == NULL){ 00269 fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__); 00270 exit(1); 00271 } 00272 00273 /* to fix up the Lepage term, included by a trick below */ 00274 Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]); 00275 00276 00277 for (int dir=XUP; dir<=TUP; dir++){ 00278 00279 /* Intialize fat links with c_1*U_\mu(x) */ 00280 for(int i=0;i < V;i ++){ 00281 su3_matrix* fat1 = fatlink + 4*i + dir; 00282 llfat_scalar_mult_su3_matrix(sitelink+ 4*i + dir, one_link, fat1 ); 00283 } 00284 } 00285 00286 for (int dir=XUP; dir<=TUP; dir++){ 00287 for(int nu=XUP; nu<=TUP; nu++){ 00288 if(nu!=dir){ 00289 llfat_compute_gen_staple_field(staple,dir,nu,sitelink, sitelink,fatlink, act_path_coeff[2], 0); 00290 00291 /* The Lepage term */ 00292 /* Note this also involves modifying c_1 (above) */ 00293 00294 00295 llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1); 00296 00297 for(int rho=XUP; rho<=TUP; rho++) { 00298 if((rho!=dir)&&(rho!=nu)){ 00299 llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1); 00300 00301 for(int sig=XUP; sig<=TUP; sig++){ 00302 if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){ 00303 llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1); 00304 } 00305 }/* sig */ 00306 00307 } 00308 00309 }/* rho */ 00310 00311 00312 } 00313 00314 }/* nu */ 00315 00316 }/* dir */ 00317 00318 free(staple); 00319 free(tempmat1); 00320 00321 } 00322 00323 00324 void 00325 llfat_reference(void* fatlink, void* sitelink, QudaPrecision prec, void* act_path_coeff) 00326 { 00327 switch(prec){ 00328 case QUDA_DOUBLE_PRECISION:{ 00329 llfat_cpu((dsu3_matrix*)fatlink, (dsu3_matrix*)sitelink, (double*) act_path_coeff); 00330 break; 00331 } 00332 case QUDA_SINGLE_PRECISION:{ 00333 llfat_cpu((fsu3_matrix*)fatlink, (fsu3_matrix*)sitelink, (float*) act_path_coeff); 00334 break; 00335 } 00336 default: 00337 fprintf(stderr, "ERROR: unsupported precision\n"); 00338 exit(1); 00339 break; 00340 00341 } 00342 00343 return; 00344 00345 }
1.7.3