/*
 * QUDA v0.3.2
 * A library for QCD on GPUs
 */
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 #include <string.h> 00005 00006 #include "quda.h" 00007 #include "test_util.h" 00008 #include "misc.h" 00009 #include "gauge_force_reference.h" 00010 00011 extern int Z[4]; 00012 extern int V; 00013 extern int Vh; 00014 00015 00016 00017 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \ 00018 (c).imag = (a).imag + (b).imag; } 00019 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ 00020 (c).imag = (a).real*(b).imag + (a).imag*(b).real; } 00021 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } 00022 00023 /* c = a* * b */ 00024 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00025 (c).imag = (a).real*(b).imag - (a).imag*(b).real; } 00026 00027 /* c = a * b* */ 00028 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00029 (c).imag = (a).imag*(b).real - (a).real*(b).imag; } 00030 00031 #define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } 00032 00033 typedef struct { 00034 float real; 00035 float imag; 00036 } fcomplex; 00037 00038 /* specific for double complex */ 00039 typedef struct { 00040 double real; 00041 double imag; 00042 } dcomplex; 00043 00044 typedef struct { fcomplex e[3][3]; } fsu3_matrix; 00045 typedef struct { fcomplex c[3]; } fsu3_vector; 00046 typedef struct { dcomplex e[3][3]; } dsu3_matrix; 00047 typedef struct { dcomplex c[3]; } dsu3_vector; 00048 00049 typedef struct { 00050 fcomplex m01,m02,m12; 00051 float m00im,m11im,m22im; 00052 float space; 00053 } fanti_hermitmat; 00054 00055 typedef struct { 00056 dcomplex m01,m02,m12; 00057 double m00im,m11im,m22im; 00058 double space; 00059 } danti_hermitmat; 00060 00061 extern int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1); 00062 00063 template<typename su3_matrix> 00064 void su3_adjoint( su3_matrix *a, su3_matrix *b ) 00065 { 00066 int i,j; 00067 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00068 
CONJG( a->e[j][i], b->e[i][j] ); 00069 } 00070 } 00071 00072 00073 template<typename su3_matrix, typename anti_hermitmat> 00074 void 00075 make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) 00076 { 00077 00078 typeof(ah3->m00im) temp = 00079 (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333333333333; 00080 ah3->m00im = m3->e[0][0].imag - temp; 00081 ah3->m11im = m3->e[1][1].imag - temp; 00082 ah3->m22im = m3->e[2][2].imag - temp; 00083 ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5; 00084 ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5; 00085 ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5; 00086 ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5; 00087 ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5; 00088 ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5; 00089 00090 } 00091 00092 template <typename anti_hermitmat, typename su3_matrix> 00093 static void 00094 uncompress_anti_hermitian(anti_hermitmat *mat_antihermit, 00095 su3_matrix *mat_su3 ) 00096 { 00097 typeof(mat_antihermit->m00im) temp1; 00098 mat_su3->e[0][0].imag=mat_antihermit->m00im; 00099 mat_su3->e[0][0].real=0.; 00100 mat_su3->e[1][1].imag=mat_antihermit->m11im; 00101 mat_su3->e[1][1].real=0.; 00102 mat_su3->e[2][2].imag=mat_antihermit->m22im; 00103 mat_su3->e[2][2].real=0.; 00104 mat_su3->e[0][1].imag=mat_antihermit->m01.imag; 00105 temp1=mat_antihermit->m01.real; 00106 mat_su3->e[0][1].real=temp1; 00107 mat_su3->e[1][0].real= -temp1; 00108 mat_su3->e[1][0].imag=mat_antihermit->m01.imag; 00109 mat_su3->e[0][2].imag=mat_antihermit->m02.imag; 00110 temp1=mat_antihermit->m02.real; 00111 mat_su3->e[0][2].real=temp1; 00112 mat_su3->e[2][0].real= -temp1; 00113 mat_su3->e[2][0].imag=mat_antihermit->m02.imag; 00114 mat_su3->e[1][2].imag=mat_antihermit->m12.imag; 00115 temp1=mat_antihermit->m12.real; 00116 mat_su3->e[1][2].real=temp1; 00117 mat_su3->e[2][1].real= -temp1; 00118 mat_su3->e[2][1].imag=mat_antihermit->m12.imag; 
00119 } 00120 00121 template <typename su3_matrix, typename Float> 00122 static void 00123 scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00124 { 00125 int i,j; 00126 for(i=0;i<3;i++){ 00127 for(j=0;j<3;j++){ 00128 c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real; 00129 c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag; 00130 } 00131 } 00132 } 00133 00134 template <typename su3_matrix, typename Float> 00135 static void 00136 scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00137 { 00138 int i,j; 00139 for(i=0;i<3;i++){ 00140 for(j=0;j<3;j++){ 00141 c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; 00142 c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; 00143 } 00144 } 00145 } 00146 00147 template <typename su3_matrix> 00148 static void 00149 mult_su3_nn(su3_matrix* a, su3_matrix* b, su3_matrix* c) 00150 { 00151 int i,j,k; 00152 typeof(a->e[0][0]) x,y; 00153 for(i=0;i<3;i++){ 00154 for(j=0;j<3;j++){ 00155 x.real=x.imag=0.0; 00156 for(k=0;k<3;k++){ 00157 CMUL( a->e[i][k] , b->e[k][j] , y ); 00158 CSUM( x , y ); 00159 } 00160 c->e[i][j] = x; 00161 } 00162 } 00163 } 00164 template<typename su3_matrix> 00165 static void 00166 mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00167 { 00168 int i,j,k; 00169 typeof(a->e[0][0]) x,y; 00170 for(i=0;i<3;i++){ 00171 for(j=0;j<3;j++){ 00172 x.real=x.imag=0.0; 00173 for(k=0;k<3;k++){ 00174 CMULJ_( a->e[k][i] , b->e[k][j], y ); 00175 CSUM( x , y ); 00176 } 00177 c->e[i][j] = x; 00178 } 00179 } 00180 } 00181 00182 template<typename su3_matrix> 00183 static void 00184 mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00185 { 00186 int i,j,k; 00187 typeof(a->e[0][0]) x,y; 00188 for(i=0;i<3;i++){ 00189 for(j=0;j<3;j++){ 00190 x.real=x.imag=0.0; 00191 for(k=0;k<3;k++){ 00192 CMUL_J( a->e[i][k] , b->e[j][k] , y ); 00193 CSUM( x , y ); 00194 } 00195 c->e[i][j] = x; 00196 } 00197 } 00198 } 00199 00200 template < typename su3_matrix> 00201 
void 00202 print_su3_matrix(su3_matrix *a) 00203 { 00204 int i, j; 00205 for(i=0;i < 3; i++){ 00206 for(j=0;j < 3;j++){ 00207 printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag); 00208 } 00209 printf("\n"); 00210 } 00211 00212 } 00213 00214 00215 //this functon compute one path for all lattice sites 00216 template<typename su3_matrix, typename Float> 00217 static void 00218 compute_path_product(su3_matrix* staple, su3_matrix* sitelink, int* path, int len, Float loop_coeff, int dir) 00219 { 00220 int i, j; 00221 00222 su3_matrix prev_matrix, curr_matrix, tmat; 00223 int dx[4]; 00224 00225 for(i=0;i<V;i++){ 00226 memset(dx,0, sizeof(dx)); 00227 memset(&curr_matrix, 0, sizeof(curr_matrix)); 00228 00229 curr_matrix.e[0][0].real = 1.0; 00230 curr_matrix.e[1][1].real = 1.0; 00231 curr_matrix.e[2][2].real = 1.0; 00232 00233 dx[dir] =1; 00234 for(j=0; j < len;j++){ 00235 int lnkdir; 00236 00237 prev_matrix = curr_matrix; 00238 if (GOES_FORWARDS(path[j])){ 00239 //dx[path[j]] +=1; 00240 lnkdir = path[j]; 00241 }else{ 00242 dx[OPP_DIR(path[j])] -=1; 00243 lnkdir=OPP_DIR(path[j]); 00244 } 00245 00246 int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00247 00248 su3_matrix* lnk = sitelink + nbr_idx*4 + lnkdir; 00249 if (GOES_FORWARDS(path[j])){ 00250 mult_su3_nn(&prev_matrix, lnk, &curr_matrix); 00251 }else{ 00252 mult_su3_na(&prev_matrix, lnk, &curr_matrix); 00253 } 00254 00255 if (GOES_FORWARDS(path[j])){ 00256 dx[path[j]] +=1; 00257 }else{ 00258 //dx[OPP_DIR(path[j])] -=1; 00259 } 00260 00261 00262 }//j 00263 00264 su3_adjoint(&curr_matrix, &tmat ); 00265 00266 scalar_mult_add_su3_matrix(staple + i , &tmat, loop_coeff, staple+i); 00267 00268 }//i 00269 00270 return; 00271 } 00272 00273 00274 template <typename su3_matrix, typename anti_hermitmat, typename Float> 00275 static void 00276 update_mom(anti_hermitmat* momentum, int dir, su3_matrix* sitelink, 00277 su3_matrix* staple, Float eb3) 00278 { 00279 int i; 00280 for(i=0;i <V; i++){ 00281 su3_matrix 
tmat1; 00282 su3_matrix tmat2; 00283 su3_matrix tmat3; 00284 00285 su3_matrix* lnk = sitelink + 4*i+dir; 00286 su3_matrix* stp = staple + i; 00287 anti_hermitmat* mom = momentum + 4*i+dir; 00288 00289 mult_su3_na(lnk, stp, &tmat1); 00290 uncompress_anti_hermitian(mom, &tmat2); 00291 00292 scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3); 00293 make_anti_hermitian(&tmat3, mom); 00294 00295 } 00296 00297 } 00298 00299 00300 00301 /* This function only computes one direction @dir 00302 * 00303 */ 00304 00305 void 00306 gauge_force_reference(void* refMom, int dir, double eb3, void* sitelink, QudaPrecision prec, 00307 int **path_dir, int* length, void* loop_coeff, int num_paths) 00308 { 00309 int i; 00310 00311 void* staple; 00312 int gSize; 00313 if (prec == QUDA_DOUBLE_PRECISION){ 00314 gSize= sizeof(double); 00315 }else{ 00316 gSize= sizeof(float); 00317 } 00318 00319 staple = malloc(V* gaugeSiteSize* gSize); 00320 if (staple == NULL){ 00321 fprintf(stderr, "ERROR: malloc failed for staple in functon %s\n", __FUNCTION__); 00322 exit(1); 00323 } 00324 00325 memset(staple, 0, V*gaugeSiteSize* gSize); 00326 00327 for(i=0;i < num_paths; i++){ 00328 if (prec == QUDA_DOUBLE_PRECISION){ 00329 double* my_loop_coeff = (double*)loop_coeff; 00330 compute_path_product((dsu3_matrix*)staple, (dsu3_matrix*)sitelink, path_dir[i], length[i], my_loop_coeff[i], dir); 00331 }else{ 00332 float* my_loop_coeff = (float*)loop_coeff; 00333 compute_path_product((fsu3_matrix*)staple, (fsu3_matrix*)sitelink, path_dir[i], length[i], my_loop_coeff[i], dir); 00334 } 00335 } 00336 00337 if (prec == QUDA_DOUBLE_PRECISION){ 00338 update_mom((danti_hermitmat*) refMom, dir, (dsu3_matrix*)sitelink, (dsu3_matrix*)staple, (double)eb3); 00339 }else{ 00340 update_mom((fanti_hermitmat*)refMom, dir, (fsu3_matrix*)sitelink, (fsu3_matrix*)staple, (float)eb3); 00341 } 00342 00343 } 00344 00345
1.7.3