/*
 * QUDA v0.4.0
 * A library for QCD on GPUs
 */
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 #include <string.h> 00005 00006 #include "quda.h" 00007 #include "test_util.h" 00008 #include "misc.h" 00009 #include "gauge_force_reference.h" 00010 00011 extern int Z[4]; 00012 extern int V; 00013 extern int Vh; 00014 extern int Vh_ex; 00015 extern int E[4]; 00016 00017 00018 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \ 00019 (c).imag = (a).imag + (b).imag; } 00020 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ 00021 (c).imag = (a).real*(b).imag + (a).imag*(b).real; } 00022 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } 00023 00024 /* c = a* * b */ 00025 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00026 (c).imag = (a).real*(b).imag - (a).imag*(b).real; } 00027 00028 /* c = a * b* */ 00029 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00030 (c).imag = (a).imag*(b).real - (a).real*(b).imag; } 00031 00032 #define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } 00033 00034 typedef struct { 00035 float real; 00036 float imag; 00037 } fcomplex; 00038 00039 /* specific for double complex */ 00040 typedef struct { 00041 double real; 00042 double imag; 00043 } dcomplex; 00044 00045 typedef struct { fcomplex e[3][3]; } fsu3_matrix; 00046 typedef struct { fcomplex c[3]; } fsu3_vector; 00047 typedef struct { dcomplex e[3][3]; } dsu3_matrix; 00048 typedef struct { dcomplex c[3]; } dsu3_vector; 00049 00050 typedef struct { 00051 fcomplex m01,m02,m12; 00052 float m00im,m11im,m22im; 00053 float space; 00054 } fanti_hermitmat; 00055 00056 typedef struct { 00057 dcomplex m01,m02,m12; 00058 double m00im,m11im,m22im; 00059 double space; 00060 } danti_hermitmat; 00061 00062 extern int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1); 00063 00064 template<typename su3_matrix> 00065 void su3_adjoint( su3_matrix *a, su3_matrix *b ) 00066 { 00067 int i,j; 00068 
for(i=0;i<3;i++)for(j=0;j<3;j++){ 00069 CONJG( a->e[j][i], b->e[i][j] ); 00070 } 00071 } 00072 00073 00074 template<typename su3_matrix, typename anti_hermitmat> 00075 void 00076 make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) 00077 { 00078 00079 typeof(ah3->m00im) temp = 00080 (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333333333333; 00081 ah3->m00im = m3->e[0][0].imag - temp; 00082 ah3->m11im = m3->e[1][1].imag - temp; 00083 ah3->m22im = m3->e[2][2].imag - temp; 00084 ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5; 00085 ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5; 00086 ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5; 00087 ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5; 00088 ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5; 00089 ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5; 00090 00091 } 00092 00093 template <typename anti_hermitmat, typename su3_matrix> 00094 static void 00095 uncompress_anti_hermitian(anti_hermitmat *mat_antihermit, 00096 su3_matrix *mat_su3 ) 00097 { 00098 typeof(mat_antihermit->m00im) temp1; 00099 mat_su3->e[0][0].imag=mat_antihermit->m00im; 00100 mat_su3->e[0][0].real=0.; 00101 mat_su3->e[1][1].imag=mat_antihermit->m11im; 00102 mat_su3->e[1][1].real=0.; 00103 mat_su3->e[2][2].imag=mat_antihermit->m22im; 00104 mat_su3->e[2][2].real=0.; 00105 mat_su3->e[0][1].imag=mat_antihermit->m01.imag; 00106 temp1=mat_antihermit->m01.real; 00107 mat_su3->e[0][1].real=temp1; 00108 mat_su3->e[1][0].real= -temp1; 00109 mat_su3->e[1][0].imag=mat_antihermit->m01.imag; 00110 mat_su3->e[0][2].imag=mat_antihermit->m02.imag; 00111 temp1=mat_antihermit->m02.real; 00112 mat_su3->e[0][2].real=temp1; 00113 mat_su3->e[2][0].real= -temp1; 00114 mat_su3->e[2][0].imag=mat_antihermit->m02.imag; 00115 mat_su3->e[1][2].imag=mat_antihermit->m12.imag; 00116 temp1=mat_antihermit->m12.real; 00117 mat_su3->e[1][2].real=temp1; 00118 mat_su3->e[2][1].real= -temp1; 00119 
mat_su3->e[2][1].imag=mat_antihermit->m12.imag; 00120 } 00121 00122 template <typename su3_matrix, typename Float> 00123 static void 00124 scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00125 { 00126 int i,j; 00127 for(i=0;i<3;i++){ 00128 for(j=0;j<3;j++){ 00129 c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real; 00130 c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag; 00131 } 00132 } 00133 } 00134 00135 template <typename su3_matrix, typename Float> 00136 static void 00137 scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00138 { 00139 int i,j; 00140 for(i=0;i<3;i++){ 00141 for(j=0;j<3;j++){ 00142 c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; 00143 c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; 00144 } 00145 } 00146 } 00147 00148 template <typename su3_matrix> 00149 static void 00150 mult_su3_nn(su3_matrix* a, su3_matrix* b, su3_matrix* c) 00151 { 00152 int i,j,k; 00153 typeof(a->e[0][0]) x,y; 00154 for(i=0;i<3;i++){ 00155 for(j=0;j<3;j++){ 00156 x.real=x.imag=0.0; 00157 for(k=0;k<3;k++){ 00158 CMUL( a->e[i][k] , b->e[k][j] , y ); 00159 CSUM( x , y ); 00160 } 00161 c->e[i][j] = x; 00162 } 00163 } 00164 } 00165 template<typename su3_matrix> 00166 static void 00167 mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00168 { 00169 int i,j,k; 00170 typeof(a->e[0][0]) x,y; 00171 for(i=0;i<3;i++){ 00172 for(j=0;j<3;j++){ 00173 x.real=x.imag=0.0; 00174 for(k=0;k<3;k++){ 00175 CMULJ_( a->e[k][i] , b->e[k][j], y ); 00176 CSUM( x , y ); 00177 } 00178 c->e[i][j] = x; 00179 } 00180 } 00181 } 00182 00183 template<typename su3_matrix> 00184 static void 00185 mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00186 { 00187 int i,j,k; 00188 typeof(a->e[0][0]) x,y; 00189 for(i=0;i<3;i++){ 00190 for(j=0;j<3;j++){ 00191 x.real=x.imag=0.0; 00192 for(k=0;k<3;k++){ 00193 CMUL_J( a->e[i][k] , b->e[j][k] , y ); 00194 CSUM( x , y ); 00195 } 00196 c->e[i][j] = x; 00197 } 00198 } 00199 } 
00200 00201 template < typename su3_matrix> 00202 void 00203 print_su3_matrix(su3_matrix *a) 00204 { 00205 int i, j; 00206 for(i=0;i < 3; i++){ 00207 for(j=0;j < 3;j++){ 00208 printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag); 00209 } 00210 printf("\n"); 00211 } 00212 00213 } 00214 00215 00216 int 00217 gf_neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1) 00218 { 00219 int oddBit = 0; 00220 int half_idx = i; 00221 if (i >= Vh){ 00222 oddBit =1; 00223 half_idx = i - Vh; 00224 } 00225 int X1 = Z[0]; 00226 int X2 = Z[1]; 00227 int X3 = Z[2]; 00228 //int X4 = Z[3]; 00229 int X1h =X1/2; 00230 00231 int za = half_idx/X1h; 00232 int x1h = half_idx - za*X1h; 00233 int zb = za/X2; 00234 int x2 = za - zb*X2; 00235 int x4 = zb/X3; 00236 int x3 = zb - x4*X3; 00237 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00238 int x1 = 2*x1h + x1odd; 00239 00240 #ifdef MULTI_GPU 00241 x4 = x4+dx4; 00242 x3 = x3+dx3; 00243 x2 = x2+dx2; 00244 x1 = x1+dx1; 00245 00246 int nbr_half_idx = ( (x4+2)*(E[2]*E[1]*E[0]) + (x3+2)*(E[1]*E[0]) + (x2+2)*(E[0]) + (x1+2)) / 2; 00247 #else 00248 x4 = (x4+dx4+Z[3]) % Z[3]; 00249 x3 = (x3+dx3+Z[2]) % Z[2]; 00250 x2 = (x2+dx2+Z[1]) % Z[1]; 00251 x1 = (x1+dx1+Z[0]) % Z[0]; 00252 00253 int nbr_half_idx = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2; 00254 #endif 00255 00256 int oddBitChanged = (dx4+dx3+dx2+dx1)%2; 00257 if (oddBitChanged){ 00258 oddBit = 1 - oddBit; 00259 } 00260 int ret = nbr_half_idx; 00261 if (oddBit){ 00262 #ifdef MULTI_GPU 00263 ret = Vh_ex + nbr_half_idx; 00264 #else 00265 ret = Vh + nbr_half_idx; 00266 #endif 00267 } 00268 00269 return ret; 00270 } 00271 00272 00273 00274 //this functon compute one path for all lattice sites 00275 template<typename su3_matrix, typename Float> 00276 static void 00277 compute_path_product(su3_matrix* staple, su3_matrix** sitelink, su3_matrix** sitelink_ex_2d, 00278 int* path, int len, Float loop_coeff, int dir) 00279 { 00280 int i, j; 00281 00282 su3_matrix prev_matrix, 
curr_matrix, tmat; 00283 int dx[4]; 00284 00285 for(i=0;i<V;i++){ 00286 memset(dx,0, sizeof(dx)); 00287 memset(&curr_matrix, 0, sizeof(curr_matrix)); 00288 00289 curr_matrix.e[0][0].real = 1.0; 00290 curr_matrix.e[1][1].real = 1.0; 00291 curr_matrix.e[2][2].real = 1.0; 00292 00293 dx[dir] =1; 00294 for(j=0; j < len;j++){ 00295 int lnkdir; 00296 00297 prev_matrix = curr_matrix; 00298 if (GOES_FORWARDS(path[j])){ 00299 //dx[path[j]] +=1; 00300 lnkdir = path[j]; 00301 }else{ 00302 dx[OPP_DIR(path[j])] -=1; 00303 lnkdir=OPP_DIR(path[j]); 00304 } 00305 00306 int nbr_idx = gf_neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00307 #ifdef MULTI_GPU 00308 su3_matrix* lnk = sitelink_ex_2d[lnkdir] + nbr_idx; 00309 #else 00310 su3_matrix* lnk = sitelink[lnkdir]+ nbr_idx; 00311 #endif 00312 if (GOES_FORWARDS(path[j])){ 00313 mult_su3_nn(&prev_matrix, lnk, &curr_matrix); 00314 }else{ 00315 mult_su3_na(&prev_matrix, lnk, &curr_matrix); 00316 } 00317 00318 if (GOES_FORWARDS(path[j])){ 00319 dx[path[j]] +=1; 00320 }else{ 00321 //we already substract one in the code above 00322 } 00323 00324 00325 }//j 00326 00327 su3_adjoint(&curr_matrix, &tmat ); 00328 00329 scalar_mult_add_su3_matrix(staple + i , &tmat, loop_coeff, staple+i); 00330 00331 }//i 00332 00333 return; 00334 } 00335 00336 00337 template <typename su3_matrix, typename anti_hermitmat, typename Float> 00338 static void 00339 update_mom(anti_hermitmat* momentum, int dir, su3_matrix** sitelink, 00340 su3_matrix* staple, Float eb3) 00341 { 00342 int i; 00343 for(i=0;i <V; i++){ 00344 su3_matrix tmat1; 00345 su3_matrix tmat2; 00346 su3_matrix tmat3; 00347 00348 su3_matrix* lnk = sitelink[dir] + i; 00349 su3_matrix* stp = staple + i; 00350 anti_hermitmat* mom = momentum + 4*i+dir; 00351 00352 mult_su3_na(lnk, stp, &tmat1); 00353 uncompress_anti_hermitian(mom, &tmat2); 00354 00355 scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3); 00356 make_anti_hermitian(&tmat3, mom); 00357 00358 } 00359 00360 } 00361 00362 00363 
00364 /* This function only computes one direction @dir 00365 * 00366 */ 00367 00368 void 00369 gauge_force_reference_dir(void* refMom, int dir, double eb3, void** sitelink, void** sitelink_ex_2d, QudaPrecision prec, 00370 int **path_dir, int* length, void* loop_coeff, int num_paths) 00371 { 00372 int i; 00373 00374 void* staple; 00375 int gSize = prec; 00376 00377 staple = malloc(V* gaugeSiteSize* gSize); 00378 if (staple == NULL){ 00379 fprintf(stderr, "ERROR: malloc failed for staple in functon %s\n", __FUNCTION__); 00380 exit(1); 00381 } 00382 00383 memset(staple, 0, V*gaugeSiteSize* gSize); 00384 00385 for(i=0;i < num_paths; i++){ 00386 if (prec == QUDA_DOUBLE_PRECISION){ 00387 double* my_loop_coeff = (double*)loop_coeff; 00388 compute_path_product((dsu3_matrix*)staple, (dsu3_matrix**)sitelink, (dsu3_matrix**)sitelink_ex_2d, 00389 path_dir[i], length[i], my_loop_coeff[i], dir); 00390 }else{ 00391 float* my_loop_coeff = (float*)loop_coeff; 00392 compute_path_product((fsu3_matrix*)staple, (fsu3_matrix**)sitelink, (fsu3_matrix**)sitelink_ex_2d, 00393 path_dir[i], length[i], my_loop_coeff[i], dir); 00394 } 00395 } 00396 00397 if (prec == QUDA_DOUBLE_PRECISION){ 00398 update_mom((danti_hermitmat*) refMom, dir, (dsu3_matrix**)sitelink, (dsu3_matrix*)staple, (double)eb3); 00399 }else{ 00400 update_mom((fanti_hermitmat*)refMom, dir, (fsu3_matrix**)sitelink, (fsu3_matrix*)staple, (float)eb3); 00401 } 00402 00403 free(staple); 00404 } 00405 00406 00407 void 00408 gauge_force_reference(void* refMom, double eb3, void** sitelink, void** sitelink_ex_2d, QudaPrecision prec, 00409 int ***path_dir, int* length, void* loop_coeff, int num_paths) 00410 { 00411 for(int dir =0; dir < 4; dir++){ 00412 gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex_2d, prec, path_dir[dir], 00413 length, loop_coeff, num_paths); 00414 00415 } 00416 00417 }