QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 00005 #include <quda.h> 00006 #include <test_util.h> 00007 #include "llfat_reference.h" 00008 #include "misc.h" 00009 #include <string.h> 00010 00011 #include <quda_internal.h> 00012 #include "face_quda.h" 00013 00014 #define XUP 0 00015 #define YUP 1 00016 #define ZUP 2 00017 #define TUP 3 00018 00019 typedef struct { 00020 float real; 00021 float imag; 00022 } fcomplex; 00023 00024 /* specific for double complex */ 00025 typedef struct { 00026 double real; 00027 double imag; 00028 } dcomplex; 00029 00030 typedef struct { fcomplex e[3][3]; } fsu3_matrix; 00031 typedef struct { fcomplex c[3]; } fsu3_vector; 00032 typedef struct { dcomplex e[3][3]; } dsu3_matrix; 00033 typedef struct { dcomplex c[3]; } dsu3_vector; 00034 00035 00036 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \ 00037 (c).imag = (a).imag + (b).imag; } 00038 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ 00039 (c).imag = (a).real*(b).imag + (a).imag*(b).real; } 00040 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } 00041 00042 /* c = a* * b */ 00043 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00044 (c).imag = (a).real*(b).imag - (a).imag*(b).real; } 00045 00046 /* c = a * b* */ 00047 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00048 (c).imag = (a).imag*(b).real - (a).real*(b).imag; } 00049 00050 extern int Z[4]; 00051 extern int V; 00052 extern int Vh; 00053 extern int Vs[]; 00054 extern int Vsh[]; 00055 extern int Vs_x, Vs_y, Vs_z, Vs_t; 00056 extern int Vsh_x, Vsh_y, Vsh_z, Vsh_t; 00057 00058 00059 template<typename su3_matrix, typename Real> 00060 void 00061 llfat_scalar_mult_su3_matrix( su3_matrix *a, Real s, su3_matrix *b ) 00062 { 00063 00064 int i,j; 00065 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00066 b->e[i][j].real = s*a->e[i][j].real; 00067 b->e[i][j].imag = s*a->e[i][j].imag; 00068 } 00069 00070 return; 00071 } 00072 00073 template<typename su3_matrix, typename Real> 00074 void 00075 llfat_scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Real s, su3_matrix *c) 00076 { 00077 int i,j; 00078 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00079 c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; 00080 c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; 00081 } 00082 00083 } 00084 00085 template <typename su3_matrix> 00086 void 00087 llfat_mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00088 { 00089 int i,j,k; 00090 typeof(a->e[0][0]) x,y; 00091 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00092 x.real=x.imag=0.0; 00093 for(k=0;k<3;k++){ 00094 CMUL_J( a->e[i][k] , b->e[j][k] , y ); 00095 CSUM( x , y ); 00096 } 00097 c->e[i][j] = x; 00098 } 00099 } 00100 00101 template <typename su3_matrix> 00102 void 00103 llfat_mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00104 { 00105 int i,j,k; 00106 typeof(a->e[0][0]) x,y; 00107 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00108 x.real=x.imag=0.0; 00109 for(k=0;k<3;k++){ 00110 CMUL( a->e[i][k] , b->e[k][j] , y ); 00111 CSUM( x , y ); 00112 } 00113 c->e[i][j] = x; 00114 } 00115 } 00116 00117 template<typename su3_matrix> 00118 void 00119 llfat_mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00120 { 00121 int i,j,k; 00122 typeof(a->e[0][0]) x,y; 00123 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00124 x.real=x.imag=0.0; 00125 for(k=0;k<3;k++){ 00126 CMULJ_( a->e[k][i] , b->e[k][j], y ); 00127 CSUM( x , y ); 00128 } 00129 c->e[i][j] = x; 00130 } 00131 } 00132 00133 00134 00135 00136 00137 template<typename su3_matrix> 00138 void 00139 llfat_add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00140 { 00141 int i,j; 00142 for(i=0;i<3;i++)for(j=0;j<3;j++){ 00143 CADD( a->e[i][j], b->e[i][j], c->e[i][j] ); 00144 } 00145 } 00146 00147 00148 00149 template<typename su3_matrix, typename Real> 00150 void 00151 llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, 00152 su3_matrix* mulink, su3_matrix** sitelink, void** fatlink, Real coef, 00153 int use_staple) 00154 { 00155 su3_matrix tmat1,tmat2; 00156 int i ; 00157 su3_matrix *fat1; 00158 00159 /* Upper staple */ 00160 /* Computes the staple : 00161 * mu (B) 00162 * +-------+ 00163 * nu | | 00164 * (A) | |(C) 00165 * X X 00166 * 00167 * Where the mu link can be any su3_matrix. The result is saved in staple. 00168 * if staple==NULL then the result is not saved. 00169 * It also adds the computed staple to the fatlink[mu] with weight coef. 00170 */ 00171 00172 int dx[4]; 00173 00174 /* upper staple */ 00175 00176 for(i=0;i < V;i++){ 00177 00178 fat1 = ((su3_matrix*)fatlink[mu]) + i; 00179 su3_matrix* A = sitelink[nu] + i; 00180 00181 memset(dx, 0, sizeof(dx)); 00182 dx[nu] =1; 00183 int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00184 su3_matrix* B; 00185 if (use_staple){ 00186 B = mulink + nbr_idx; 00187 }else{ 00188 B = mulink + nbr_idx; 00189 } 00190 00191 memset(dx, 0, sizeof(dx)); 00192 dx[mu] =1; 00193 nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]); 00194 su3_matrix* C = sitelink[nu] + nbr_idx; 00195 00196 llfat_mult_su3_nn( A, B,&tmat1); 00197 00198 if(staple!=NULL){/* Save the staple */ 00199 llfat_mult_su3_na( &tmat1, C, &staple[i]); 00200 } else{ /* No need to save the staple. Add it to the fatlinks */ 00201 llfat_mult_su3_na( &tmat1, C, &tmat2); 00202 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00203 } 00204 } 00205 /***************lower staple**************** 00206 * 00207 * X X 00208 * nu | | 00209 * (A) | |(C) 00210 * +-------+ 00211 * mu (B) 00212 * 00213 *********************************************/ 00214 00215 for(i=0;i < V;i++){ 00216 00217 fat1 = ((su3_matrix*)fatlink[mu]) + i; 00218 memset(dx, 0, sizeof(dx)); 00219 dx[nu] = -1; 00220 int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00221 if (nbr_idx >= V || nbr_idx <0){ 00222 fprintf(stderr, "ERROR: invliad nbr_idx(%d), line=%d\n", nbr_idx, __LINE__); 00223 exit(1); 00224 } 00225 su3_matrix* A = sitelink[nu] + nbr_idx; 00226 00227 su3_matrix* B; 00228 if (use_staple){ 00229 B = mulink + nbr_idx; 00230 }else{ 00231 B = mulink + nbr_idx; 00232 } 00233 00234 memset(dx, 0, sizeof(dx)); 00235 dx[mu] = 1; 00236 nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]); 00237 su3_matrix* C = sitelink[nu] + nbr_idx; 00238 00239 llfat_mult_su3_an( A, B,&tmat1); 00240 llfat_mult_su3_nn( &tmat1, C,&tmat2); 00241 00242 if(staple!=NULL){/* Save the staple */ 00243 llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]); 00244 llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1); 00245 00246 } else{ /* No need to save the staple. Add it to the fatlinks */ 00247 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00248 } 00249 } 00250 00251 } /* compute_gen_staple_site */ 00252 00253 00254 00255 /* Optimized fattening code for the Asq and Asqtad actions. 00256 * I assume that: 00257 * path 0 is the one link 00258 * path 2 the 3-staple 00259 * path 3 the 5-staple 00260 * path 4 the 7-staple 00261 * path 5 the Lapage term. 00262 * Path 1 is the Naik term 00263 * 00264 */ 00265 template <typename su3_matrix, typename Float> 00266 void llfat_cpu(void** fatlink, su3_matrix** sitelink, Float* act_path_coeff) 00267 { 00268 00269 su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00270 if(staple == NULL){ 00271 fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__); 00272 exit(1); 00273 } 00274 00275 su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00276 if(tempmat1 == NULL){ 00277 fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__); 00278 exit(1); 00279 } 00280 00281 /* to fix up the Lepage term, included by a trick below */ 00282 Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]); 00283 00284 00285 for (int dir=XUP; dir<=TUP; dir++){ 00286 00287 /* Intialize fat links with c_1*U_\mu(x) */ 00288 for(int i=0;i < V;i ++){ 00289 su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i; 00290 llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 ); 00291 } 00292 } 00293 00294 00295 00296 00297 for (int dir=XUP; dir<=TUP; dir++){ 00298 for(int nu=XUP; nu<=TUP; nu++){ 00299 if(nu!=dir){ 00300 llfat_compute_gen_staple_field(staple,dir,nu,sitelink[dir], sitelink,fatlink, act_path_coeff[2], 0); 00301 00302 /* The Lepage term */ 00303 /* Note this also involves modifying c_1 (above) */ 00304 00305 llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1); 00306 00307 for(int rho=XUP; rho<=TUP; rho++) { 00308 if((rho!=dir)&&(rho!=nu)){ 00309 llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1); 00310 00311 for(int sig=XUP; sig<=TUP; sig++){ 00312 if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){ 00313 llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1); 00314 } 00315 }/* sig */ 00316 00317 } 00318 00319 }/* rho */ 00320 } 00321 00322 }/* nu */ 00323 00324 }/* dir */ 00325 00326 00327 free(staple); 00328 free(tempmat1); 00329 00330 } 00331 00332 00333 00334 void 00335 llfat_reference(void** fatlink, void** sitelink, QudaPrecision prec, void* act_path_coeff) 00336 { 00337 switch(prec){ 00338 case QUDA_DOUBLE_PRECISION:{ 00339 llfat_cpu((void**)fatlink, (dsu3_matrix**)sitelink, (double*) act_path_coeff); 00340 break; 00341 } 00342 case QUDA_SINGLE_PRECISION:{ 00343 llfat_cpu((void**)fatlink, (fsu3_matrix**)sitelink, (float*) act_path_coeff); 00344 break; 00345 } 00346 default: 00347 fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec); 00348 exit(1); 00349 break; 00350 00351 } 00352 00353 return; 00354 00355 } 00356 00357 #ifdef MULTI_GPU 00358 00359 template<typename su3_matrix, typename Real> 00360 void 00361 llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, 00362 su3_matrix* mulink, su3_matrix** ghost_mulink, 00363 su3_matrix** sitelink, su3_matrix** ghost_sitelink, su3_matrix** ghost_sitelink_diag, 00364 void** fatlink, Real coef, 00365 int use_staple) 00366 { 00367 su3_matrix tmat1,tmat2; 00368 int i ; 00369 su3_matrix *fat1; 00370 00371 00372 int X1 = Z[0]; 00373 int X2 = Z[1]; 00374 int X3 = Z[2]; 00375 int X4 = Z[3]; 00376 int X1h =X1/2; 00377 00378 int X2X1 = X1*X2; 00379 int X3X2 = X3*X2; 00380 int X3X1 = X3*X1; 00381 00382 /* Upper staple */ 00383 /* Computes the staple : 00384 * mu (B) 00385 * +-------+ 00386 * nu | | 00387 * (A) | |(C) 00388 * X X 00389 * 00390 * Where the mu link can be any su3_matrix. The result is saved in staple. 00391 * if staple==NULL then the result is not saved. 00392 * It also adds the computed staple to the fatlink[mu] with weight coef. 00393 */ 00394 00395 int dx[4]; 00396 00397 /* upper staple */ 00398 00399 for(i=0;i < V;i++){ 00400 00401 int half_index = i; 00402 int oddBit =0; 00403 if (i >= Vh){ 00404 oddBit = 1; 00405 half_index = i -Vh; 00406 } 00407 //int x4 = x4_from_full_index(i); 00408 00409 00410 00411 int sid =half_index; 00412 int za = sid/X1h; 00413 int x1h = sid - za*X1h; 00414 int zb = za/X2; 00415 int x2 = za - zb*X2; 00416 int x4 = zb/X3; 00417 int x3 = zb - x4*X3; 00418 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00419 int x1 = 2*x1h + x1odd; 00420 int x[4] = {x1,x2,x3,x4}; 00421 int space_con[4]={ 00422 (x4*X3X2+x3*X2+x2)/2, 00423 (x4*X3X1+x3*X1+x1)/2, 00424 (x4*X2X1+x2*X1+x1)/2, 00425 (x3*X2X1+x2*X1+x1)/2 00426 }; 00427 00428 fat1 = ((su3_matrix*)fatlink[mu]) + i; 00429 su3_matrix* A = sitelink[nu] + i; 00430 00431 memset(dx, 0, sizeof(dx)); 00432 dx[nu] =1; 00433 int nbr_idx; 00434 00435 su3_matrix* B; 00436 if (use_staple){ 00437 if (x[nu] + dx[nu] >= Z[nu]){ 00438 B = ghost_mulink[nu] + Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu]; 00439 }else{ 00440 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]); 00441 B = mulink + nbr_idx; 00442 } 00443 }else{ 00444 if(x[nu]+dx[nu] >= Z[nu]){ //out of boundary, use ghost data 00445 B = ghost_sitelink[nu] + 4*Vs[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu]; 00446 }else{ 00447 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]); 00448 B = sitelink[mu] + nbr_idx; 00449 } 00450 } 00451 00452 00453 //we could be in the ghost link area if mu is T and we are at high T boundary 00454 su3_matrix* C; 00455 memset(dx, 0, sizeof(dx)); 00456 dx[mu] =1; 00457 if(x[mu] + dx[mu] >= Z[mu]){ //out of boundary, use ghost data 00458 C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + (1-oddBit)*Vsh[mu] + space_con[mu]; 00459 }else{ 00460 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]); 00461 C = sitelink[nu] + nbr_idx; 00462 } 00463 00464 llfat_mult_su3_nn( A, B,&tmat1); 00465 00466 if(staple!=NULL){/* Save the staple */ 00467 llfat_mult_su3_na( &tmat1, C, &staple[i]); 00468 } else{ /* No need to save the staple. Add it to the fatlinks */ 00469 llfat_mult_su3_na( &tmat1, C, &tmat2); 00470 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00471 } 00472 } 00473 /***************lower staple**************** 00474 * 00475 * X X 00476 * nu | | 00477 * (A) | |(C) 00478 * +-------+ 00479 * mu (B) 00480 * 00481 *********************************************/ 00482 00483 for(i=0;i < V;i++){ 00484 00485 int half_index = i; 00486 int oddBit =0; 00487 if (i >= Vh){ 00488 oddBit = 1; 00489 half_index = i -Vh; 00490 } 00491 00492 int sid =half_index; 00493 int za = sid/X1h; 00494 int x1h = sid - za*X1h; 00495 int zb = za/X2; 00496 int x2 = za - zb*X2; 00497 int x4 = zb/X3; 00498 int x3 = zb - x4*X3; 00499 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00500 int x1 = 2*x1h + x1odd; 00501 int x[4] = {x1,x2,x3,x4}; 00502 int space_con[4]={ 00503 (x4*X3X2+x3*X2+x2)/2, 00504 (x4*X3X1+x3*X1+x1)/2, 00505 (x4*X2X1+x2*X1+x1)/2, 00506 (x3*X2X1+x2*X1+x1)/2 00507 }; 00508 00509 //int x4 = x4_from_full_index(i); 00510 00511 fat1 = ((su3_matrix*)fatlink[mu]) + i; 00512 00513 //we could be in the ghost link area if nu is T and we are at low T boundary 00514 su3_matrix* A; 00515 memset(dx, 0, sizeof(dx)); 00516 dx[nu] = -1; 00517 00518 int nbr_idx; 00519 if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data 00520 A = ghost_sitelink[nu] + nu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu]; 00521 }else{ 00522 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]); 00523 A = sitelink[nu] + nbr_idx; 00524 } 00525 00526 su3_matrix* B; 00527 if (use_staple){ 00528 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]); 00529 if (x[nu] + dx[nu] < 0){ 00530 B = ghost_mulink[nu] + (1-oddBit)*Vsh[nu] + space_con[nu]; 00531 }else{ 00532 B = mulink + nbr_idx; 00533 } 00534 }else{ 00535 if(x[nu] + dx[nu] < 0){ //out of boundary, use ghost data 00536 B = ghost_sitelink[nu] + mu*Vs[nu] + (1-oddBit)*Vsh[nu] + space_con[nu]; 00537 }else{ 00538 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2], dx[1], dx[0]); 00539 B = sitelink[mu] + nbr_idx; 00540 } 00541 } 00542 00543 //we could be in the ghost link area if nu is T and we are at low T boundary 00544 // or mu is T and we are on high T boundary 00545 su3_matrix* C; 00546 memset(dx, 0, sizeof(dx)); 00547 dx[nu] = -1; 00548 dx[mu] = 1; 00549 nbr_idx = neighborIndexFullLattice_mg(i, dx[3], dx[2],dx[1],dx[0]); 00550 00551 //space con must be recomputed because we have coodinates change in 2 directions 00552 int new_x1, new_x2, new_x3, new_x4; 00553 new_x1 = (x[0] + dx[0] + Z[0])%Z[0]; 00554 new_x2 = (x[1] + dx[1] + Z[1])%Z[1]; 00555 new_x3 = (x[2] + dx[2] + Z[2])%Z[2]; 00556 new_x4 = (x[3] + dx[3] + Z[3])%Z[3]; 00557 int new_x[4] = {new_x1, new_x2, new_x3, new_x4}; 00558 space_con[0] = (new_x4*X3X2 + new_x3*X2 + new_x2)/2; 00559 space_con[1] = (new_x4*X3X1 + new_x3*X1 + new_x1)/2; 00560 space_con[2] = (new_x4*X2X1 + new_x2*X1 + new_x1)/2; 00561 space_con[3] = (new_x3*X2X1 + new_x2*X1 + new_x1)/2; 00562 00563 if( (x[nu] + dx[nu]) < 0 && (x[mu] + dx[mu] >= Z[mu])){ 00564 //find the other 2 directions, dir1, dir2 00565 //with dir2 the slowest changing direction 00566 int dir1, dir2; //other two dimensions 00567 for(dir1=0; dir1 < 4; dir1 ++){ 00568 if(dir1 != nu && dir1 != mu){ 00569 break; 00570 } 00571 } 00572 for(dir2=0; dir2 < 4; dir2 ++){ 00573 if(dir2 != nu && dir2 != mu && dir2 != dir1){ 00574 break; 00575 } 00576 } 00577 C = ghost_sitelink_diag[nu*4+mu] + oddBit*Z[dir1]*Z[dir2]/2 + (new_x[dir2]*Z[dir1]+new_x[dir1])/2; 00578 }else if (x[nu] + dx[nu] < 0){ 00579 C = ghost_sitelink[nu] + nu*Vs[nu] + oddBit*Vsh[nu]+ space_con[nu]; 00580 }else if (x[mu] + dx[mu] >= Z[mu]){ 00581 C = ghost_sitelink[mu] + 4*Vs[mu] + nu*Vs[mu] + oddBit*Vsh[mu]+space_con[mu]; 00582 }else{ 00583 C = sitelink[nu] + nbr_idx; 00584 } 00585 llfat_mult_su3_an( A, B,&tmat1); 00586 llfat_mult_su3_nn( &tmat1, C,&tmat2); 00587 00588 if(staple!=NULL){/* Save the staple */ 00589 llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]); 00590 llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1); 00591 00592 } else{ /* No need to save the staple. Add it to the fatlinks */ 00593 llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1); 00594 } 00595 } 00596 00597 } /* compute_gen_staple_site */ 00598 00599 00600 template <typename su3_matrix, typename Float> 00601 void llfat_cpu_mg(void** fatlink, su3_matrix** sitelink, su3_matrix** ghost_sitelink, 00602 su3_matrix** ghost_sitelink_diag, Float* act_path_coeff) 00603 { 00604 QudaPrecision prec; 00605 if (sizeof(Float) == 4){ 00606 prec = QUDA_SINGLE_PRECISION; 00607 }else{ 00608 prec = QUDA_DOUBLE_PRECISION; 00609 } 00610 00611 su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00612 if(staple == NULL){ 00613 fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__); 00614 exit(1); 00615 } 00616 00617 00618 su3_matrix* ghost_staple[4]; 00619 su3_matrix* ghost_staple1[4]; 00620 00621 for(int i=0;i < 4;i++){ 00622 ghost_staple[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix)); 00623 if (ghost_staple[i] == NULL){ 00624 fprintf(stderr, "Error: malloc failed for ghost staple in function %s\n", __FUNCTION__); 00625 exit(1); 00626 } 00627 00628 ghost_staple1[i] = (su3_matrix*)malloc(2*Vs[i]*sizeof(su3_matrix)); 00629 if (ghost_staple1[i] == NULL){ 00630 fprintf(stderr, "Error: malloc failed for ghost staple1 in function %s\n", __FUNCTION__); 00631 exit(1); 00632 } 00633 } 00634 00635 su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix)); 00636 if(tempmat1 == NULL){ 00637 fprintf(stderr, "ERROR: malloc failed for tempmat1 in function %s\n", __FUNCTION__); 00638 exit(1); 00639 } 00640 00641 /* to fix up the Lepage term, included by a trick below */ 00642 Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]); 00643 00644 00645 for (int dir=XUP; dir<=TUP; dir++){ 00646 00647 /* Intialize fat links with c_1*U_\mu(x) */ 00648 for(int i=0;i < V;i ++){ 00649 su3_matrix* fat1 = ((su3_matrix*)fatlink[dir]) + i; 00650 llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1 ); 00651 } 00652 } 00653 00654 for (int dir=XUP; dir<=TUP; dir++){ 00655 for(int nu=XUP; nu<=TUP; nu++){ 00656 if(nu!=dir){ 00657 llfat_compute_gen_staple_field_mg(staple,dir,nu, 00658 sitelink[dir], (su3_matrix**)NULL, 00659 sitelink, ghost_sitelink, ghost_sitelink_diag, 00660 fatlink, act_path_coeff[2], 0); 00661 /* The Lepage term */ 00662 /* Note this also involves modifying c_1 (above) */ 00663 00664 exchange_cpu_staple(Z, staple, (void**)ghost_staple, prec); 00665 00666 llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,nu, 00667 staple,ghost_staple, 00668 sitelink, ghost_sitelink, ghost_sitelink_diag, 00669 fatlink, act_path_coeff[5],1); 00670 00671 for(int rho=XUP; rho<=TUP; rho++) { 00672 if((rho!=dir)&&(rho!=nu)){ 00673 llfat_compute_gen_staple_field_mg( tempmat1, dir, rho, 00674 staple,ghost_staple, 00675 sitelink, ghost_sitelink, ghost_sitelink_diag, 00676 fatlink, act_path_coeff[3], 1); 00677 00678 00679 exchange_cpu_staple(Z, tempmat1, (void**)ghost_staple1, prec); 00680 00681 for(int sig=XUP; sig<=TUP; sig++){ 00682 if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){ 00683 00684 llfat_compute_gen_staple_field_mg((su3_matrix*)NULL,dir,sig, 00685 tempmat1, ghost_staple1, 00686 sitelink, ghost_sitelink, ghost_sitelink_diag, 00687 fatlink, act_path_coeff[4], 1); 00688 //FIXME 00689 //return; 00690 00691 } 00692 }/* sig */ 00693 } 00694 }/* rho */ 00695 } 00696 00697 }/* nu */ 00698 00699 }/* dir */ 00700 00701 free(staple); 00702 for(int i=0;i < 4;i++){ 00703 free(ghost_staple[i]); 00704 free(ghost_staple1[i]); 00705 } 00706 free(tempmat1); 00707 00708 } 00709 00710 00711 00712 void 00713 llfat_reference_mg(void** fatlink, void** sitelink, void** ghost_sitelink, 00714 void** ghost_sitelink_diag, QudaPrecision prec, void* act_path_coeff) 00715 { 00716 00717 switch(prec){ 00718 case QUDA_DOUBLE_PRECISION:{ 00719 llfat_cpu_mg((void**)fatlink, (dsu3_matrix**)sitelink, (dsu3_matrix**)ghost_sitelink, 00720 (dsu3_matrix**)ghost_sitelink_diag, (double*) act_path_coeff); 00721 break; 00722 } 00723 case QUDA_SINGLE_PRECISION:{ 00724 llfat_cpu_mg((void**)fatlink, (fsu3_matrix**)sitelink, (fsu3_matrix**)ghost_sitelink, 00725 (fsu3_matrix**)ghost_sitelink_diag, (float*) act_path_coeff); 00726 break; 00727 } 00728 default: 00729 fprintf(stderr, "ERROR: unsupported precision(%d)\n", prec); 00730 exit(1); 00731 break; 00732 00733 } 00734 00735 return; 00736 00737 } 00738 00739 00740 #endif