/*
 * QUDA v0.4.0
 * A library for QCD on GPUs
 */
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <math.h> 00004 #include <string.h> 00005 00006 #include "quda.h" 00007 #include "test_util.h" 00008 #include "misc.h" 00009 #include "gauge_force_reference.h" 00010 00011 extern int Z[4]; 00012 extern int V; 00013 extern int Vh; 00014 extern int Vh_ex; 00015 extern int E[4]; 00016 00017 00018 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \ 00019 (c).imag = (a).imag + (b).imag; } 00020 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ 00021 (c).imag = (a).real*(b).imag + (a).imag*(b).real; } 00022 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } 00023 00024 /* c = a* * b */ 00025 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00026 (c).imag = (a).real*(b).imag - (a).imag*(b).real; } 00027 00028 /* c = a * b* */ 00029 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ 00030 (c).imag = (a).imag*(b).real - (a).real*(b).imag; } 00031 00032 #define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } 00033 00034 typedef struct { 00035 float real; 00036 float imag; 00037 } fcomplex; 00038 00039 /* specific for double complex */ 00040 typedef struct { 00041 double real; 00042 double imag; 00043 } dcomplex; 00044 00045 typedef struct { fcomplex e[3][3]; } fsu3_matrix; 00046 typedef struct { fcomplex c[3]; } fsu3_vector; 00047 typedef struct { dcomplex e[3][3]; } dsu3_matrix; 00048 typedef struct { dcomplex c[3]; } dsu3_vector; 00049 00050 typedef struct { 00051 fcomplex m01,m02,m12; 00052 float m00im,m11im,m22im; 00053 float space; 00054 } fanti_hermitmat; 00055 00056 typedef struct { 00057 dcomplex m01,m02,m12; 00058 double m00im,m11im,m22im; 00059 double space; 00060 } danti_hermitmat; 00061 00062 extern int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1); 00063 00064 template<typename su3_matrix> 00065 void su3_adjoint( su3_matrix *a, su3_matrix *b ) 00066 { 00067 int i,j; 00068 
for(i=0;i<3;i++)for(j=0;j<3;j++){ 00069 CONJG( a->e[j][i], b->e[i][j] ); 00070 } 00071 } 00072 00073 00074 template<typename su3_matrix, typename anti_hermitmat> 00075 void 00076 make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) 00077 { 00078 00079 typeof(ah3->m00im) temp = 00080 (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333333333333; 00081 ah3->m00im = m3->e[0][0].imag - temp; 00082 ah3->m11im = m3->e[1][1].imag - temp; 00083 ah3->m22im = m3->e[2][2].imag - temp; 00084 ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5; 00085 ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5; 00086 ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5; 00087 ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5; 00088 ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5; 00089 ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5; 00090 00091 } 00092 00093 template <typename anti_hermitmat, typename su3_matrix> 00094 static void 00095 uncompress_anti_hermitian(anti_hermitmat *mat_antihermit, 00096 su3_matrix *mat_su3 ) 00097 { 00098 typeof(mat_antihermit->m00im) temp1; 00099 mat_su3->e[0][0].imag=mat_antihermit->m00im; 00100 mat_su3->e[0][0].real=0.; 00101 mat_su3->e[1][1].imag=mat_antihermit->m11im; 00102 mat_su3->e[1][1].real=0.; 00103 mat_su3->e[2][2].imag=mat_antihermit->m22im; 00104 mat_su3->e[2][2].real=0.; 00105 mat_su3->e[0][1].imag=mat_antihermit->m01.imag; 00106 temp1=mat_antihermit->m01.real; 00107 mat_su3->e[0][1].real=temp1; 00108 mat_su3->e[1][0].real= -temp1; 00109 mat_su3->e[1][0].imag=mat_antihermit->m01.imag; 00110 mat_su3->e[0][2].imag=mat_antihermit->m02.imag; 00111 temp1=mat_antihermit->m02.real; 00112 mat_su3->e[0][2].real=temp1; 00113 mat_su3->e[2][0].real= -temp1; 00114 mat_su3->e[2][0].imag=mat_antihermit->m02.imag; 00115 mat_su3->e[1][2].imag=mat_antihermit->m12.imag; 00116 temp1=mat_antihermit->m12.real; 00117 mat_su3->e[1][2].real=temp1; 00118 mat_su3->e[2][1].real= -temp1; 00119 
mat_su3->e[2][1].imag=mat_antihermit->m12.imag; 00120 } 00121 00122 template <typename su3_matrix, typename Float> 00123 static void 00124 scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00125 { 00126 int i,j; 00127 for(i=0;i<3;i++){ 00128 for(j=0;j<3;j++){ 00129 c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real; 00130 c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag; 00131 } 00132 } 00133 } 00134 00135 template <typename su3_matrix, typename Float> 00136 static void 00137 scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c) 00138 { 00139 int i,j; 00140 for(i=0;i<3;i++){ 00141 for(j=0;j<3;j++){ 00142 c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; 00143 c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; 00144 } 00145 } 00146 } 00147 00148 template <typename su3_matrix> 00149 static void 00150 mult_su3_nn(su3_matrix* a, su3_matrix* b, su3_matrix* c) 00151 { 00152 int i,j,k; 00153 typeof(a->e[0][0]) x,y; 00154 for(i=0;i<3;i++){ 00155 for(j=0;j<3;j++){ 00156 x.real=x.imag=0.0; 00157 for(k=0;k<3;k++){ 00158 CMUL( a->e[i][k] , b->e[k][j] , y ); 00159 CSUM( x , y ); 00160 } 00161 c->e[i][j] = x; 00162 } 00163 } 00164 } 00165 template<typename su3_matrix> 00166 static void 00167 mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00168 { 00169 int i,j,k; 00170 typeof(a->e[0][0]) x,y; 00171 for(i=0;i<3;i++){ 00172 for(j=0;j<3;j++){ 00173 x.real=x.imag=0.0; 00174 for(k=0;k<3;k++){ 00175 CMULJ_( a->e[k][i] , b->e[k][j], y ); 00176 CSUM( x , y ); 00177 } 00178 c->e[i][j] = x; 00179 } 00180 } 00181 } 00182 00183 template<typename su3_matrix> 00184 static void 00185 mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 00186 { 00187 int i,j,k; 00188 typeof(a->e[0][0]) x,y; 00189 for(i=0;i<3;i++){ 00190 for(j=0;j<3;j++){ 00191 x.real=x.imag=0.0; 00192 for(k=0;k<3;k++){ 00193 CMUL_J( a->e[i][k] , b->e[j][k] , y ); 00194 CSUM( x , y ); 00195 } 00196 c->e[i][j] = x; 00197 } 00198 } 00199 } 
00200 00201 template < typename su3_matrix> 00202 void 00203 print_su3_matrix(su3_matrix *a) 00204 { 00205 int i, j; 00206 for(i=0;i < 3; i++){ 00207 for(j=0;j < 3;j++){ 00208 printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag); 00209 } 00210 printf("\n"); 00211 } 00212 00213 } 00214 00215 00216 int 00217 gf_neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1) 00218 { 00219 int oddBit = 0; 00220 int half_idx = i; 00221 if (i >= Vh){ 00222 oddBit =1; 00223 half_idx = i - Vh; 00224 } 00225 int X1 = Z[0]; 00226 int X2 = Z[1]; 00227 int X3 = Z[2]; 00228 //int X4 = Z[3]; 00229 int X1h =X1/2; 00230 00231 int za = half_idx/X1h; 00232 int x1h = half_idx - za*X1h; 00233 int zb = za/X2; 00234 int x2 = za - zb*X2; 00235 int x4 = zb/X3; 00236 int x3 = zb - x4*X3; 00237 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00238 int x1 = 2*x1h + x1odd; 00239 00240 #ifdef MULTI_GPU 00241 x4 = x4+dx4; 00242 x3 = x3+dx3; 00243 x2 = x2+dx2; 00244 x1 = x1+dx1; 00245 00246 int nbr_half_idx = ( (x4+2)*(E[2]*E[1]*E[0]) + (x3+2)*(E[1]*E[0]) + (x2+2)*(E[0]) + (x1+2)) / 2; 00247 #else 00248 x4 = (x4+dx4+Z[3]) % Z[3]; 00249 x3 = (x3+dx3+Z[2]) % Z[2]; 00250 x2 = (x2+dx2+Z[1]) % Z[1]; 00251 x1 = (x1+dx1+Z[0]) % Z[0]; 00252 00253 int nbr_half_idx = (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2; 00254 #endif 00255 00256 int oddBitChanged = (dx4+dx3+dx2+dx1)%2; 00257 if (oddBitChanged){ 00258 oddBit = 1 - oddBit; 00259 } 00260 int ret = nbr_half_idx; 00261 if (oddBit){ 00262 #ifdef MULTI_GPU 00263 ret = Vh_ex + nbr_half_idx; 00264 #else 00265 ret = Vh + nbr_half_idx; 00266 #endif 00267 } 00268 00269 return ret; 00270 } 00271 00272 00273 00274 //this functon compute one path for all lattice sites 00275 template<typename su3_matrix, typename Float> 00276 static void 00277 compute_path_product(su3_matrix* staple, su3_matrix** sitelink, su3_matrix** sitelink_ex_2d, 00278 int* path, int len, Float loop_coeff, int dir) 00279 { 00280 int i, j; 00281 00282 su3_matrix prev_matrix, 
curr_matrix, tmat; 00283 int dx[4]; 00284 00285 for(i=0;i<V;i++){ 00286 memset(dx,0, sizeof(dx)); 00287 memset(&curr_matrix, 0, sizeof(curr_matrix)); 00288 00289 curr_matrix.e[0][0].real = 1.0; 00290 curr_matrix.e[1][1].real = 1.0; 00291 curr_matrix.e[2][2].real = 1.0; 00292 00293 dx[dir] =1; 00294 for(j=0; j < len;j++){ 00295 int lnkdir; 00296 00297 prev_matrix = curr_matrix; 00298 if (GOES_FORWARDS(path[j])){ 00299 //dx[path[j]] +=1; 00300 lnkdir = path[j]; 00301 }else{ 00302 dx[OPP_DIR(path[j])] -=1; 00303 lnkdir=OPP_DIR(path[j]); 00304 } 00305 00306 int nbr_idx = gf_neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); 00307 #ifdef MULTI_GPU 00308 su3_matrix* lnk = sitelink_ex_2d[lnkdir] + nbr_idx; 00309 #else 00310 su3_matrix* lnk = sitelink[lnkdir]+ nbr_idx; 00311 #endif 00312 if (GOES_FORWARDS(path[j])){ 00313 mult_su3_nn(&prev_matrix, lnk, &curr_matrix); 00314 }else{ 00315 mult_su3_na(&prev_matrix, lnk, &curr_matrix); 00316 } 00317 00318 if (GOES_FORWARDS(path[j])){ 00319 dx[path[j]] +=1; 00320 }else{ 00321 //we already substract one in the code above 00322 } 00323 00324 00325 }//j 00326 00327 su3_adjoint(&curr_matrix, &tmat ); 00328 00329 scalar_mult_add_su3_matrix(staple + i , &tmat, loop_coeff, staple+i); 00330 00331 }//i 00332 00333 return; 00334 } 00335 00336 00337 template <typename su3_matrix, typename anti_hermitmat, typename Float> 00338 static void 00339 update_mom(anti_hermitmat* momentum, int dir, su3_matrix** sitelink, 00340 su3_matrix* staple, Float eb3) 00341 { 00342 int i; 00343 for(i=0;i <V; i++){ 00344 su3_matrix tmat1; 00345 su3_matrix tmat2; 00346 su3_matrix tmat3; 00347 00348 su3_matrix* lnk = sitelink[dir] + i; 00349 su3_matrix* stp = staple + i; 00350 anti_hermitmat* mom = momentum + 4*i+dir; 00351 00352 mult_su3_na(lnk, stp, &tmat1); 00353 uncompress_anti_hermitian(mom, &tmat2); 00354 00355 scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3); 00356 make_anti_hermitian(&tmat3, mom); 00357 00358 } 00359 00360 } 00361 00362 00363 
00364 /* This function only computes one direction @dir 00365 * 00366 */ 00367 00368 void 00369 gauge_force_reference_dir(void* refMom, int dir, double eb3, void** sitelink, void** sitelink_ex_2d, QudaPrecision prec, 00370 int **path_dir, int* length, void* loop_coeff, int num_paths) 00371 { 00372 int i; 00373 00374 void* staple; 00375 int gSize = prec; 00376 00377 staple = malloc(V* gaugeSiteSize* gSize); 00378 if (staple == NULL){ 00379 fprintf(stderr, "ERROR: malloc failed for staple in functon %s\n", __FUNCTION__); 00380 exit(1); 00381 } 00382 00383 memset(staple, 0, V*gaugeSiteSize* gSize); 00384 00385 for(i=0;i < num_paths; i++){ 00386 if (prec == QUDA_DOUBLE_PRECISION){ 00387 double* my_loop_coeff = (double*)loop_coeff; 00388 compute_path_product((dsu3_matrix*)staple, (dsu3_matrix**)sitelink, (dsu3_matrix**)sitelink_ex_2d, 00389 path_dir[i], length[i], my_loop_coeff[i], dir); 00390 }else{ 00391 float* my_loop_coeff = (float*)loop_coeff; 00392 compute_path_product((fsu3_matrix*)staple, (fsu3_matrix**)sitelink, (fsu3_matrix**)sitelink_ex_2d, 00393 path_dir[i], length[i], my_loop_coeff[i], dir); 00394 } 00395 } 00396 00397 if (prec == QUDA_DOUBLE_PRECISION){ 00398 update_mom((danti_hermitmat*) refMom, dir, (dsu3_matrix**)sitelink, (dsu3_matrix*)staple, (double)eb3); 00399 }else{ 00400 update_mom((fanti_hermitmat*)refMom, dir, (fsu3_matrix**)sitelink, (fsu3_matrix*)staple, (float)eb3); 00401 } 00402 00403 free(staple); 00404 } 00405 00406 00407 void 00408 gauge_force_reference(void* refMom, double eb3, void** sitelink, void** sitelink_ex_2d, QudaPrecision prec, 00409 int ***path_dir, int* length, void* loop_coeff, int num_paths) 00410 { 00411 for(int dir =0; dir < 4; dir++){ 00412 gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex_2d, prec, path_dir[dir], 00413 length, loop_coeff, num_paths); 00414 00415 } 00416 00417 }