QUDA: quda/tests/llfat_reference.cpp Source File

QUDA v0.3.2
A library for QCD on GPUs
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <math.h>
00004 
00005 #include <quda.h>
00006 #include <test_util.h>
00007 #include "llfat_reference.h"
00008 #include "misc.h"
00009 #include <string.h>
00010 
00011 #define XUP 0
00012 #define YUP 1
00013 #define ZUP 2
00014 #define TUP 3
00015 
00016 typedef struct {   
00017   float real;      
00018   float imag; 
00019 } fcomplex;  
00020 
00021 /* specific for double complex */
00022 typedef struct {
00023   double real;
00024   double imag;
00025 } dcomplex;
00026 
00027 typedef struct { fcomplex e[3][3]; } fsu3_matrix;
00028 typedef struct { fcomplex c[3]; } fsu3_vector;
00029 typedef struct { dcomplex e[3][3]; } dsu3_matrix;
00030 typedef struct { dcomplex c[3]; } dsu3_vector;
00031 
00032 
00033 #define CADD(a,b,c) { (c).real = (a).real + (b).real;   \
00034     (c).imag = (a).imag + (b).imag; }
00035 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \
00036     (c).imag = (a).real*(b).imag + (a).imag*(b).real; }
00037 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; }
00038 
00039 /* c = a* * b */
00040 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
00041     (c).imag = (a).real*(b).imag - (a).imag*(b).real; }
00042 
00043 /* c = a * b* */
00044 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
00045     (c).imag = (a).imag*(b).real - (a).real*(b).imag; }
00046 
00047 extern int Z[4];
00048 extern int V;
00049 extern int Vh;
00050 
00051 template<typename su3_matrix, typename Real>
00052 void 
00053 llfat_scalar_mult_su3_matrix( su3_matrix *a, Real s, su3_matrix *b )
00054 {
00055     
00056   int i,j;
00057   for(i=0;i<3;i++)for(j=0;j<3;j++){
00058       b->e[i][j].real = s*a->e[i][j].real;
00059       b->e[i][j].imag = s*a->e[i][j].imag;
00060     }
00061     
00062   return;
00063 }
00064 
00065 template<typename su3_matrix, typename Real>
00066 void
00067 llfat_scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Real s, su3_matrix *c)
00068 {    
00069   int i,j;
00070   for(i=0;i<3;i++)for(j=0;j<3;j++){
00071       c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real;
00072       c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag;
00073     }
00074     
00075 }
00076 
00077 template <typename su3_matrix>
00078 void 
00079 llfat_mult_su3_na(  su3_matrix *a, su3_matrix *b, su3_matrix *c )
00080 {
00081   int i,j,k;
00082   typeof(a->e[0][0]) x,y;
00083   for(i=0;i<3;i++)for(j=0;j<3;j++){
00084       x.real=x.imag=0.0;
00085       for(k=0;k<3;k++){
00086         CMUL_J( a->e[i][k] , b->e[j][k] , y );
00087         CSUM( x , y );
00088       }
00089       c->e[i][j] = x;
00090     }
00091 }
00092 
00093 template <typename su3_matrix>
00094 void
00095 llfat_mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
00096 {
00097   int i,j,k;
00098   typeof(a->e[0][0]) x,y;
00099   for(i=0;i<3;i++)for(j=0;j<3;j++){
00100       x.real=x.imag=0.0;
00101       for(k=0;k<3;k++){
00102         CMUL( a->e[i][k] , b->e[k][j] , y );
00103         CSUM( x , y );
00104       }
00105       c->e[i][j] = x;
00106     }
00107 }
00108 
00109 template<typename su3_matrix>
00110 void
00111 llfat_mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c )
00112 {
00113   int i,j,k;
00114   typeof(a->e[0][0]) x,y;
00115   for(i=0;i<3;i++)for(j=0;j<3;j++){
00116       x.real=x.imag=0.0;
00117       for(k=0;k<3;k++){
00118         CMULJ_( a->e[k][i] , b->e[k][j], y );
00119         CSUM( x , y );
00120       }
00121       c->e[i][j] = x;
00122     }
00123 }
00124 
00125 
00126 
00127 
00128 
00129 template<typename su3_matrix>
00130 void 
00131 llfat_add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) 
00132 {
00133   int i,j;
00134   for(i=0;i<3;i++)for(j=0;j<3;j++){
00135       CADD( a->e[i][j], b->e[i][j], c->e[i][j] );
00136     }
00137 }
00138 
00139 
00140 
00141 template<typename su3_matrix, typename Real>
00142 void 
00143 llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, 
00144                                su3_matrix* mulink, su3_matrix* sitelink, su3_matrix* fatlink, Real coef,
00145                                int use_staple) 
00146 {
00147   su3_matrix tmat1,tmat2;
00148   int i ;
00149   su3_matrix *fat1;
00150     
00151   /* Upper staple */
00152   /* Computes the staple :
00153    *                mu (B)
00154    *               +-------+
00155    *       nu      |       | 
00156    *         (A)   |       |(C)
00157    *               X       X
00158    *
00159    * Where the mu link can be any su3_matrix. The result is saved in staple.
00160    * if staple==NULL then the result is not saved.
00161    * It also adds the computed staple to the fatlink[mu] with weight coef.
00162    */
00163     
00164   int dx[4];
00165 
00166   /* upper staple */
00167     
00168   for(i=0;i < V;i++){       
00169         
00170     fat1 = fatlink + 4*i + mu;
00171     su3_matrix* A = sitelink + 4*i + nu;
00172         
00173     memset(dx, 0, sizeof(dx));
00174     dx[nu] =1;
00175     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
00176     su3_matrix* B;
00177     if (use_staple){
00178       B = mulink + nbr_idx;
00179     }else{
00180       B = mulink + 4*nbr_idx + mu;
00181     }
00182         
00183     memset(dx, 0, sizeof(dx));
00184     dx[mu] =1;
00185     nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2],dx[1],dx[0]);
00186     su3_matrix* C = sitelink + 4*nbr_idx + nu;
00187         
00188     llfat_mult_su3_nn( A, B,&tmat1);
00189         
00190     if(staple!=NULL){/* Save the staple */
00191       llfat_mult_su3_na( &tmat1, C, &staple[i]);            
00192     } else{ /* No need to save the staple. Add it to the fatlinks */
00193       llfat_mult_su3_na( &tmat1, C, &tmat2);        
00194       llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);           
00195     }
00196   }    
00197   /***************lower staple****************
00198    *
00199    *               X       X
00200    *       nu      |       | 
00201    *         (A)   |       |(C)
00202    *               +-------+
00203    *                mu (B)
00204    *
00205    *********************************************/
00206 
00207   for(i=0;i < V;i++){       
00208         
00209     fat1 = fatlink + 4*i + mu;
00210     memset(dx, 0, sizeof(dx));
00211     dx[nu] = -1;
00212     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);      
00213     if (nbr_idx >= V || nbr_idx <0){
00214       fprintf(stderr, "ERROR: invliad nbr_idx(%d), line=%d\n", nbr_idx, __LINE__);
00215       exit(1);
00216     }
00217     su3_matrix* A = sitelink + 4*nbr_idx + nu;
00218         
00219     su3_matrix* B;
00220     if (use_staple){
00221       B = mulink + nbr_idx;
00222     }else{
00223       B = mulink + 4*nbr_idx + mu;
00224     }
00225         
00226     memset(dx, 0, sizeof(dx));
00227     dx[mu] = 1;
00228     nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2],dx[1],dx[0]);
00229     su3_matrix* C = sitelink + 4*nbr_idx + nu;
00230 
00231     llfat_mult_su3_an( A, B,&tmat1);    
00232     llfat_mult_su3_nn( &tmat1, C,&tmat2);
00233         
00234     if(staple!=NULL){/* Save the staple */
00235       llfat_add_su3_matrix(&staple[i], &tmat2, &staple[i]);
00236       llfat_scalar_mult_add_su3_matrix(fat1, &staple[i], coef, fat1);
00237             
00238     } else{ /* No need to save the staple. Add it to the fatlinks */
00239       llfat_scalar_mult_add_su3_matrix(fat1, &tmat2, coef, fat1);           
00240     }
00241   } 
00242     
00243 } /* compute_gen_staple_site */
00244 
00245 
00246 
00247 /*  Optimized fattening code for the Asq and Asqtad actions.           
00248  *  I assume that: 
00249  *  path 0 is the one link
00250  *  path 2 the 3-staple
00251  *  path 3 the 5-staple 
00252  *  path 4 the 7-staple
00253  *  path 5 the Lapage term.
00254  *  Path 1 is the Naik term
00255  *
00256  */
00257 template <typename su3_matrix, typename Float>
00258 void llfat_cpu(su3_matrix* fatlink, su3_matrix* sitelink, Float* act_path_coeff)
00259 {
00260 
00261   su3_matrix* staple = (su3_matrix *)malloc(V*sizeof(su3_matrix));
00262   if(staple == NULL){
00263     fprintf(stderr, "Error: malloc failed for staple in function %s\n", __FUNCTION__);
00264     exit(1);
00265   }
00266     
00267   su3_matrix* tempmat1 = (su3_matrix *)malloc(V*sizeof(su3_matrix));
00268   if(tempmat1 == NULL){
00269     fprintf(stderr, "ERROR:  malloc failed for tempmat1 in function %s\n", __FUNCTION__);
00270     exit(1);
00271   }
00272     
00273   /* to fix up the Lepage term, included by a trick below */
00274   Float one_link = (act_path_coeff[0] - 6.0*act_path_coeff[5]);
00275     
00276 
00277   for (int dir=XUP; dir<=TUP; dir++){
00278 
00279     /* Intialize fat links with c_1*U_\mu(x) */
00280     for(int i=0;i < V;i ++){
00281       su3_matrix* fat1 = fatlink +  4*i + dir;
00282       llfat_scalar_mult_su3_matrix(sitelink+ 4*i + dir, one_link, fat1 );
00283     }
00284   }
00285 
00286   for (int dir=XUP; dir<=TUP; dir++){
00287     for(int nu=XUP; nu<=TUP; nu++){
00288       if(nu!=dir){
00289         llfat_compute_gen_staple_field(staple,dir,nu,sitelink, sitelink,fatlink, act_path_coeff[2], 0);
00290 
00291         /* The Lepage term */
00292         /* Note this also involves modifying c_1 (above) */
00293 
00294                 
00295         llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,nu,staple,sitelink, fatlink, act_path_coeff[5],1);
00296                 
00297         for(int rho=XUP; rho<=TUP; rho++) {
00298           if((rho!=dir)&&(rho!=nu)){
00299             llfat_compute_gen_staple_field( tempmat1, dir, rho, staple,sitelink,fatlink, act_path_coeff[3], 1);
00300             
00301             for(int sig=XUP; sig<=TUP; sig++){
00302               if((sig!=dir)&&(sig!=nu)&&(sig!=rho)){
00303                 llfat_compute_gen_staple_field((su3_matrix*)NULL,dir,sig,tempmat1,sitelink,fatlink, act_path_coeff[4], 1);
00304               } 
00305             }/* sig */
00306 
00307           } 
00308 
00309         }/* rho */
00310 
00311 
00312       } 
00313 
00314     }/* nu */
00315         
00316   }/* dir */      
00317 
00318   free(staple);
00319   free(tempmat1);
00320 
00321 }
00322 
00323 
00324 void
00325 llfat_reference(void* fatlink, void* sitelink, QudaPrecision prec, void* act_path_coeff)
00326 {
00327   switch(prec){
00328   case QUDA_DOUBLE_PRECISION:{
00329     llfat_cpu((dsu3_matrix*)fatlink, (dsu3_matrix*)sitelink, (double*) act_path_coeff);
00330     break;
00331   }
00332   case QUDA_SINGLE_PRECISION:{
00333     llfat_cpu((fsu3_matrix*)fatlink, (fsu3_matrix*)sitelink, (float*) act_path_coeff);
00334     break;
00335   }
00336   default:
00337     fprintf(stderr, "ERROR: unsupported precision\n");
00338     exit(1);
00339     break;
00340         
00341   }
00342 
00343   return;
00344 
00345 }