QUDA v0.4.0
A library for QCD on GPUs
quda/tests/wilson_invert_test.cpp
Go to the documentation of this file.
00001 #include <stdlib.h>
00002 #include <stdio.h>
00003 #include <time.h>
00004 #include <math.h>
00005 #include <string.h>
00006 
00007 #include <util_quda.h>
00008 #include <test_util.h>
00009 #include <blas_reference.h>
00010 #include <wilson_dslash_reference.h>
00011 #include "misc.h"
00012 
00013 #include "face_quda.h"
00014 
00015 #ifdef QMP_COMMS
00016 #include <qmp.h>
00017 #endif
00018 
00019 #include <gauge_qio.h>
00020 
00021 #define MAX(a,b) ((a)>(b)?(a):(b))
00022 
00023 // In a typical application, quda.h is the only QUDA header required.
00024 #include <quda.h>
00025 
00026 // Wilson, clover-improved Wilson, and twisted mass are supported.
00027 extern QudaDslashType dslash_type;
00028 extern bool tune;
00029 extern int device;
00030 extern int xdim;
00031 extern int ydim;
00032 extern int zdim;
00033 extern int tdim;
00034 extern int gridsize_from_cmdline[];
00035 extern QudaReconstructType link_recon;
00036 extern QudaPrecision prec;
00037 extern QudaReconstructType link_recon_sloppy;
00038 extern QudaPrecision  prec_sloppy;
00039 
00040 extern char latfile[];
00041 
00042 void
00043 display_test_info()
00044 {
00045   printfQuda("running the following test:\n");
00046     
00047   printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension\n");
00048   printfQuda("%s   %s             %s            %s            %d/%d/%d          %d \n",
00049              get_prec_str(prec),get_prec_str(prec_sloppy),
00050              get_recon_str(link_recon), 
00051              get_recon_str(link_recon_sloppy),  xdim, ydim, zdim, tdim);     
00052 
00053   printfQuda("Grid partition info:     X  Y  Z  T\n"); 
00054   printfQuda("                         %d  %d  %d  %d\n", 
00055              commDimPartitioned(0),
00056              commDimPartitioned(1),
00057              commDimPartitioned(2),
00058              commDimPartitioned(3)); 
00059   
00060   return ;
00061   
00062 }
00063 
00064 extern void usage(char** );
00065 int main(int argc, char **argv)
00066 {
00067   /*
00068   int ndim=4, dims[4] = {1, 1, 1, 1};
00069   char dimchar[] = {'X', 'Y', 'Z', 'T'};
00070   char *gridsizeopt[] = {"--xgridsize", "--ygridsize", "--zgridsize", "--tgridsize"};
00071 
00072   for (int i=1; i<argc; i++) {
00073     for (int d=0; d<ndim; d++) {
00074       if (!strcmp(argv[i], gridsizeopt[d])) {
00075         if (i+1 >= argc) {
00076           printf("Usage: %s <args>\n", argv[0]);
00077           printf("%s\t Set %c comms grid size (default = 1)\n", gridsizeopt[d], dimchar[d]); 
00078           exit(1);
00079         }     
00080         dims[d] = atoi(argv[i+1]);
00081         if (dims[d] <= 0 ) {
00082           printf("Error: Invalid %c grid size\n", dimchar[d]);
00083           exit(1);
00084         }
00085         i++;
00086         break;
00087       }
00088     }
00089   }
00090   */
00091 
00092 
00093 
00094   int i;
00095   for (i =1;i < argc; i++){
00096 
00097     if(process_command_line_option(argc, argv, &i) == 0){
00098       continue;
00099     } 
00100     
00101     printf("ERROR: Invalid option:%s\n", argv[i]);
00102     usage(argv);
00103     
00104 
00105   }
00106 
00107 
00108   if (prec_sloppy == QUDA_INVALID_PRECISION){
00109     prec_sloppy = prec;
00110   }
00111   if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){
00112     link_recon_sloppy = link_recon;
00113   }
00114 
00115 
00116   initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
00117 
00118   // *** QUDA parameters begin here.
00119 
00120   int multi_shift = 0; // whether to test multi-shift or standard solver
00121 
00122   if (dslash_type != QUDA_WILSON_DSLASH &&
00123       dslash_type != QUDA_CLOVER_WILSON_DSLASH &&
00124       dslash_type != QUDA_TWISTED_MASS_DSLASH) {
00125     printf("dslash_type %d not supported\n", dslash_type);
00126     exit(0);
00127   }
00128 
00129   QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
00130   QudaPrecision cuda_prec = prec;
00131   QudaPrecision cuda_prec_sloppy = prec_sloppy;
00132   QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
00133 
00134   // offsets used only by multi-shift solver
00135   int num_offsets = 4;
00136   double offsets[4] = {0.01, 0.02, 0.03, 0.04};
00137 
00138   QudaGaugeParam gauge_param = newQudaGaugeParam();
00139   QudaInvertParam inv_param = newQudaInvertParam();
00140  
00141   gauge_param.X[0] = xdim;
00142   gauge_param.X[1] = ydim;
00143   gauge_param.X[2] = zdim;
00144   gauge_param.X[3] = tdim;
00145 
00146   gauge_param.anisotropy = 1.0;
00147   gauge_param.type = QUDA_WILSON_LINKS;
00148   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00149   gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
00150   
00151   gauge_param.cpu_prec = cpu_prec;
00152   gauge_param.cuda_prec = cuda_prec;
00153   gauge_param.reconstruct = link_recon;
00154   gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
00155   gauge_param.reconstruct_sloppy = link_recon_sloppy;
00156   gauge_param.cuda_prec_precondition = cuda_prec_precondition;
00157   gauge_param.reconstruct_precondition = link_recon_sloppy;
00158   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00159 
00160   inv_param.dslash_type = dslash_type;
00161 
00162   double mass = -0.2180;
00163   inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass));
00164 
00165   if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00166     inv_param.mu = 0.1;
00167     inv_param.twist_flavor = QUDA_TWIST_MINUS;
00168   }
00169 
00170   inv_param.solution_type = QUDA_MATPC_SOLUTION;
00171   inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE;
00172   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00173   inv_param.dagger = QUDA_DAG_NO;
00174   inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
00175 
00176   inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
00177   inv_param.gcrNkrylov = 30;
00178   inv_param.tol = 5e-7;
00179   inv_param.maxiter = 30;
00180   inv_param.reliable_delta = 1e-1; // ignored by multi-shift solver
00181 
00182   // domain decomposition preconditioner parameters
00183   inv_param.inv_type_precondition = QUDA_INVALID_INVERTER;
00184   inv_param.tol_precondition = 1e-1;
00185   inv_param.maxiter_precondition = 10;
00186   inv_param.verbosity_precondition = QUDA_SILENT;
00187   inv_param.prec_precondition = cuda_prec_precondition;
00188   inv_param.omega = 1.0;
00189 
00190   inv_param.cpu_prec = cpu_prec;
00191   inv_param.cuda_prec = cuda_prec;
00192   inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
00193   inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
00194   inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00195   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00196 
00197   inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
00198 
00199   gauge_param.ga_pad = 0; // 24*24*24/2;
00200   inv_param.sp_pad = 0; // 24*24*24/2;
00201   inv_param.cl_pad = 0; // 24*24*24/2;
00202 
00203   // For multi-GPU, ga_pad must be large enough to store a time-slice
00204 #ifdef MULTI_GPU
00205   int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
00206   int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
00207   int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
00208   int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
00209   int pad_size =MAX(x_face_size, y_face_size);
00210   pad_size = MAX(pad_size, z_face_size);
00211   pad_size = MAX(pad_size, t_face_size);
00212   gauge_param.ga_pad = pad_size;    
00213 #endif
00214 
00215   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00216     inv_param.clover_cpu_prec = cpu_prec;
00217     inv_param.clover_cuda_prec = cuda_prec;
00218     inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
00219     inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
00220     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
00221   }
00222 
00223   inv_param.verbosity = QUDA_VERBOSE;
00224 
00225   // *** Everything between here and the call to initQuda() is
00226   // *** application-specific.
00227 
00228   // set parameters for the reference Dslash, and prepare fields to be loaded
00229   setDims(gauge_param.X);
00230 
00231   size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00232   size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00233 
00234   void *gauge[4], *clover_inv=0, *clover=0;
00235 
00236   for (int dir = 0; dir < 4; dir++) {
00237     gauge[dir] = malloc(V*gaugeSiteSize*gSize);
00238   }
00239 
00240   if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
00241     read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
00242     construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param);
00243   } else { // else generate a random SU(3) field
00244     construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);
00245   }
00246 
00247   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
00248     double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
00249     double diag = 1.0; // constant added to the diagonal
00250 
00251     size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00252     clover_inv = malloc(V*cloverSiteSize*cSize);
00253     construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);
00254 
00255     // The uninverted clover term is only needed when solving the unpreconditioned
00256     // system or when using "asymmetric" even/odd preconditioning.
00257     int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
00258                           inv_param.solve_type == QUDA_NORMEQ_PC_SOLVE);
00259     int asymmetric = preconditioned &&
00260                          (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
00261                           inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
00262     if (!preconditioned) {
00263       clover = clover_inv;
00264       clover_inv = NULL;
00265     } else if (asymmetric) { // fake it by using the same random matrix
00266       clover = clover_inv;   // for both clover and clover_inv
00267     } else {
00268       clover = NULL;
00269     }
00270   }
00271 
00272   void *spinorIn = malloc(V*spinorSiteSize*sSize);
00273   void *spinorCheck = malloc(V*spinorSiteSize*sSize);
00274 
00275   void *spinorOut = NULL, **spinorOutMulti = NULL;
00276   if (multi_shift) {
00277     spinorOutMulti = (void**)malloc(num_offsets*sizeof(void *));
00278     for (int i=0; i<num_offsets; i++) {
00279       spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize);
00280     }
00281   } else {
00282     spinorOut = malloc(V*spinorSiteSize*sSize);
00283   }
00284 
00285   // create a point source at 0
00286   if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) *((float*)spinorIn) = 1.0;
00287   else *((double*)spinorIn) = 1.0;
00288 
00289   // start the timer
00290   double time0 = -((double)clock());
00291 
00292   // initialize the QUDA library
00293   initQuda(device);
00294 
00295   // load the gauge field
00296   loadGaugeQuda((void*)gauge, &gauge_param);
00297 
00298   // load the clover term, if desired
00299   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
00300   
00301   // perform the inversion
00302   if (multi_shift) {
00303     double resid_sq;
00304     invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param,
00305                          offsets, num_offsets, &resid_sq);
00306   } else {
00307     invertQuda(spinorOut, spinorIn, &inv_param);
00308   }
00309 
00310   // stop the timer
00311   time0 += clock();
00312   time0 /= CLOCKS_PER_SEC;
00313     
00314   printf("Device memory used:\n   Spinor: %f GiB\n    Gauge: %f GiB\n", 
00315          inv_param.spinorGiB, gauge_param.gaugeGiB);
00316   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) printf("   Clover: %f GiB\n", inv_param.cloverGiB);
00317   printf("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", 
00318          inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);
00319 
00320   if (multi_shift) {
00321 
00322     void *spinorTmp = malloc(V*spinorSiteSize*sSize);
00323 
00324     printf("Host residuum checks: \n");
00325     for(int i=0; i < num_offsets; i++) {
00326       ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00327       
00328       if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00329         tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
00330                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
00331         tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
00332                  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
00333       } else {
00334         wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0,
00335                   inv_param.cpu_prec, gauge_param);
00336         wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
00337                   inv_param.cpu_prec, gauge_param);
00338       }
00339 
00340       axpy(offsets[i], spinorOutMulti[i], spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00341       mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00342       double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00343       double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec);
00344       printf("Shift i=%d Relative residual: requested = %g, actual = %g\n", i, inv_param.tol, sqrt(nrm2/src2));
00345     }
00346     free(spinorTmp);
00347 
00348   } else {
00349     
00350     if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
00351       if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00352         tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
00353                0, inv_param.cpu_prec, gauge_param); 
00354       } else {
00355         wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, 
00356                 inv_param.cpu_prec, gauge_param);
00357       }
00358       if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
00359         ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00360       }
00361     } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {   
00362       if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
00363         tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
00364                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
00365       } else {
00366         wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, 
00367                   inv_param.cpu_prec, gauge_param);
00368       }
00369       if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
00370         ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00371       }
00372     }
00373 
00374     mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00375     double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
00376     double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec);
00377     printf("Relative residual: requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
00378 
00379   }
00380     
00381   freeGaugeQuda();
00382   if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) freeCloverQuda();
00383 
00384   // finalize the QUDA library
00385   endQuda();
00386 
00387   // end if the communications layer
00388   endCommsQuda();
00389 
00390   return 0;
00391 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines