QUDA v0.3.2
A library for QCD on GPUs

quda/tests/staggered_dslash_test.cpp

Go to the documentation of this file.
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <string.h>
00005 
00006 #include <quda.h>
00007 #include <quda_internal.h>
00008 #include <dirac_quda.h>
00009 #include <dslash_quda.h>
00010 #include <invert_quda.h>
00011 #include <util_quda.h>
00012 #include <blas_quda.h>
00013 
00014 #include <misc.h>
00015 #include <test_util.h>
00016 #include <staggered_dslash_reference.h>
00017 
00018 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
00019 int test_type = 0;
00020 int device = 0;
00021 
00022 QudaGaugeParam gauge_param;
00023 QudaInvertParam inv_param;
00024 
00025 FullGauge cudaFatLink;
00026 FullGauge cudaLongLink;
00027 
00028 cpuColorSpinorField *spinor, *spinorOut, *spinorRef;
00029 cudaColorSpinorField *cudaSpinor, *cudaSpinorOut;
00030 
00031 cudaColorSpinorField* tmp;
00032 
00033 void *hostGauge[4];
00034 void *fatlink[4], *longlink[4];
00035 
00036 QudaParity parity;
00037 QudaDagType dagger = QUDA_DAG_NO;
00038 int transfer = 0; // include transfer time in the benchmark?
00039 int tdim = 24;
00040 int sdim = 24;
00041 
00042 QudaReconstructType link_recon = QUDA_RECONSTRUCT_12;
00043 QudaPrecision prec = QUDA_SINGLE_PRECISION;
00044 
00045 Dirac* dirac;
00046 
00047 void init()
00048 {    
00049   gauge_param = newQudaGaugeParam();
00050   inv_param = newQudaInvertParam();
00051 
00052   gauge_param.X[0] = sdim;
00053   gauge_param.X[1] = sdim;
00054   gauge_param.X[2] = sdim;
00055   gauge_param.X[3] = tdim;
00056 
00057   setDims(gauge_param.X);
00058 
00059   Vh = sdim*sdim*sdim*tdim/2;
00060 
00061   gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
00062   gauge_param.cuda_prec = prec;
00063   gauge_param.reconstruct = link_recon;
00064   gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
00065   gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
00066     
00067   gauge_param.tadpole_coeff = 0.8;
00068   gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
00069   gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
00070   gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
00071   gauge_param.gaugeGiB = 0;
00072     
00073   inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
00074   inv_param.cuda_prec = prec;
00075   inv_param.dirac_order = QUDA_DIRAC_ORDER;
00076   inv_param.dagger = dagger;
00077   inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
00078   inv_param.dslash_type = QUDA_ASQTAD_DSLASH;
00079     
00080   gauge_param.ga_pad = sdim*sdim*sdim/2;
00081   inv_param.sp_pad = sdim*sdim*sdim/2;
00082 
00083   ColorSpinorParam csParam;
00084   csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
00085   csParam.nColor=3;
00086   csParam.nSpin=1;
00087   csParam.nDim=4;
00088   for(int d = 0; d < 4; d++) {
00089     csParam.x[d] = gauge_param.X[d];
00090   }
00091   csParam.precision = inv_param.cpu_prec;
00092   csParam.pad = 0;
00093   if (test_type < 2) {
00094     inv_param.solution_type = QUDA_MATPC_SOLUTION;
00095     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00096     csParam.x[0] /= 2;
00097   } else {
00098     inv_param.solution_type = QUDA_MAT_SOLUTION;
00099     csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 
00100   }
00101 
00102   csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
00103   csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
00104   csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00105   csParam.create = QUDA_ZERO_FIELD_CREATE;    
00106 
00107   spinor = new cpuColorSpinorField(csParam);
00108   spinorOut = new cpuColorSpinorField(csParam);
00109   spinorRef = new cpuColorSpinorField(csParam);
00110 
00111   csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
00112   csParam.x[0] = gauge_param.X[0];
00113     
00114   printfQuda("Randomizing fields ...\n");
00115     
00116   spinor->Source(QUDA_RANDOM_SOURCE);
00117 
00118   size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
00119     
00120   for (int dir = 0; dir < 4; dir++) {
00121     fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
00122     longlink[dir] = malloc(V*gaugeSiteSize*gSize);
00123   }
00124   if (fatlink == NULL || longlink == NULL){
00125     fprintf(stderr, "ERROR: malloc failed for fatlink/longlink\n");
00126     exit(1);
00127   }
00128     
00129   construct_fat_long_gauge_field(fatlink, longlink, 1, gauge_param.cpu_prec, &gauge_param);
00130 
00131 #if 0
00132   //printf("links are:\n");
00133   //display_link(fatlink[0], 1, gauge_param.cpu_prec);
00134   //display_link(longlink[0], 1, gauge_param.cpu_prec);
00135     
00136   for (int i =0;i < 4 ;i++){
00137     int dir = 2*i;
00138     link_sanity_check(fatlink[i], V, gauge_param.cpu_prec, dir, &gauge_param);
00139     link_sanity_check(longlink[i], V, gauge_param.cpu_prec, dir, &gauge_param);
00140   }
00141 
00142   //printf("spinors are:\n");  
00143   //display_spinor(spinor, 10, inv_param.cpu_prec);
00144 #endif
00145 
00146   initQuda(device);
00147 
00148   gauge_param.type = QUDA_ASQTAD_FAT_LINKS;
00149   gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
00150   loadGaugeQuda(fatlink, &gauge_param);
00151 
00152   gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
00153   gauge_param.reconstruct = gauge_param.reconstruct_sloppy = link_recon;
00154   loadGaugeQuda(longlink, &gauge_param);
00155 
00156   cudaFatLink = cudaFatLinkPrecise;
00157   cudaLongLink = cudaLongLinkPrecise;
00158     
00159   printf("Sending fields to GPU..."); fflush(stdout);
00160     
00161   if (!transfer) {
00162         
00163     csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
00164     csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
00165     csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
00166     csParam.pad = inv_param.sp_pad;
00167     csParam.precision = inv_param.cuda_prec;
00168     if (test_type < 2){
00169       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00170       csParam.x[0] /=2;
00171     }
00172         
00173     printfQuda("Creating cudaSpinor\n");
00174     cudaSpinor = new cudaColorSpinorField(csParam);
00175 
00176     printfQuda("Creating cudaSpinorOut\n");
00177     cudaSpinorOut = new cudaColorSpinorField(csParam);
00178         
00179     printfQuda("Sending spinor field to GPU\n");
00180     *cudaSpinor = *spinor;
00181         
00182     cudaThreadSynchronize();
00183     checkCudaError();
00184         
00185     printf("Source CPU = %f, CUDA=%f\n", norm2(*spinor), norm2(*cudaSpinor));
00186         
00187     if(test_type == 2){
00188       csParam.x[0] /=2;
00189     }
00190     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
00191     tmp = new cudaColorSpinorField(csParam);
00192 
00193     bool pc = (test_type != 2);
00194     DiracParam diracParam;
00195     setDiracParam(diracParam, &inv_param, pc);
00196     diracParam.fatGauge = &cudaFatLinkPrecise;
00197     diracParam.longGauge = &cudaLongLinkPrecise;
00198 
00199     diracParam.verbose = QUDA_VERBOSE;
00200     diracParam.tmp1=tmp;
00201     dirac = Dirac::create(diracParam);
00202         
00203   } else {
00204     printf("ERROR: not suppported\n");
00205   }
00206     
00207   return;
00208 }
00209 
00210 void end(void) 
00211 {
00212   for (int dir = 0; dir < 4; dir++) {
00213     free(fatlink[dir]);
00214     free(longlink[dir]);
00215   }
00216   if (!transfer){
00217     delete dirac;
00218     delete cudaSpinor;
00219     delete cudaSpinorOut;
00220     delete tmp;
00221   }
00222     
00223   delete spinor;
00224   delete spinorOut;
00225   delete spinorRef;
00226     
00227   endQuda();
00228 }
00229 
00230 double dslashCUDA() {
00231     
00232   // execute kernel
00233   const int LOOPS = 1;
00234   printf("Executing %d kernel loops...", LOOPS);
00235   fflush(stdout);
00236   stopwatchStart();
00237   for (int i = 0; i < LOOPS; i++) {
00238     switch (test_type) {
00239     case 0:
00240       parity = QUDA_EVEN_PARITY;
00241       if (transfer){
00242         //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
00243       }
00244       else {
00245         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00246       }    
00247       break;
00248     case 1:
00249       parity = QUDA_ODD_PARITY;
00250       if (transfer){
00251         //MatPCQuda(spinorOdd, spinorEven, &inv_param);
00252       }else {
00253         dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
00254       }
00255       break;
00256     case 2:
00257       if (transfer){
00258         //MatQuda(spinorGPU, spinor, &inv_param);
00259       }
00260       else {
00261         dirac->M(*cudaSpinorOut, *cudaSpinor);
00262       }
00263     }
00264   }
00265     
00266   // check for errors
00267   cudaError_t stat = cudaGetLastError();
00268   if (stat != cudaSuccess)
00269     printf("with ERROR: %s\n", cudaGetErrorString(stat));
00270     
00271   cudaThreadSynchronize();
00272   double secs = stopwatchReadSeconds() / LOOPS;
00273   printf("done.\n\n");
00274     
00275   return secs;
00276 }
00277 
00278 void staggeredDslashRef()
00279 {
00280   int cpu_parity;
00281   // compare to dslash reference implementation
00282   printf("Calculating reference implementation...");
00283   fflush(stdout);
00284   switch (test_type) {
00285   case 0:    
00286     cpu_parity = 0; //EVEN
00287     staggered_dslash(spinorRef->v, fatlink, longlink, spinor->v, cpu_parity, dagger, 
00288                      inv_param.cpu_prec, gauge_param.cpu_prec);
00289     break;
00290   case 1: 
00291     cpu_parity=1; //ODD
00292     staggered_dslash(spinorRef->v, fatlink, longlink, spinor->v, cpu_parity, dagger, 
00293                      inv_param.cpu_prec, gauge_param.cpu_prec);
00294     break;
00295   case 2:
00296     //mat(spinorRef->v, fatlink, longlink, spinor->v, kappa, dagger, 
00297     //inv_param.cpu_prec, gauge_param.cpu_prec);
00298     break;
00299   default:
00300     printf("Test type not defined\n");
00301     exit(-1);
00302   }
00303     
00304   printf("done.\n");
00305     
00306 }
00307 
00308 static void dslashTest() 
00309 {
00310     
00311   init();
00312     
00313   int attempts = 1;
00314 
00315   staggeredDslashRef();
00316     
00317   for (int i=0; i<attempts; i++) {
00318         
00319     double secs = dslashCUDA();
00320     
00321     if (!transfer) {
00322       *spinorOut = *cudaSpinorOut;
00323     }
00324       
00325     printf("\n%fms per loop\n", 1000*secs);
00326         
00327     int flops = dirac->Flops();
00328     int link_floats = 8*gauge_param.packed_size+8*18;
00329     int spinor_floats = 8*6*2 + 6;
00330     int link_float_size = prec;
00331     int spinor_float_size = 0;
00332     
00333     link_floats = test_type ? (2*link_floats) : link_floats;
00334     spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;
00335 
00336     int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
00337     if (prec == QUDA_HALF_PRECISION) {
00338       bytes_for_one_site += (8*2 + 1)*4;        
00339     }
00340     printf("GFLOPS = %f\n", 1.0e-9*flops/secs);
00341     printf("GiB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/(secs*(1<<30)));
00342         
00343     if (!transfer) {
00344       std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 
00345         ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00346     } else {
00347       std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
00348     }
00349         
00350     cpuColorSpinorField::Compare(*spinorRef, *spinorOut);       
00351         
00352     printf("Output spinor:\n");
00353     spinorOut->PrintVector(0);
00354 
00355     printf("Ref spinor:\n");
00356     spinorRef->PrintVector(0);
00357         
00358   }
00359   end();
00360   
00361 }
00362 
00363 
00364 void display_test_info()
00365 {
00366   printf("running the following test:\n");
00367  
00368   printf("prec recon   test_type     dagger   S_dim     T_dimension\n");
00369   printf("%s   %s       %d           %d       %d        %d \n", 
00370          get_prec_str(prec), get_recon_str(link_recon), 
00371          test_type, dagger, sdim, tdim);
00372   return ;
00373     
00374 }
00375 
00376 void usage(char** argv )
00377 {
00378   printf("Usage: %s <args>\n", argv[0]);
00379   printf("--prec <double/single/half> \t Precision in GPU\n"); 
00380   printf("--recon <8/12> \t\t\t Long link reconstruction type\n"); 
00381   printf("--type <0/1/2> \t\t\t Test type\n"); 
00382   printf("--dagger \t\t\t Set the dagger to 1\n"); 
00383   printf("--tdim \t\t\t\t Set T dimention size(default 24)\n");     
00384   printf("--sdim \t\t\t\t Set space dimention size\n"); 
00385   printf("--help \t\t\t\t Print out this message\n"); 
00386   exit(1);
00387   return ;
00388 }
00389 
00390 int main(int argc, char **argv) 
00391 {
00392   int i;
00393   for (i =1;i < argc; i++){
00394         
00395     if( strcmp(argv[i], "--help")== 0){
00396       usage(argv);
00397     }
00398 
00399     if( strcmp(argv[i], "--device") == 0){
00400       if (i+1 >= argc){
00401         usage(argv);
00402       }
00403       device =  atoi(argv[i+1]);
00404       if (device < 0){
00405         fprintf(stderr, "Error: invalid device number(%d)\n", device);
00406         exit(1);
00407       }
00408       i++;
00409       continue;
00410     }
00411         
00412     if( strcmp(argv[i], "--prec") == 0){
00413       if (i+1 >= argc){
00414         usage(argv);
00415       }     
00416       prec =  get_prec(argv[i+1]);
00417       i++;
00418       continue;     
00419     }
00420         
00421         
00422     if( strcmp(argv[i], "--recon") == 0){
00423       if (i+1 >= argc){
00424         usage(argv);
00425       }     
00426       link_recon =  get_recon(argv[i+1]);
00427       i++;
00428       continue;     
00429     }
00430         
00431     if( strcmp(argv[i], "--test") == 0){
00432       if (i+1 >= argc){
00433         usage(argv);
00434       }     
00435       test_type =  atoi(argv[i+1]);
00436       if (test_type < 0 || test_type >= 2){
00437         fprintf(stderr, "Error: invalid test type\n");
00438         exit(1);
00439       }
00440       i++;
00441       continue;     
00442     }
00443 
00444     if( strcmp(argv[i], "--tdim") == 0){
00445       if (i+1 >= argc){
00446         usage(argv);
00447       }     
00448       tdim =  atoi(argv[i+1]);
00449       if (tdim < 0 || tdim > 128){
00450         fprintf(stderr, "Error: invalid t dimention\n");
00451         exit(1);
00452       }
00453       i++;
00454       continue;     
00455     }
00456 
00457     if( strcmp(argv[i], "--sdim") == 0){
00458       if (i+1 >= argc){
00459         usage(argv);
00460       }     
00461       sdim =  atoi(argv[i+1]);
00462       if (sdim < 0 || sdim > 128){
00463         fprintf(stderr, "Error: invalid S dimention\n");
00464         exit(1);
00465       }
00466       i++;
00467       continue;     
00468     }
00469         
00470     if( strcmp(argv[i], "--dagger") == 0){
00471       dagger = QUDA_DAG_YES;
00472       continue;     
00473     }   
00474 
00475     fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
00476     usage(argv);
00477   }
00478 
00479   display_test_info();
00480 
00481   dslashTest();
00482 }
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines