QUDA  0.9.0
dslash_ctest.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <test_util.h>
15 #include <dslash_util.h>
18 #include "misc.h"
19 
20 #include <qio_field.h>
21 // google test frame work
22 #include <gtest.h>
23 
24 #define MAX(a,b) ((a)>(b)?(a):(b))
25 
26 using namespace quda;
27 
28 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
29 const int transfer = 0; // include transfer time in the benchmark?
30 
31 double kappa5;
32 
35 
38 
41 
43 
44 Dirac *dirac = NULL;
45 DiracMobiusPC *dirac_mdwf = NULL; // create the MDWF Dirac operator
46 DiracDomainWall4DPC *dirac_4dpc = NULL; // create the 4d preconditioned DWF Dirac operator
47 
48 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat, 3 = MatPCDagMatPC, 4 = MatDagMat)
49 extern int test_type;
50 
51 // Dirac operator type
53 
54 // Twisted mass flavor type
57 
58 extern int device;
59 extern int xdim;
60 extern int ydim;
61 extern int zdim;
62 extern int tdim;
63 extern int Lsdim;
64 extern int gridsize_from_cmdline[];
65 extern QudaDagType dagger;
67 
68 extern bool compute_clover;
69 extern double clover_coeff;
70 
71 extern bool verify_results;
72 extern int niter;
73 extern char latfile[];
74 
75 extern bool kernel_pack_t;
76 
77 extern double mass; // mass of Dirac operator
78 extern double mu;
79 extern void usage(char**);
80 
82 
83 const char *prec_str[] = {"half", "single", "double"};
84 const char *recon_str[] = {"r18", "r12", "r8"};
85 
86 // For googletest names must be non-empty, unique, and may only contain ASCII
87 // alphanumeric characters or underscore
88 
89 
90 void init(int precision, QudaReconstructType link_recon) {
91 
92  cuda_prec = precision == 2 ? QUDA_DOUBLE_PRECISION : precision == 1 ? QUDA_SINGLE_PRECISION : QUDA_HALF_PRECISION;
93 
96 
97  gauge_param.X[0] = xdim;
98  gauge_param.X[1] = ydim;
99  gauge_param.X[2] = zdim;
100  gauge_param.X[3] = tdim;
101 
103  errorQuda("Asqtad not supported. Please try staggered_dslash_test instead");
104  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
107  // for these we always use kernel packing
109  setKernelPackT(true);
110  } else {
113  Ls = 1;
114  }
115 
116  setSpinorSiteSize(24);
117 
118  gauge_param.anisotropy = 1.0;
119 
123 
130 
131  inv_param.kappa = 0.1;
132 
134  inv_param.epsilon = 0.1;
136  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
138  inv_param.m5 = -1.5;
139  kappa5 = 0.5/(5 + inv_param.m5);
140  } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH ) {
141  inv_param.m5 = -1.5;
142  kappa5 = 0.5/(5 + inv_param.m5);
143  for(int k = 0; k < Lsdim; k++)
144  {
145  // b5[k], c[k] values are chosen for arbitrary values,
146  // but the difference of them are same as 1.0
147  inv_param.b_5[k] = 1.50;
148  inv_param.c_5[k] = 0.50;
149  }
150  }
151 
152  inv_param.mu = mu;
153  inv_param.mass = mass;
155 
159  not_dagger = (QudaDagType)((dagger + 1)%2);
160 
163  errorQuda("Gauge and spinor CPU precisions must match");
164  }
166 
169 
170 #ifndef MULTI_GPU // free parameter for single GPU
171  gauge_param.ga_pad = 0;
172 #else // must be this one c/b face for multi gpu
173  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
174  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
175  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
176  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
177  int pad_size =MAX(x_face_size, y_face_size);
178  pad_size = MAX(pad_size, z_face_size);
179  pad_size = MAX(pad_size, t_face_size);
180  gauge_param.ga_pad = pad_size;
181 #endif
182  inv_param.sp_pad = 0;
183  inv_param.cl_pad = 0;
184 
185  //inv_param.sp_pad = xdim*ydim*zdim/2;
186  //inv_param.cl_pad = 24*24*24;
187 
188  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
190 
192  switch(test_type) {
193  case 0:
194  case 1:
195  case 2:
196  case 3:
198  break;
199  case 4:
201  break;
202  default:
203  errorQuda("Test type %d not defined QUDA_DOMAIN_WALL_4D_DSLASH\n", test_type);
204  }
205  } else if(dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
206  switch(test_type) {
207  case 0:
208  case 1:
209  case 2:
210  case 3:
211  case 4:
213  break;
214  case 5:
216  break;
217  default:
218  errorQuda("Test type %d not defined on QUDA_MOBIUS_DWF_DSLASH\n", test_type);
219  }
220  }
221  else
222  {
223  switch(test_type) {
224  case 0:
225  case 1:
227  break;
228  case 2:
230  break;
231  case 3:
233  break;
234  case 4:
236  break;
237  default:
238  errorQuda("Test type %d not defined\n", test_type);
239  }
240  }
241 
243 
253  }
254 
255  // construct input fields
256  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc((size_t)V*gaugeSiteSize*gauge_param.cpu_prec);
257 
259 
260  csParam.nColor = 3;
261  csParam.nSpin = 4;
262  csParam.nDim = 4;
263  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
267  csParam.nDim = 5;
268  csParam.x[4] = Ls;
269  }
273 } else {
275 }
276 
277 //ndeg_tm
281  csParam.x[4] = inv_param.Ls;
282 }
283 
284 
286 csParam.pad = 0;
287 
290  csParam.x[0] /= 2;
291 } else {
292  if (test_type < 2 || test_type == 3) {
294  csParam.x[0] /= 2;
295  } else {
297  }
298 }
299 
304 
309 
310 csParam.x[0] = gauge_param.X[0];
311 
312 // printfQuda("Randomizing fields... ");
313 
314 
315 //FIXME
316  // if (strcmp(latfile,"")) { // load in the command line supplied gauge field
317  // read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
318  // construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param);
319  // } else { // else generate a random SU(3) field
321  // }
322 
324 
326  double norm = 0.1; // clover components are random numbers in the range (-norm, norm)
327  double diag = 1.0; // constant added to the diagonal
330  }
331 
332  // printfQuda("done.\n"); fflush(stdout);
333 
334 
335 
336  // set verbosity prior to loadGaugeQuda
339 
340  // printfQuda("Sending gauge field to GPU\n");
342 
344  if (compute_clover) printfQuda("Computing clover field on GPU\n");
345  else printfQuda("Sending clover field to GPU\n");
350 
352 
354  }
355 
356  if (!transfer) {
362  } else {
363  /* Single and half */
365  }
366 
369  {
371  csParam.x[0] /= 2;
372  } else
373  {
374  if (test_type < 2 || test_type == 3) {
376  csParam.x[0] /= 2;
377  }
378  }
379 
380  // printfQuda("Creating cudaSpinor\n");
382  // printfQuda("Creating cudaSpinorOut\n");
384 
386 
389  if (test_type == 2 || test_type == 4) csParam.x[0] /= 2;
390 
393 
394  // printfQuda("Sending spinor field to GPU\n");
395  *cudaSpinor = *spinor;
396 
397  // double cpu_norm = blas::norm2(*spinor);
398  // double cuda_norm = blas::norm2(*cudaSpinor);
399  // printfQuda("Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm);
400 
401  bool pc;
404  pc = true;
405  else
406  pc = (test_type != 2 && test_type != 4);
407  DiracParam diracParam;
408  setDiracParam(diracParam, &inv_param, pc);
409  diracParam.tmp1 = tmp1;
410  diracParam.tmp2 = tmp2;
411 
413  dirac_4dpc = new DiracDomainWall4DPC(diracParam);
414  dirac = (Dirac*)dirac_4dpc;
415  }
416  else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH){
417  dirac_mdwf = new DiracMobiusPC(diracParam);
418  dirac = (Dirac*)dirac_mdwf;
419  }
420  else {
421  dirac = Dirac::create(diracParam);
422  }
423  } else {
424  // double cpu_norm = blas::norm2(*spinor);
425  // printfQuda("Source: CPU = %e\n", cpu_norm);
426  }
427 
428 }
429 
430 void end() {
431  if (!transfer) {
432  if(dirac != NULL)
433  {
434  delete dirac;
435  dirac = NULL;
436  }
437  delete cudaSpinor;
438  delete cudaSpinorOut;
439  delete tmp1;
440  delete tmp2;
441  }
442 
443  // release memory
444  delete spinor;
445  delete spinorOut;
446  delete spinorRef;
447  delete spinorTmp;
448 
449  freeGaugeQuda();
450 
451  for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
453  free(hostClover);
455  }
457 
458  }
459 
460  struct DslashTime {
461  double event_time;
462  double cpu_time;
463  double cpu_min;
464  double cpu_max;
465 
466  DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
467  };
468 
469 // execute kernel
471 
472  DslashTime dslash_time;
473  timeval tstart, tstop;
474 
475  cudaEvent_t start, end;
476  cudaEventCreate(&start);
477  cudaEventCreate(&end);
478 
479  comm_barrier();
480  cudaEventRecord(start, 0);
481 
482  for (int i = 0; i < niter; i++) {
483 
484  gettimeofday(&tstart, NULL);
485 
487  switch (test_type) {
488  case 0:
489  if (transfer) {
491  } else {
493  }
494  break;
495  case 1:
496  if (transfer) {
498  } else {
500  }
501  break;
502  case 2:
503  if (transfer) {
505  } else {
507  }
508  break;
509  case 3:
510  if (transfer) {
511  MatQuda(spinorOut->V(), spinor->V(), &inv_param);
512  } else {
514  }
515  break;
516  case 4:
517  if (transfer) {
519  } else {
521  }
522  break;
523  }
524  }
525  else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH){
526  switch (test_type) {
527  case 0:
528  if (transfer) {
530  } else {
532  }
533  break;
534  case 1:
535  if (transfer) {
537  } else {
539  }
540  break;
541  case 2:
542  if (transfer) {
544  } else {
546  }
547  break;
548  case 3:
549  if (transfer) {
551  } else {
553  }
554  break;
555  case 4:
556  if (transfer) {
557  MatQuda(spinorOut->V(), spinor->V(), &inv_param);
558  } else {
560  }
561  break;
562  case 5:
563  if (transfer) {
565  } else {
567  }
568  break;
569  }
570  } else {
571  switch (test_type) {
572  case 0:
574  if (transfer) {
576  } else {
577  if (dagger) {
578  ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2);
580  } else {
582  }
583  }
584  } else {
585  if (transfer) {
587  } else {
589  }
590  }
591  break;
592  case 1:
593  if (transfer) {
594  MatQuda(spinorOut->V(), spinor->V(), &inv_param);
595  } else {
597  }
598  break;
599  case 2:
600  if (transfer) {
601  MatQuda(spinorOut->V(), spinor->V(), &inv_param);
602  } else {
604  }
605  break;
606  case 3:
607  if (transfer) {
609  } else {
611  }
612  break;
613  case 4:
614  if (transfer) {
616  } else {
618  }
619  break;
620  }
621  }
622 
623  gettimeofday(&tstop, NULL);
624  long ds = tstop.tv_sec - tstart.tv_sec;
625  long dus = tstop.tv_usec - tstart.tv_usec;
626  double elapsed = ds + 0.000001*dus;
627 
628  dslash_time.cpu_time += elapsed;
629  // skip first and last iterations since they may skew these metrics if comms are not synchronous
630  if (i>0 && i<niter) {
631  if (elapsed < dslash_time.cpu_min) dslash_time.cpu_min = elapsed;
632  if (elapsed > dslash_time.cpu_max) dslash_time.cpu_max = elapsed;
633  }
634  }
635 
636  cudaEventRecord(end, 0);
637  cudaEventSynchronize(end);
638  float runTime;
639  cudaEventElapsedTime(&runTime, start, end);
640  cudaEventDestroy(start);
641  cudaEventDestroy(end);
642 
643  dslash_time.event_time = runTime / 1000;
644 
645  // check for errors
646  cudaError_t stat = cudaGetLastError();
647  if (stat != cudaSuccess)
648  printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));
649 
650  return dslash_time;
651 }
652 
653 void dslashRef() {
654 
655  // compare to dslash reference implementation
656  // printfQuda("Calculating reference implementation...");
657  fflush(stdout);
658 
660  switch (test_type) {
661  case 0:
663  break;
664  case 1:
667  break;
668  case 2:
670  break;
671  case 3:
676  break;
677  case 4:
680  break;
681  default:
682  printfQuda("Test type not defined\n");
683  exit(-1);
684  }
685  } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
686  switch (test_type) {
687  case 0:
689  break;
690  case 1:
693  break;
694  case 2:
696  break;
697  case 3:
702  break;
703  case 4:
707  break;
708  default:
709  printfQuda("Test type not defined\n");
710  exit(-1);
711  }
712  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
713  switch (test_type) {
714  case 0:
717  else
718  {
719  int tm_offset = 12*spinorRef->Volume();
720 
721  void *ref1 = spinorRef->V();
722  void *ref2 = (char*)ref1 + tm_offset*cpu_prec;
723 
724  void *flv1 = spinor->V();
725  void *flv2 = (char*)flv1 + tm_offset*cpu_prec;
726 
727  tm_ndeg_dslash(ref1, ref2, hostGauge, flv1, flv2, inv_param.kappa, inv_param.mu, inv_param.epsilon,
729  }
730  break;
731  case 1:
734  else
735  {
736  int tm_offset = 12*spinorRef->Volume();
737 
738  void *ref1 = spinorRef->V();
739  void *ref2 = (char*)ref1 + tm_offset*cpu_prec;
740 
741  void *flv1 = spinor->V();
742  void *flv2 = (char*)flv1 + tm_offset*cpu_prec;
743 
745  }
746  break;
747  case 2:
750  else
751  {
752  int tm_offset = 12*spinorRef->Volume();
753 
754  void *evenOut = spinorRef->V();
755  void *oddOut = (char*)evenOut + tm_offset*cpu_prec;
756 
757  void *evenIn = spinor->V();
758  void *oddIn = (char*)evenIn + tm_offset*cpu_prec;
759 
761 }
762 break;
763 case 3:
769 }
770 else
771 {
772  int tm_offset = 12*spinorRef->Volume();
773 
774  void *ref1 = spinorRef->V();
775  void *ref2 = (char*)ref1 + tm_offset*cpu_prec;
776 
777  void *flv1 = spinor->V();
778  void *flv2 = (char*)flv1 + tm_offset*cpu_prec;
779 
780  void *tmp1 = spinorTmp->V();
781  void *tmp2 = (char*)tmp1 + tm_offset*cpu_prec;
782 
785 }
786 break;
787 case 4:
793 }
794 else
795 {
796  int tm_offset = 12*spinorRef->Volume();
797 
798  void *evenOut = spinorRef->V();
799  void *oddOut = (char*)evenOut + tm_offset*cpu_prec;
800 
801  void *evenIn = spinor->V();
802  void *oddIn = (char*)evenIn + tm_offset*cpu_prec;
803 
804  void *evenTmp = spinorTmp->V();
805  void *oddTmp = (char*)evenTmp + tm_offset*cpu_prec;
806 
809 }
810 break;
811 default:
812 printfQuda("Test type not defined\n");
813 exit(-1);
814 }
816  switch (test_type) {
817  case 0:
820  else
821  errorQuda("Not supported\n");
822  break;
823  case 1:
826  else
827  errorQuda("Not supported\n");
828 break;
829 case 2:
832 else
833  errorQuda("Not supported\n");
834 break;
835 case 3:
841 } else
842 errorQuda("Not supported\n");
843 break;
844 case 4:
848 } else
849 errorQuda("Not supported\n");
850 break;
851 default:
852 printfQuda("Test type not defined\n");
853 exit(-1);
854 }
855 } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ){
856  switch (test_type) {
857  case 0:
859  break;
860  case 1:
862  break;
863  case 2:
865  break;
866  case 3:
869  break;
870  case 4:
872  break;
873  default:
874  printf("Test type not supported for domain wall\n");
875  exit(-1);
876  }
878  double *kappa_5 = (double*)malloc(Ls*sizeof(double));
879  for(int xs = 0; xs < Ls ; xs++)
880  kappa_5[xs] = kappa5;
881  switch (test_type) {
882  case 0:
884  break;
885  case 1:
887  break;
888  case 2:
890  break;
891  case 3:
893  break;
894  case 4:
897  break;
898  break;
899  default:
900  printf("Test type not supported for domain wall\n");
901  exit(-1);
902  }
903  free(kappa_5);
904 } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH){
905  double *kappa_b, *kappa_c, *kappa_5, *kappa_mdwf;
906  kappa_b = (double*)malloc(Lsdim*sizeof(double));
907  kappa_c = (double*)malloc(Lsdim*sizeof(double));
908  kappa_5 = (double*)malloc(Lsdim*sizeof(double));
909  kappa_mdwf = (double*)malloc(Lsdim*sizeof(double));
910  for(int xs = 0 ; xs < Lsdim ; xs++)
911  {
912  kappa_b[xs] = 1.0/(2*(inv_param.b_5[xs]*(4.0 + inv_param.m5) + 1.0));
913  kappa_c[xs] = 1.0/(2*(inv_param.c_5[xs]*(4.0 + inv_param.m5) - 1.0));
914  kappa_5[xs] = 0.5*kappa_b[xs]/kappa_c[xs];
915  kappa_mdwf[xs] = -kappa_5[xs];
916  }
917  switch (test_type) {
918  case 0:
920  break;
921  case 1:
923  break;
924  case 2:
926  break;
927  case 3:
929  break;
930  case 4:
932  break;
933  case 5:
936  break;
937  break;
938  default:
939  printf("Test type not supported for domain wall\n");
940  exit(-1);
941  }
942  free(kappa_b);
943  free(kappa_c);
944  free(kappa_5);
945  free(kappa_mdwf);
946 } else {
947  printfQuda("Unsupported dslash_type\n");
948  exit(-1);
949 }
950 
951 // printfQuda("done.\n");
952 }
953 
954 
956 {
957  auto prec = precision == 2 ? QUDA_DOUBLE_PRECISION : precision == 1 ? QUDA_SINGLE_PRECISION : QUDA_HALF_PRECISION;
958  // printfQuda("running the following test:\n");
959 
960  printfQuda("prec recon test_type matpc_type dagger S_dim T_dimension Ls_dimension dslash_type niter\n");
961  printfQuda("%6s %2s %d %12s %d %3d/%3d/%3d %3d %2d %14s %d\n",
965  // printfQuda("Grid partition info: X Y Z T\n");
966  // printfQuda(" %d %d %d %d\n",
967  // dimPartitioned(0),
968  // dimPartitioned(1),
969  // dimPartitioned(2),
970  // dimPartitioned(3));
971 
972  return ;
973 
974 }
975 
976 
977 
978 using ::testing::TestWithParam;
979 using ::testing::Bool;
980 using ::testing::Values;
981 using ::testing::Range;
982 using ::testing::Combine;
983 
984 class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int>> {
985 protected:
986  ::testing::tuple<int, int, int> param;
987 
988 public:
989  virtual ~DslashTest() { }
990  virtual void SetUp() {
991  int prec = ::testing::get<0>(GetParam());
992  QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
993 
994 
995  int value = ::testing::get<2>(GetParam());
996  for(int j=0; j < 4;j++){
997  if (value & (1 << j)){
999  }
1000 
1001  }
1002  updateR();
1003  init(prec, recon);
1004  display_test_info(prec, recon);
1005  }
1006  virtual void TearDown() { end(); }
1007 
1008  static void SetUpTestCase() {
1009  initQuda(device);
1010  }
1011 
1012  // Per-test-case tear-down.
1013  // Called after the last test in this test case.
1014  // Can be omitted if not needed.
1015  static void TearDownTestCase() {
1016  endQuda();
1017  }
1018 
1019 };
1020 
1021 
1022 
1024  dslashRef();
1025 
1026  // printfQuda("Tuning...\n");
1027  dslashCUDA(1); // warm-up run
1028  dslashCUDA(2);
1029 
1030  if (!transfer) *spinorOut = *cudaSpinorOut;
1031 
1032  double norm2_cpu = blas::norm2(*spinorRef);
1033  double norm2_cpu_cuda= blas::norm2(*spinorOut);
1034  if (!transfer) {
1035  double norm2_cuda= blas::norm2(*cudaSpinorOut);
1036  printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
1037  } else {
1038  printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
1039  }
1040  double deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
1041  double tol = (inv_param.cuda_prec == QUDA_DOUBLE_PRECISION ? 1e-12 :
1042  (inv_param.cuda_prec == QUDA_SINGLE_PRECISION ? 1e-3 : 1e-1));
1043  ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
1044 }
1045 
1047 
1048  // printfQuda("Tuning...\n");
1049  dslashCUDA(1); // warm-up run
1050 
1051  if (!transfer) dirac->Flops();
1052  auto dslash_time = dslashCUDA(niter);
1053  printfQuda("%fus per kernel call\n", 1e6*dslash_time.event_time / niter);
1054  //FIXME No flops count for twisted-clover yet
1055  unsigned long long flops = 0;
1056  if (!transfer) flops = dirac->Flops();
1057  double gflops=1.0e-9*flops/dslash_time.event_time;
1058  printfQuda("GFLOPS = %f\n", gflops );
1059  RecordProperty("Gflops", std::to_string(gflops));
1060  RecordProperty("Halo_bidirectitonal_BW_GPU", 1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.event_time);
1061  RecordProperty("Halo_bidirectitonal_BW_CPU", 1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.cpu_time);
1062  RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_max);
1063  RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_min);
1064  RecordProperty("Halo_message_size_bytes",2*cudaSpinor->GhostBytes());
1065 
1066  printfQuda("Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate message size %lu bytes\n",
1067  1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.event_time, 1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.cpu_time,
1068  1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_max, 1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_min,
1069  2*cudaSpinor->GhostBytes());
1070  }
1071 
1072  int main(int argc, char **argv)
1073  {
1074  // initalize google test, includes command line options
1075  ::testing::InitGoogleTest(&argc, argv);
1076  // return code for google test
1077  int test_rc = 0;
1078  for (int i =1;i < argc; i++) {
1079  if(process_command_line_option(argc, argv, &i) == 0){
1080  continue;
1081  }
1082 
1083  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
1084  usage(argv);
1085  }
1086 
1087  initComms(argc, argv, gridsize_from_cmdline);
1088 
1089  test_rc = RUN_ALL_TESTS();
1090 
1091  finalizeComms();
1092  return test_rc;
1093  }
1094 
1095  std::string getdslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int>> param){
1096  const int prec = ::testing::get<0>(param.param);
1097  const int recon = ::testing::get<1>(param.param);
1098  const int part = ::testing::get<2>(param.param);
1099  std::stringstream ss;
1100  // std::cout << "getdslashtestname" << get_dslash_str(dslash_type) << "_" << prec_str[prec] << "_r" << recon << "_partition" << part << std::endl;
1101  // ss << get_dslash_str(dslash_type) << "_";
1102  ss << prec_str[prec];
1103  ss << "_r" << recon;
1104  ss << "_partition" << part;
1105  return ss.str();
1106  }
1107 
1108 
1109 #ifdef MULTI_GPU
1110  INSTANTIATE_TEST_CASE_P(QUDA, DslashTest, Combine( Range(0,3), ::testing::Values(QUDA_RECONSTRUCT_NO,QUDA_RECONSTRUCT_12,QUDA_RECONSTRUCT_8), Range(0,16)),getdslashtestname);
1111 #else
1112  INSTANTIATE_TEST_CASE_P(QUDA, DslashTest, Combine( Range(0,3), ::testing::Values(QUDA_RECONSTRUCT_NO,QUDA_RECONSTRUCT_12,QUDA_RECONSTRUCT_8), ::testing::Values(0) ),getdslashtestname);
1113 #endif
cudaColorSpinorField * tmp2
QudaDiracFieldOrder dirac_order
Definition: quda.h:195
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
double anisotropy
Definition: quda.h:31
const int transfer
int main(int argc, char **argv)
cudaColorSpinorField * tmp1
double mu
Definition: test_util.cpp:1643
double b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:102
void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void endQuda(void)
void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, bool zero_initialize)
void free(void *)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1054
QudaSolveType solve_type
Definition: quda.h:182
int gridsize_from_cmdline[]
Definition: test_util.cpp:50
QudaVerbosity verbosity
enum QudaPrecision_s QudaPrecision
DiracMobiusPC * dirac_mdwf
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
int ga_pad
Definition: quda.h:53
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:167
double mu
Definition: quda.h:105
QudaGaugeFixed gauge_fix
Definition: quda.h:51
const QudaParity parity
cpuColorSpinorField * spinorRef
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
__darwin_time_t tv_sec
int niter
Definition: test_util.cpp:1630
QudaLinkType type
Definition: quda.h:35
int fflush(FILE *)
double kappa
Definition: quda.h:97
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
int xdim
Definition: test_util.cpp:1620
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
QudaDslashType dslash_type
Definition: quda.h:93
cudaEvent_t start
QudaPrecision cuda_prec
Definition: quda.h:191
virtual void TearDown()
double c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:103
#define cloverSiteSize
Definition: test_util.h:8
int return_clover_inverse
Definition: quda.h:217
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
double cpu_min
void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &param)
void commDimPartitionedSet(int dir)
QudaPrecision cpu_prec
Definition: quda.h:190
QudaDagType dagger
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1795
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
void tm_ndeg_mat(void *evenOut, void *oddOut, void **gauge, void *evenIn, void *oddIn, double kappa, double mu, double epsilon, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
QudaMatPCType matpc_type
Definition: test_util.cpp:1652
QudaDagType not_dagger
QudaPrecision precision
Definition: lattice_field.h:54
void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
QudaDagType dagger
Definition: quda.h:184
const char * get_matpc_str(QudaMatPCType type)
Definition: misc.cpp:987
void finalizeComms()
Definition: test_util.cpp:107
void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaGaugeParam gauge_param
INSTANTIATE_TEST_CASE_P(QUDA, DslashTest, Combine(Range(0, 3), ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8), ::testing::Values(0)), getdslashtestname)
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
bool compute_clover
Definition: test_util.cpp:1646
void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:704
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
int return_clover
Definition: quda.h:216
void M(ColorSpinorField &out, const ColorSpinorField &in) const
unsigned long long Flops() const
Definition: dirac_quda.h:148
QudaTwistFlavorType twist_flavor
Definition: test_util.cpp:1649
void dslashRef()
QudaDslashType dslash_type
Definition: test_util.cpp:1626
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaSiteSubset siteSubset
Definition: lattice_field.h:55
void exit(int) __attribute__((noreturn))
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:202
void setDims(int *)
Definition: test_util.cpp:130
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
QudaFieldLocation input_location
Definition: quda.h:90
void freeGaugeQuda(void)
bool verify_results
Definition: test_util.cpp:1641
__darwin_suseconds_t tv_usec
const char * recon_str[]
int Ls
Definition: test_util.cpp:39
QudaGaugeParam param
Definition: pack_test.cpp:17
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
Definition: quda.h:181
else return(__swbuf(_c, _p))
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:50
QudaPrecision clover_cuda_prec
Definition: quda.h:201
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
Dirac * dirac
void initQuda(int device)
void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
double tol
Definition: test_util.cpp:1647
QudaFieldLocation output_location
Definition: quda.h:91
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:203
double benchmark(int kernel, const int niter)
Definition: blas_test.cu:283
virtual void SetUp()
int printf(const char *,...) __attribute__((__format__(__printf__
void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa, bool zero_initialize)
double m5
Definition: quda.h:99
virtual ~DslashTest()
QudaReconstructType link_recon
Definition: test_util.cpp:1612
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity, const double &kappa5) const
QudaVerbosity verbosity
Definition: quda.h:219
void setSpinorSiteSize(int n)
Definition: test_util.cpp:192
ColorSpinorParam csParam
Definition: pack_test.cpp:24
QudaInvertParam newQudaInvertParam(void)
int tdim
Definition: test_util.cpp:1623
void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5, bool zero_initialize)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:770
double event_time
QudaCloverFieldOrder clover_order
Definition: quda.h:205
int V
Definition: test_util.cpp:28
enum QudaMatPCType_s QudaMatPCType
#define gaugeSiteSize
Definition: test_util.h:6
double cpu_time
void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
QudaGammaBasis gamma_basis
Definition: quda.h:197
char latfile[]
Definition: test_util.cpp:1627
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
void Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
const char * get_dslash_str(QudaDslashType type)
Definition: misc.cpp:878
void display_test_info(int precision, QudaReconstructType link_recon)
std::string getdslashtestname(testing::TestParamInfo<::testing::tuple< int, int, int >> param)
void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
Definition: quda.h:43
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
QudaPrecision cuda_prec
double mass
Definition: quda.h:96
double clover_coeff
Definition: test_util.cpp:1645
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
void * memcpy(void *__dst, const void *__src, size_t __n)
void end()
void * hostGauge[4]
void * hostClover
int zdim
Definition: test_util.cpp:1622
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, int dagger, QudaPrecision precision, QudaGaugeParam &param)
int compute_clover_inverse
Definition: quda.h:215
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
::testing::tuple< int, int, int > param
void tm_ndeg_dslash(void *res1, void *res2, void **gauge, void *spinorField1, void *spinorField2, double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param)
void * hostCloverInv
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
Definition: test_util.cpp:1166
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
DslashTime dslashCUDA(int niter)
int Lsdim
Definition: test_util.cpp:1624
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
bool kernel_pack_t
Definition: test_util.cpp:1650
void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
if(err !=cudaSuccess)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
#define printfQuda(...)
Definition: util_quda.h:84
QudaTboundary t_boundary
Definition: quda.h:38
QudaTwistFlavorType twist_flavor
Definition: quda.h:108
void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaTwistFlavorType twistFlavor
void usage(char **)
Definition: test_util.cpp:1693
unsigned long long flops
Definition: blas_quda.cu:42
double cpu_max
void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
enum QudaDslashType_s QudaDslashType
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
ColorSpinorField * tmp2
Definition: dirac_quda.h:41
static void SetUpTestCase()
QudaInvertParam inv_param
void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
static void TearDownTestCase()
double mass
Definition: test_util.cpp:1642
enum QudaVerbosity_s QudaVerbosity
int compute_clover
Definition: quda.h:214
double epsilon
Definition: quda.h:106
void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
cpuColorSpinorField * spinorTmp
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:142
cudaColorSpinorField * cudaSpinorOut
QudaPrecision cpu_prec
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
static __inline__ size_t size_t d
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
cudaColorSpinorField * cudaSpinor
cpuColorSpinorField * spinor
QudaPrecision clover_cpu_prec
Definition: quda.h:200
QudaPrecision prec
Definition: test_util.cpp:1615
void M(ColorSpinorField &out, const ColorSpinorField &in) const
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:72
void init(int precision, QudaReconstructType link_recon)
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:183
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
int test_type
Definition: test_util.cpp:1634
void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
ColorSpinorField * tmp1
Definition: dirac_quda.h:40
const char * prec_str[]
#define MAX(a, b)
double kappa5
QudaPrecision cpu_prec
Definition: quda.h:40
int ydim
Definition: test_util.cpp:1621
void updateR()
update the radius for halos.
QudaGaugeParam newQudaGaugeParam(void)
double clover_coeff
Definition: quda.h:208
void comm_barrier(void)
Definition: comm_mpi.cpp:328
cpuColorSpinorField * spinorOut
enum QudaTwistFlavorType_s QudaTwistFlavorType
DiracDomainWall4DPC * dirac_4dpc
TEST_P(DslashTest, verify)