QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
staggered_dslash_ctest.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <staggered_gauge_utils.h>
19 #include "llfat_reference.h"
20 #include <gauge_field.h>
21 #include <unitarization_links.h>
22 
23 #if defined(QMP_COMMS)
24 #include <qmp.h>
25 #elif defined(MPI_COMMS)
26 #include <mpi.h>
27 #endif
28 
29 #include <qio_field.h>
30 
31 #include <assert.h>
32 #include <gtest/gtest.h>
33 
34 using namespace quda;
35 
36 #define MAX(a,b) ((a)>(b)?(a):(b))
37 #define staggeredSpinorSiteSize 6
38 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
39 extern int test_type;
40 
41 extern void usage(char** argv );
42 
43 // Only load the gauge from a file once.
44 bool gauge_loaded = false;
45 void *qdp_inlink[4] = { nullptr, nullptr, nullptr, nullptr };
46 
49 
52 
55 
57 
58 // In the HISQ case, we include building fat/long links in this unit test
61 
62 // To speed up the unit test, build the CPU field once per partition
63 #ifdef MULTI_GPU
64 void *qdp_fatlink_cpu_backup[16][4];
65 void *qdp_longlink_cpu_backup[16][4];
66 void *qdp_inlink_backup[16][4];
67 #else
70 void *qdp_inlink_backup[1][4];
71 #endif
72 bool global_skip = true; // hack to skip tests
73 
74 
76 extern QudaDagType dagger;
77 extern int xdim;
78 extern int ydim;
79 extern int zdim;
80 extern int tdim;
81 extern int gridsize_from_cmdline[];
82 extern int device;
83 extern bool verify_results;
84 extern int niter;
85 extern double mass; // the mass of the Dirac operator
86 extern double kappa; // will get overriden
87 extern bool compute_fatlong; // build the true fat/long links or use random numbers
89 
90 // extern double tadpole_factor;
91 extern double eps_naik; // relativistic correction for naik term
92 static int n_naiks = 1; // Number of naiks. If eps_naik is 0.0, we only need to construct one naik.
93 
94 extern char latfile[];
95 
96 int X[4];
97 extern int Nsrc; // number of spinors to apply to simultaneously
98 
100 
101 const char *prec_str[] = {"quarter", "half", "single", "double"};
102 const char *recon_str[] = {"r18", "r13", "r9"};
103 
104 // For loading the gauge fields
106 char** argv_copy;
107 
109 {
110  switch (prec) {
111  case QUDA_QUARTER_PRECISION: return 1e-1;
112  case QUDA_HALF_PRECISION: return 1e-3;
113  case QUDA_SINGLE_PRECISION: return 1e-4;
114  case QUDA_DOUBLE_PRECISION: return 1e-11;
115  case QUDA_INVALID_PRECISION: return 1.0;
116  }
117  return 1.0;
118 }
119 
120 void init(int precision, QudaReconstructType link_recon, int partition)
121 {
122  auto prec = getPrecision(precision);
123 
125 
126  gauge_param = newQudaGaugeParam();
127  inv_param = newQudaInvertParam();
128 
129  gauge_param.X[0] = X[0] = xdim;
130  gauge_param.X[1] = X[1] = ydim;
131  gauge_param.X[2] = X[2] = zdim;
132  gauge_param.X[3] = X[3] = tdim;
133 
134  setDims(gauge_param.X);
135  dw_setDims(gauge_param.X, Nsrc); // so we can use 5-d indexing from dwf
137 
138  gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
139  gauge_param.cuda_prec = prec; // Test parameter
140  gauge_param.reconstruct = link_recon; // Test parameter
141  gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
142  gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
143 
144  // ensure that the default is improved staggered
149  }
150 
151  gauge_param.anisotropy = 1.0;
152 
153  // For HISQ, this must always be set to 1.0, since the tadpole
154  // correction is baked into the coefficients for the first fattening.
155  // The tadpole doesn't mean anything for the second fattening
156  // since the input fields are unitarized.
157  gauge_param.tadpole_coeff = 1.0;
159  gauge_param.scale = -1.0 / 24.0;
160  if (eps_naik != 0) { gauge_param.scale *= (1.0 + eps_naik); }
161  } else {
162  gauge_param.scale = 1.0;
163  }
164  gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER;
165  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
167  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
168  gauge_param.type = QUDA_WILSON_LINKS;
169 
170  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
171  inv_param.cuda_prec = prec; // Test parameter
172  inv_param.dirac_order = QUDA_DIRAC_ORDER;
174  inv_param.dagger = dagger;
175  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
176  inv_param.dslash_type = dslash_type;
177  inv_param.mass = mass;
178  inv_param.kappa = kappa = 1.0/(8.0+mass); // for laplace
180  inv_param.dslash_type = dslash_type;
181 
184 
185  int tmpint = MAX(X[1] * X[2] * X[3], X[0] * X[2] * X[3]);
186  tmpint = MAX(tmpint, X[0] * X[1] * X[3]);
187  tmpint = MAX(tmpint, X[0] * X[1] * X[2]);
188 
189  gauge_param.ga_pad = tmpint;
190  inv_param.sp_pad = tmpint;
191 
192  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
193 
194  // Allocate a lot of memory because I'm very confused
195  void* milc_fatlink_cpu = malloc(4*V*gaugeSiteSize*gSize);
196  void* milc_longlink_cpu = malloc(4*V*gaugeSiteSize*gSize);
197 
198  void* milc_fatlink_gpu = malloc(4*V*gaugeSiteSize*gSize);
199  void* milc_longlink_gpu = malloc(4*V*gaugeSiteSize*gSize);
200 
201  void* qdp_fatlink_gpu[4];
202  void* qdp_longlink_gpu[4];
203 
204  for (int dir = 0; dir < 4; dir++) {
205  qdp_fatlink_gpu[dir] = malloc(V*gaugeSiteSize*gSize);
206  qdp_longlink_gpu[dir] = malloc(V*gaugeSiteSize*gSize);
207 
208  qdp_fatlink_cpu[dir] = malloc(V*gaugeSiteSize*gSize);
209  qdp_longlink_cpu[dir] = malloc(V*gaugeSiteSize*gSize);
210 
211  if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL ||
212  qdp_fatlink_cpu[dir] == NULL || qdp_longlink_cpu[dir] == NULL) {
213  errorQuda("ERROR: malloc failed for fatlink/longlink");
214  }
215  }
216 
217  // create a base field
218  for (int dir = 0; dir < 4; dir++) {
219  if (qdp_inlink[dir] == nullptr) {
220  qdp_inlink[dir] = malloc(V*gaugeSiteSize*gSize);
221  }
222  }
223 
224  // load a field WITHOUT PHASES
225  if (strcmp(latfile,"")) {
226  if (!gauge_loaded) {
227  read_gauge_field(latfile, qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc_copy, argv_copy);
230  }
231  gauge_loaded = true;
232  } // else it's already been loaded
233  } else {
235  construct_gauge_field(qdp_inlink, 1, gauge_param.cpu_prec, &gauge_param);
236  } else {
239  }
240  }
241 
242  // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
243  // "compute" the fat/long links or not.
245  for (int dir = 0; dir < 4; dir++) {
246  memcpy(qdp_fatlink_gpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
247  memcpy(qdp_fatlink_cpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
248  memset(qdp_longlink_gpu[dir],0,V*gaugeSiteSize*gSize);
249  memset(qdp_longlink_cpu[dir],0,V*gaugeSiteSize*gSize);
250  }
251  } else { // QUDA_ASQTAD_DSLASH
252 
253  if (compute_fatlong) {
254  computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink,
255  gauge_param, gSize, n_naiks, eps_naik);
256  } else { //
257 
258  for (int dir = 0; dir < 4; dir++) {
259  memcpy(qdp_fatlink_gpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
260  memcpy(qdp_fatlink_cpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
261  memcpy(qdp_longlink_gpu[dir],qdp_longlink_cpu[dir],V*gaugeSiteSize*gSize);
262  }
263  }
264  }
265 
266  // Alright, we've created all the void** links.
267  // Create the void* pointers
268  reorderQDPtoMILC(milc_fatlink_gpu, qdp_fatlink_gpu, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
269  reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
270  reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
271  reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gaugeSiteSize, gauge_param.cpu_prec, gauge_param.cpu_prec);
272  // Create ghost zones for CPU fields,
273  // prepare and load the GPU fields
274 
275 #ifdef MULTI_GPU
276 
278  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
279  GaugeFieldParam cpuFatParam(milc_fatlink_cpu, gauge_param);
281  cpuFat = new cpuGaugeField(cpuFatParam);
282  ghost_fatlink_cpu = cpuFat->Ghost();
283 
284  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
285  GaugeFieldParam cpuLongParam(milc_longlink_cpu, gauge_param);
286  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
287  cpuLong = new cpuGaugeField(cpuLongParam);
288  ghost_longlink_cpu = cpuLong->Ghost();
289 
290  int x_face_size = X[1]*X[2]*X[3]/2;
291  int y_face_size = X[0]*X[2]*X[3]/2;
292  int z_face_size = X[0]*X[1]*X[3]/2;
293  int t_face_size = X[0]*X[1]*X[2]/2;
294  int pad_size = MAX(x_face_size, y_face_size);
295  pad_size = MAX(pad_size, z_face_size);
296  pad_size = MAX(pad_size, t_face_size);
297  gauge_param.ga_pad = pad_size;
298 #endif
299 
302  gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
305  } else {
306  gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
307  }
308 
309  loadGaugeQuda(milc_fatlink_gpu, &gauge_param);
310 
311  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
312 
313 #ifdef MULTI_GPU
314  gauge_param.ga_pad = 3 * pad_size;
315 #endif
316 
319  gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
322 
323  loadGaugeQuda(milc_longlink_gpu, &gauge_param);
324  }
325 
327  csParam.nColor = 3;
328  csParam.nSpin = 1;
329  csParam.nDim = 5;
330  for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
331  csParam.x[4] = Nsrc; // number of sources becomes the fifth dimension
332 
333  csParam.setPrecision(inv_param.cpu_prec);
334  inv_param.solution_type = QUDA_MAT_SOLUTION;
335  csParam.pad = 0;
336  if (test_type < 2 && dslash_type != QUDA_LAPLACE_DSLASH) {
338  csParam.x[0] /= 2;
339  } else {
341  }
342 
345  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
346  csParam.create = QUDA_ZERO_FIELD_CREATE;
347 
348  spinor = new cpuColorSpinorField(csParam);
349  spinorOut = new cpuColorSpinorField(csParam);
350  spinorRef = new cpuColorSpinorField(csParam);
351  tmpCpu = new cpuColorSpinorField(csParam);
352 
353  // printfQuda("Randomizing fields ...\n");
354 
355  spinor->Source(QUDA_RANDOM_SOURCE);
356 
358  csParam.pad = inv_param.sp_pad;
359  csParam.setPrecision(inv_param.cuda_prec);
360 
361  // printfQuda("Creating cudaSpinor\n");
362  cudaSpinor = new cudaColorSpinorField(csParam);
363 
364  // printfQuda("Creating cudaSpinorOut\n");
365  cudaSpinorOut = new cudaColorSpinorField(csParam);
366 
367  // printfQuda("Sending spinor field to GPU\n");
368  *cudaSpinor = *spinor;
369 
370  cudaDeviceSynchronize();
371  checkCudaError();
372 
373  tmp = new cudaColorSpinorField(csParam);
374 
375  bool pc = (test_type == 1); // For test_type 0, can use either pc or not pc
376  // because both call the same "Dslash" directly.
377  DiracParam diracParam;
378  setDiracParam(diracParam, &inv_param, pc);
379 
380  diracParam.tmp1 = tmp;
381 
382  dirac = Dirac::create(diracParam);
383 
384  for (int dir = 0; dir < 4; dir++) {
385  free(qdp_fatlink_gpu[dir]); qdp_fatlink_gpu[dir] = nullptr;
386  free(qdp_longlink_gpu[dir]); qdp_longlink_gpu[dir] = nullptr;
387  }
388  free(milc_fatlink_gpu); milc_fatlink_gpu = nullptr;
389  free(milc_longlink_gpu); milc_longlink_gpu = nullptr;
390  free(milc_fatlink_cpu); milc_fatlink_cpu = nullptr;
391  free(milc_longlink_cpu); milc_longlink_cpu = nullptr;
392 
393  gauge_param.reconstruct = link_recon;
394 
395  return;
396 }
397 
398 void end(void)
399 {
400  for (int dir = 0; dir < 4; dir++) {
401  if (qdp_fatlink_cpu[dir] != nullptr) { free(qdp_fatlink_cpu[dir]); qdp_fatlink_cpu[dir] = nullptr; }
402  if (qdp_longlink_cpu[dir] != nullptr) { free(qdp_longlink_cpu[dir]); qdp_longlink_cpu[dir] = nullptr; }
403  }
404 
405  if (dirac != nullptr) {
406  delete dirac;
407  dirac = nullptr;
408  }
409  if (cudaSpinor != nullptr) {
410  delete cudaSpinor;
411  cudaSpinor = nullptr;
412  }
413  if (cudaSpinorOut != nullptr) {
414  delete cudaSpinorOut;
415  cudaSpinorOut = nullptr;
416  }
417  if (tmp != nullptr) {
418  delete tmp;
419  tmp = nullptr;
420  }
421 
422  if (spinor != nullptr) { delete spinor; spinor = nullptr; }
423  if (spinorOut != nullptr) { delete spinorOut; spinorOut = nullptr; }
424  if (spinorRef != nullptr) { delete spinorRef; spinorRef = nullptr; }
425  if (tmpCpu != nullptr) { delete tmpCpu; tmpCpu = nullptr; }
426 
427  freeGaugeQuda();
428 
429  if (cpuFat) { delete cpuFat; cpuFat = nullptr; }
430  if (cpuLong) { delete cpuLong; cpuLong = nullptr; }
432 }
433 
434 struct DslashTime {
435  double event_time;
436  double cpu_time;
437  double cpu_min;
438  double cpu_max;
439 
440  DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
441 };
442 
444 
445  DslashTime dslash_time;
446  timeval tstart, tstop;
447 
448  cudaEvent_t start, end;
449  cudaEventCreate(&start);
450  cudaEventRecord(start, 0);
451  cudaEventSynchronize(start);
452 
453  comm_barrier();
454  cudaEventRecord(start, 0);
455 
456  for (int i = 0; i < niter; i++) {
457 
458  gettimeofday(&tstart, NULL);
459 
460  switch (test_type) {
461  case 0: dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); break;
462  case 1: dirac->M(*cudaSpinorOut, *cudaSpinor); break;
463  case 2: dirac->M(*cudaSpinorOut, *cudaSpinor); break;
464  }
465 
466  gettimeofday(&tstop, NULL);
467  long ds = tstop.tv_sec - tstart.tv_sec;
468  long dus = tstop.tv_usec - tstart.tv_usec;
469  double elapsed = ds + 0.000001*dus;
470 
471  dslash_time.cpu_time += elapsed;
472  // skip first and last iterations since they may skew these metrics if comms are not synchronous
473  if (i>0 && i<niter) {
474  if (elapsed < dslash_time.cpu_min) dslash_time.cpu_min = elapsed;
475  if (elapsed > dslash_time.cpu_max) dslash_time.cpu_max = elapsed;
476  }
477  }
478 
479  cudaEventCreate(&end);
480  cudaEventRecord(end, 0);
481  cudaEventSynchronize(end);
482  float runTime;
483  cudaEventElapsedTime(&runTime, start, end);
484  cudaEventDestroy(start);
485  cudaEventDestroy(end);
486 
487  dslash_time.event_time = runTime / 1000;
488 
489  // check for errors
490  cudaError_t stat = cudaGetLastError();
491  if (stat != cudaSuccess)
492  errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
493 
494  return dslash_time;
495 }
496 
498 {
499 
500  // compare to dslash reference implementation
501  // printfQuda("Calculating reference implementation...");
502  fflush(stdout);
503  switch (test_type) {
504  case 0:
506  parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
507  break;
508  case 1:
510  inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type);
511  break;
512  case 2:
513  // The !dagger is to compensate for the convention of actually
514  // applying -D_eo and -D_oe.
515  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Even()), qdp_fatlink_cpu, qdp_longlink_cpu,
516  ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Odd()),
517  QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
518  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Odd()), qdp_fatlink_cpu, qdp_longlink_cpu,
519  ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Even()),
520  QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
522  xpay(spinor->V(), kappa, spinorRef->V(), spinor->Length(), gauge_param.cpu_prec);
523  } else {
524  axpy(2 * mass, spinor->V(), spinorRef->V(), spinor->Length(), gauge_param.cpu_prec);
525  }
526  break;
527  default:
528  errorQuda("Test type not defined");
529  }
530 
531 }
532 
534 {
535  auto prec = precision == 2 ? QUDA_DOUBLE_PRECISION : precision == 1 ? QUDA_SINGLE_PRECISION : QUDA_HALF_PRECISION;
536 
537  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
538  printfQuda("%s %s %d %d %d/%d/%d %d \n", get_prec_str(prec), get_recon_str(link_recon),
540  return ;
541 
542 }
543 
544 using ::testing::TestWithParam;
545 using ::testing::Bool;
546 using ::testing::Values;
547 using ::testing::Range;
548 using ::testing::Combine;
549 
550 
551 void usage_extra(char** argv )
552 {
553  printfQuda("Extra options:\n");
554  printfQuda(" --test <0/1> # Test method\n");
555  printfQuda(" 0: Even destination spinor\n");
556  printfQuda(" 1: Odd destination spinor\n");
557  return ;
558 }
559 
560 using ::testing::TestWithParam;
561 using ::testing::Bool;
562 using ::testing::Values;
563 using ::testing::Range;
564 using ::testing::Combine;
565 
566 class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int>> {
567 protected:
568  ::testing::tuple<int, int, int> param;
569 
570  bool skip()
571  {
572  QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
573 
574  if ((QUDA_PRECISION & getPrecision(::testing::get<0>(GetParam()))) == 0
575  || (QUDA_RECONSTRUCT & getReconstructNibble(recon)) == 0) {
576  return true;
577  }
578 
580  && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
581  warningQuda("Fixed precision unsupported in fat/long compute, skipping...");
582  return true;
583  }
584 
586  warningQuda("Reconstruct 9 unsupported in fat/long compute, skipping...");
587  return true;
588  }
589 
590  if (dslash_type == QUDA_LAPLACE_DSLASH && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
591  warningQuda("Fixed precision unsupported for Laplace operator, skipping...");
592  return true;
593  }
594  return false;
595  }
596 
597 public:
598  virtual ~StaggeredDslashTest() { }
599  virtual void SetUp() {
600  int prec = ::testing::get<0>(GetParam());
601  QudaReconstructType recon = static_cast<QudaReconstructType>(::testing::get<1>(GetParam()));
602 
603  if (skip()) GTEST_SKIP();
604 
605  int value = ::testing::get<2>(GetParam());
606  for(int j=0; j < 4;j++){
607  if (value & (1 << j)){
609  }
610 
611  }
612  updateR();
613 
614  for (int dir = 0; dir < 4; dir++) {
615  qdp_fatlink_cpu[dir] = nullptr;
616  qdp_longlink_cpu[dir] = nullptr;
617  }
618 
619  dirac = nullptr;
620  cudaSpinor = nullptr;
621  cudaSpinorOut = nullptr;
622  tmp = nullptr;
623 
624  spinor = nullptr;
625  spinorOut = nullptr;
626  spinorRef = nullptr;
627  tmpCpu = nullptr;
628 
629  init(prec, recon, value);
630  display_test_info(prec, recon);
631  }
632 
633  virtual void TearDown()
634  {
635  if (skip()) GTEST_SKIP();
636  end();
637  }
638 
639  static void SetUpTestCase() { initQuda(device); }
640 
641  // Per-test-case tear-down.
642  // Called after the last test in this test case.
643  // Can be omitted if not needed.
644  static void TearDownTestCase() { endQuda(); }
645 };
646 
648  double deviation = 1.0;
649  double tol = getTolerance(inv_param.cuda_prec);
650 
651  bool failed = false; // for the nan catch
652 
653  // check for skip_kernel
654  if (spinorRef != nullptr) {
655 
656  { // warm-up run
657  // printfQuda("Tuning...\n");
658  dslashCUDA(1);
659  }
660 
661  dslashCUDA(2);
662 
663  *spinorOut = *cudaSpinorOut;
664 
666 
667  double spinor_ref_norm2 = blas::norm2(*spinorRef);
668  double spinor_out_norm2 = blas::norm2(*spinorOut);
669 
670  // for verification
671  // printfQuda("\n\nCUDA: %f\n\n", ((double*)(spinorOut->V()))[0]);
672  // printfQuda("\n\nCPU: %f\n\n", ((double*)(spinorRef->V()))[0]);
673 
674  // Catching nans is weird.
675  if (std::isnan(spinor_ref_norm2)) { failed = true; }
676  if (std::isnan(spinor_out_norm2)) { failed = true; }
677 
678  double cuda_spinor_out_norm2 = blas::norm2(*cudaSpinorOut);
679  printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
680  deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
681  if (failed) { deviation = 1.0; }
682  }
683  ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
684  }
685 
687 
688  { // warm-up run
689  // printfQuda("Tuning...\n");
690  dslashCUDA(1);
691  }
692 
693  // reset flop counter
694  dirac->Flops();
695 
696  DslashTime dslash_time = dslashCUDA(niter);
697 
698  *spinorOut = *cudaSpinorOut;
699 
700  printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
701 
702  unsigned long long flops = dirac->Flops();
703  double gflops = 1.0e-9 * flops / dslash_time.event_time;
704  printfQuda("GFLOPS = %f\n", gflops);
705  RecordProperty("Gflops", std::to_string(gflops));
706 
707  RecordProperty("Halo_bidirectitonal_BW_GPU", 1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.event_time);
708  RecordProperty("Halo_bidirectitonal_BW_CPU", 1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.cpu_time);
709  RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_max);
710  RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_min);
711  RecordProperty("Halo_message_size_bytes", 2 * cudaSpinor->GhostBytes());
712 
713  printfQuda("Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate "
714  "message size %lu bytes\n",
715  1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.event_time,
716  1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.cpu_time,
717  1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_max,
718  1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_min, 2 * cudaSpinor->GhostBytes());
719 }
720 
721  int main(int argc, char **argv)
722  {
723  // hack for loading gauge fields
724  argc_copy = argc;
725  argv_copy = argv;
726 
727  // initialize CPU field backup
728  int pmax = 1;
729 #ifdef MULTI_GPU
730  pmax = 16;
731 #endif
732  for (int p = 0; p < pmax; p++) {
733  for (int d = 0; d < 4; d++) {
734  qdp_fatlink_cpu_backup[p][d] = nullptr;
735  qdp_longlink_cpu_backup[p][d] = nullptr;
736  qdp_inlink_backup[p][d] = nullptr;
737  }
738  }
739 
740  // initalize google test
741  ::testing::InitGoogleTest(&argc, argv);
742  for (int i = 1; i < argc; i++) {
743 
744  if (process_command_line_option(argc, argv, &i) == 0) { continue; }
745 
746  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
747  usage(argv);
748  }
749 
750  initComms(argc, argv, gridsize_from_cmdline);
751 
752  // Ensure that the default is improved staggered
756  warningQuda("The dslash_type %d isn't staggered, asqtad, or laplace. Defaulting to asqtad.\n", dslash_type);
758  }
759 
760  // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash, and don't
761  // ask to build the fat/long links... it doesn't make sense.
762  if (strcmp(latfile,"") && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) {
763  errorQuda("Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n");
764  compute_fatlong = true;
765  }
766 
767  // Set n_naiks to 2 if eps_naik != 0.0
769  if (eps_naik != 0.0) {
770  if (compute_fatlong) {
771  n_naiks = 2;
772  printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
773  } else {
774  eps_naik = 0.0;
775  printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
776  }
777  } else {
778  printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
779  }
780  }
781 
783  if (test_type != 2) { errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type); }
784  }
785 
786  // return result of RUN_ALL_TESTS
787  int test_rc = RUN_ALL_TESTS();
788 
789  // Clean up loaded gauge field
790  for (int dir = 0; dir < 4; dir++) {
791  if (qdp_inlink[dir] != nullptr) { free(qdp_inlink[dir]); qdp_inlink[dir] = nullptr; }
792  }
793 
794  // Clean up per-partition backup
795  for (int p = 0; p < pmax; p++) {
796  for (int d = 0; d < 4; d++) {
797  if (qdp_inlink_backup[p][d] != nullptr) { free(qdp_inlink_backup[p][d]); qdp_inlink_backup[p][d] = nullptr; }
798  if (qdp_fatlink_cpu_backup[p][d] != nullptr) {
799  free(qdp_fatlink_cpu_backup[p][d]);
800  qdp_fatlink_cpu_backup[p][d] = nullptr;
801  }
802  if (qdp_longlink_cpu_backup[p][d] != nullptr) {
803  free(qdp_longlink_cpu_backup[p][d]);
804  qdp_longlink_cpu_backup[p][d] = nullptr;
805  }
806  }
807  }
808 
809  finalizeComms();
810 
811  return test_rc;
812  }
813 
814  std::string getstaggereddslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int>> param){
815  const int prec = ::testing::get<0>(param.param);
816  const int recon = ::testing::get<1>(param.param);
817  const int part = ::testing::get<2>(param.param);
818  std::stringstream ss;
819  // ss << get_dslash_str(dslash_type) << "_";
820  ss << prec_str[prec];
821  ss << "_r" << recon;
822  ss << "_partition" << part;
823  return ss.str();
824  }
825 
826 #ifdef MULTI_GPU
828  Combine(Range(0, 4),
830  Range(0, 16)),
832 #else
834  Combine(Range(0, 4),
836  ::testing::Values(0)),
838 #endif
int Nsrc
Definition: test_util.cpp:1627
void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik)
void init(int precision, QudaReconstructType link_recon, int partition)
static size_t gSize
QudaDiracFieldOrder dirac_order
Definition: quda.h:219
QudaDagType dagger
Definition: test_util.cpp:1620
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:747
QudaMassNormalization mass_normalization
Definition: quda.h:208
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53
double anisotropy
Definition: quda.h:38
void usage_extra(char **argv)
cpuColorSpinorField * spinorOut
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void endQuda(void)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1047
int getReconstructNibble(QudaReconstructType recon)
Definition: test_util.h:140
enum QudaPrecision_s QudaPrecision
int ga_pad
Definition: quda.h:63
void * qdp_inlink_backup[1][4]
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:187
QudaGaugeFixed gauge_fix
Definition: quda.h:61
INSTANTIATE_TEST_SUITE_P(QUDA, StaggeredDslashTest, Combine(Range(0, 4), ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8), ::testing::Values(0)), getstaggereddslashtestname)
QudaParity parity
cpuColorSpinorField * tmpCpu
QudaInvertParam inv_param
QudaLinkType type
Definition: quda.h:42
double kappa
Definition: quda.h:106
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
QudaDslashType dslash_type
Definition: quda.h:102
QudaPrecision cuda_prec
Definition: quda.h:214
int ydim
Definition: test_util.cpp:1616
QudaDslashType dslash_type
Definition: test_util.cpp:1621
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
double cpu_min
void commDimPartitionedSet(int dir)
QudaPrecision cpu_prec
Definition: quda.h:213
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
const ColorSpinorField & Even() const
int xdim
Definition: test_util.cpp:1615
const ColorSpinorField & Odd() const
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71
std::string getstaggereddslashtestname(testing::TestParamInfo<::testing::tuple< int, int, int >> param)
QudaDagType dagger
Definition: quda.h:207
const char * recon_str[]
double mass
Definition: test_util.cpp:1646
void finalizeComms()
Definition: test_util.cpp:128
double getTolerance(QudaPrecision prec)
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43
static int n_naiks
void * qdp_longlink_cpu_backup[1][4]
DslashTime dslashCUDA(int niter)
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
unsigned long long Flops() const
Definition: dirac_quda.h:177
char latfile[]
Definition: test_util.cpp:1623
cudaColorSpinorField * cudaSpinor
void * qdp_fatlink_cpu[4]
QudaSiteSubset siteSubset
Definition: lattice_field.h:71
void setDims(int *)
Definition: test_util.cpp:151
QudaFieldLocation input_location
Definition: quda.h:99
void freeGaugeQuda(void)
cpuColorSpinorField * spinor
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
bool verify_results
Definition: test_util.cpp:1643
QudaGaugeParam param
Definition: pack_test.cpp:17
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
Definition: quda.h:204
void end(void)
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
double scale
Definition: quda.h:40
void initQuda(int device)
double tol
Definition: test_util.cpp:1656
QudaPrecision getPrecision(int i)
Definition: test_util.h:129
QudaFieldLocation output_location
Definition: quda.h:100
double benchmark(int kernel, const int niter)
Definition: blas_test.cu:303
double kappa
Definition: test_util.cpp:1647
void usage(char **argv)
Definition: test_util.cpp:1783
QudaReconstructType link_recon
Definition: test_util.cpp:1605
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
void setSpinorSiteSize(int n)
Definition: test_util.cpp:211
ColorSpinorParam csParam
Definition: pack_test.cpp:24
QudaInvertParam newQudaInvertParam(void)
int test_type
Definition: test_util.cpp:1636
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
double event_time
double cpu_time
#define warningQuda(...)
Definition: util_quda.h:133
::testing::tuple< int, int, int > param
char ** argv_copy
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaGammaBasis gamma_basis
Definition: quda.h:221
void staggered_dslash(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52
const void ** Ghost() const
Definition: gauge_field.h:323
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int X[4]
Definition: quda.h:36
void staggeredDslashRef()
double mass
Definition: quda.h:105
int V
Definition: test_util.cpp:27
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void * memset(void *s, int c, size_t n)
cpuGaugeField * cpuLong
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1062
int device
Definition: test_util.cpp:1602
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
const char * prec_str[]
int X[4]
double tadpole_coeff
Definition: quda.h:39
Dirac * dirac
double eps_naik
Definition: test_util.cpp:1652
void display_test_info(int precision, QudaReconstructType link_recon)
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
void * qdp_inlink[4]
bool global_skip
#define MAX(a, b)
void ** ghost_fatlink_cpu
cpuGaugeField * cpuFat
#define printfQuda(...)
Definition: util_quda.h:115
QudaTboundary t_boundary
Definition: quda.h:45
unsigned long long flops
Definition: blas_quda.cu:22
double cpu_max
int main(int argc, char **argv)
void ** ghost_longlink_cpu
enum QudaDslashType_s QudaDslashType
bool compute_fatlong
Definition: test_util.cpp:1655
TEST_P(StaggeredDslashTest, verify)
cudaColorSpinorField * cudaSpinorOut
int tdim
Definition: test_util.cpp:1618
bool gauge_loaded
void * qdp_longlink_cpu[4]
int zdim
Definition: test_util.cpp:1617
__device__ void axpy(real a, const real *x, Link &y)
#define checkCudaError()
Definition: util_quda.h:161
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:159
cpuColorSpinorField * spinorRef
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
cudaColorSpinorField * tmp
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
Definition: qio_field.h:14
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
QudaPrecision prec
Definition: test_util.cpp:1608
int niter
Definition: test_util.cpp:1629
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
QudaMatPCType matpc_type
Definition: quda.h:206
ColorSpinorField * tmp1
Definition: dirac_quda.h:41
QudaGaugeParam gauge_param
QudaPrecision cpu_prec
Definition: quda.h:47
void updateR()
update the radius for halos.
#define gaugeSiteSize
Definition: face_gauge.cpp:34
QudaGaugeParam newQudaGaugeParam(void)
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
void * qdp_fatlink_cpu_backup[1][4]
void comm_barrier(void)
Definition: comm_mpi.cpp:326
int Vh
Definition: test_util.cpp:28