QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
staggered_dslash_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <staggered_gauge_utils.h>
19 #include <llfat_reference.h>
20 #include <gauge_field.h>
21 #include <unitarization_links.h>
22 
23 #include <qio_field.h>
24 
25 #include <assert.h>
26 #include <gtest/gtest.h>
27 
28 using namespace quda;
29 
30 #define MAX(a,b) ((a)>(b)?(a):(b))
31 
32 #define staggeredSpinorSiteSize 6
33 
34 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
35 extern int test_type;
36 
37 extern void usage(char** argv );
38 
39 void *qdp_inlink[4] = { nullptr, nullptr, nullptr, nullptr };
40 
43 
46 
50 
51 // In the HISQ case, we include building fat/long links in this unit test
54 
56 extern QudaDagType dagger;
57 extern int xdim;
58 extern int ydim;
59 extern int zdim;
60 extern int tdim;
61 extern int gridsize_from_cmdline[];
63 extern QudaPrecision prec;
64 extern QudaPrecision cpu_prec;
68 extern int device;
69 extern bool verify_results;
70 extern int niter;
71 extern double mass; // the mass of the Dirac operator
72 extern double kappa; // will get overriden
73 extern int laplace3D;
74 
75 extern bool compute_fatlong; // build the true fat/long links or use random numbers
76 extern double eps_naik; // relativistic correction for naik term
77 static int n_naiks = 1; // Number of naiks. If eps_naik is 0.0, we only need to construct one naik.
78 
79 extern char latfile[];
80 
81 int X[4];
82 extern int Nsrc; // number of spinors to apply to simultaneously
83 
85 
87 
88 // For loading the gauge fields
90 char** argv_copy;
91 
93 {
94  switch (prec) {
95  case QUDA_QUARTER_PRECISION: return 1e-1;
96  case QUDA_HALF_PRECISION: return 1e-3;
97  case QUDA_SINGLE_PRECISION: return 1e-4;
98  case QUDA_DOUBLE_PRECISION: return 1e-11;
99  case QUDA_INVALID_PRECISION: return 1.0;
100  }
101  return 1.0;
102 }
103 
104 void setGaugeParam(QudaGaugeParam &gaugeParam)
105 {
106  gaugeParam.X[0] = X[0] = xdim;
107  gaugeParam.X[1] = X[1] = ydim;
108  gaugeParam.X[2] = X[2] = zdim;
109  gaugeParam.X[3] = X[3] = tdim;
110 
111  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
112  gaugeParam.cuda_prec = prec;
113  gaugeParam.reconstruct = link_recon;
114  gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
115  gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
116 
117  // ensure that the default is improved staggered
120  }
121 
122  gaugeParam.anisotropy = 1.0;
123 
124  // For HISQ, this must always be set to 1.0, since the tadpole
125  // correction is baked into the coefficients for the first fattening.
126  // The tadpole doesn't mean anything for the second fattening
127  // since the input fields are unitarized.
128  gaugeParam.tadpole_coeff = 1.0;
130  gaugeParam.scale = -1.0 / 24.0;
131  if (eps_naik != 0) {
132  gaugeParam.scale *= (1.0+eps_naik);
133  }
134  } else {
135  gaugeParam.scale = 1.0;
136  }
137  gaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
138  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
140  gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
141  gaugeParam.type = QUDA_WILSON_LINKS;
142 
143  int tmpint = MAX(X[1] * X[2] * X[3], X[0] * X[2] * X[3]);
144  tmpint = MAX(tmpint, X[0] * X[1] * X[3]);
145  tmpint = MAX(tmpint, X[0] * X[1] * X[2]);
146 
147  gaugeParam.ga_pad = tmpint;
148 }
149 
151 {
152  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
153  inv_param.cuda_prec = prec;
154  inv_param.dirac_order = QUDA_DIRAC_ORDER;
156  inv_param.dagger = dagger;
157  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
158  inv_param.dslash_type = dslash_type;
159  inv_param.mass = mass;
160  inv_param.kappa = kappa = 1.0/(8.0+mass); // for laplace
162  inv_param.laplace3D = laplace3D; // for laplace
163  inv_param.verbosity = verbosity;
164 
167 
168  int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
169  tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
170  tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
171 
172  inv_param.sp_pad = tmpint;
173 }
174 
175 void init()
176 {
177 
178  initQuda(device);
179 
180  gaugeParam = newQudaGaugeParam();
181  inv_param = newQudaInvertParam();
182 
183  setGaugeParam(gaugeParam);
184  setInvertParam(inv_param);
185 
186  setDims(gaugeParam.X);
187  dw_setDims(gaugeParam.X, Nsrc); // so we can use 5-d indexing from dwf
189 
190  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
191 
192  // Allocate a lot of memory because I'm very confused
193  void* milc_fatlink_cpu = malloc(4*V*gaugeSiteSize*gSize);
194  void* milc_longlink_cpu = malloc(4*V*gaugeSiteSize*gSize);
195 
196  void* milc_fatlink_gpu = malloc(4*V*gaugeSiteSize*gSize);
197  void* milc_longlink_gpu = malloc(4*V*gaugeSiteSize*gSize);
198 
199  void* qdp_fatlink_gpu[4];
200  void* qdp_longlink_gpu[4];
201 
202  for (int dir = 0; dir < 4; dir++) {
203  qdp_fatlink_gpu[dir] = malloc(V*gaugeSiteSize*gSize);
204  qdp_longlink_gpu[dir] = malloc(V*gaugeSiteSize*gSize);
205 
206  qdp_fatlink_cpu[dir] = malloc(V*gaugeSiteSize*gSize);
207  qdp_longlink_cpu[dir] = malloc(V*gaugeSiteSize*gSize);
208 
209  if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL ||
210  qdp_fatlink_cpu[dir] == NULL || qdp_longlink_cpu[dir] == NULL) {
211  errorQuda("ERROR: malloc failed for fatlink/longlink");
212  }
213  }
214 
215  // create a base field
216  for (int dir = 0; dir < 4; dir++) {
217  if (qdp_inlink[dir] == nullptr) {
218  qdp_inlink[dir] = malloc(V*gaugeSiteSize*gSize);
219  }
220  }
221 
222  // load a field WITHOUT PHASES
223  if (strcmp(latfile,"")) {
224  read_gauge_field(latfile, qdp_inlink, gaugeParam.cpu_prec, gaugeParam.X, argc_copy, argv_copy);
227  } // else it's already been loaded
228  } else {
230  construct_gauge_field(qdp_inlink, 1, gaugeParam.cpu_prec, &gaugeParam);
231  } else {
233  }
234  }
235 
236  // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
237  // "compute" the fat/long links or not.
239  for (int dir = 0; dir < 4; dir++) {
240  memcpy(qdp_fatlink_gpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
241  memcpy(qdp_fatlink_cpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
242  memset(qdp_longlink_gpu[dir],0,V*gaugeSiteSize*gSize);
243  memset(qdp_longlink_cpu[dir],0,V*gaugeSiteSize*gSize);
244  }
245  } else { // QUDA_ASQTAD_DSLASH
246 
247  if (compute_fatlong) {
248  computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink,
249  gaugeParam, gSize, n_naiks, eps_naik);
250  } else {
251  // Not computing FatLong
252  for (int dir = 0; dir < 4; dir++) {
253  memcpy(qdp_fatlink_gpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
254  memcpy(qdp_fatlink_cpu[dir],qdp_inlink[dir], V*gaugeSiteSize*gSize);
255  memcpy(qdp_longlink_gpu[dir],qdp_longlink_cpu[dir],V*gaugeSiteSize*gSize);
256  }
257  }
258  }
259 
260  // Alright, we've created all the void** links.
261  // Create the void* pointers
262  reorderQDPtoMILC(milc_fatlink_gpu,qdp_fatlink_gpu,V,gaugeSiteSize,gaugeParam.cpu_prec,gaugeParam.cpu_prec);
263  reorderQDPtoMILC(milc_fatlink_cpu,qdp_fatlink_cpu,V,gaugeSiteSize,gaugeParam.cpu_prec,gaugeParam.cpu_prec);
264  reorderQDPtoMILC(milc_longlink_gpu,qdp_longlink_gpu,V,gaugeSiteSize,gaugeParam.cpu_prec,gaugeParam.cpu_prec);
265  reorderQDPtoMILC(milc_longlink_cpu,qdp_longlink_cpu,V,gaugeSiteSize,gaugeParam.cpu_prec,gaugeParam.cpu_prec);
266  // Create ghost zones for CPU fields,
267  // prepare and load the GPU fields
268 
269 #ifdef MULTI_GPU
270 
272  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
273  GaugeFieldParam cpuFatParam(milc_fatlink_cpu, gaugeParam);
275  cpuFat = new cpuGaugeField(cpuFatParam);
276  ghost_fatlink_cpu = cpuFat->Ghost();
277 
278  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
279  GaugeFieldParam cpuLongParam(milc_longlink_cpu, gaugeParam);
280  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
281  cpuLong = new cpuGaugeField(cpuLongParam);
282  ghost_longlink_cpu = cpuLong->Ghost();
283 
284  int x_face_size = X[1]*X[2]*X[3]/2;
285  int y_face_size = X[0]*X[2]*X[3]/2;
286  int z_face_size = X[0]*X[1]*X[3]/2;
287  int t_face_size = X[0]*X[1]*X[2]/2;
288  int pad_size = MAX(x_face_size, y_face_size);
289  pad_size = MAX(pad_size, z_face_size);
290  pad_size = MAX(pad_size, t_face_size);
291  gaugeParam.ga_pad = pad_size;
292 #endif
293 
296  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
299  } else {
300  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
301  }
302 
303  // set verbosity prior to loadGaugeQuda
305 
306  // printfQuda("Fat links sending...");
307  loadGaugeQuda(milc_fatlink_gpu, &gaugeParam);
308  // printfQuda("Fat links sent\n");
309 
310  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
311 
312 #ifdef MULTI_GPU
313  gaugeParam.ga_pad = 3*pad_size;
314 #endif
315 
318  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
321 
322  // printfQuda("Long links sending...");
323  loadGaugeQuda(milc_longlink_gpu, &gaugeParam);
324  // printfQuda("Long links sent...\n");
325  }
326 
328  csParam.nColor = 3;
329  csParam.nSpin = 1;
330  csParam.nDim = 5;
331  for (int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; }
332  csParam.x[4] = Nsrc; // number of sources becomes the fifth dimension
333 
334  csParam.setPrecision(inv_param.cpu_prec);
335  inv_param.solution_type = QUDA_MAT_SOLUTION;
336  csParam.pad = 0;
337  if (test_type < 2 && dslash_type != QUDA_LAPLACE_DSLASH) {
339  csParam.x[0] /= 2;
340  } else {
342  }
343 
346  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
347  csParam.create = QUDA_ZERO_FIELD_CREATE;
348 
349  spinor = new cpuColorSpinorField(csParam);
350  spinorOut = new cpuColorSpinorField(csParam);
351  spinorRef = new cpuColorSpinorField(csParam);
352  tmpCpu = new cpuColorSpinorField(csParam);
353 
354  spinor->Source(QUDA_RANDOM_SOURCE);
355 
357  csParam.pad = inv_param.sp_pad;
358  csParam.setPrecision(inv_param.cuda_prec);
359 
360  cudaSpinor = new cudaColorSpinorField(csParam);
361  cudaSpinorOut = new cudaColorSpinorField(csParam);
362  *cudaSpinor = *spinor;
363  tmp = new cudaColorSpinorField(csParam);
364 
365  cudaDeviceSynchronize();
366  checkCudaError();
367 
368  bool pc = (test_type == 1); // For test_type 0, can use either pc or not pc
369  // because both call the same "Dslash" directly.
370  DiracParam diracParam;
371  setDiracParam(diracParam, &inv_param, pc);
372  diracParam.tmp1 = tmp;
373  dirac = Dirac::create(diracParam);
374 
375  for (int dir = 0; dir < 4; dir++) {
376  free(qdp_fatlink_gpu[dir]); qdp_fatlink_gpu[dir] = nullptr;
377  free(qdp_longlink_gpu[dir]); qdp_longlink_gpu[dir] = nullptr;
378  }
379  free(milc_fatlink_gpu); milc_fatlink_gpu = nullptr;
380  free(milc_longlink_gpu); milc_longlink_gpu = nullptr;
381  free(milc_fatlink_cpu); milc_fatlink_cpu = nullptr;
382  free(milc_longlink_cpu); milc_longlink_cpu = nullptr;
383 
384  gaugeParam.reconstruct = link_recon;
385 
386  return;
387 }
388 
389 void end()
390 {
391  for (int dir = 0; dir < 4; dir++) {
392  if (qdp_fatlink_cpu[dir] != nullptr) { free(qdp_fatlink_cpu[dir]); qdp_fatlink_cpu[dir] = nullptr; }
393  if (qdp_longlink_cpu[dir] != nullptr) { free(qdp_longlink_cpu[dir]); qdp_longlink_cpu[dir] = nullptr; }
394  }
395 
396  if (dirac != nullptr) {
397  delete dirac;
398  dirac = nullptr;
399  }
400  if (cudaSpinor != nullptr) {
401  delete cudaSpinor;
402  cudaSpinor = nullptr;
403  }
404  if (cudaSpinorOut != nullptr) {
405  delete cudaSpinorOut;
406  cudaSpinorOut = nullptr;
407  }
408  if (tmp != nullptr) {
409  delete tmp;
410  tmp = nullptr;
411  }
412 
413  if (spinor != nullptr) { delete spinor; spinor = nullptr; }
414  if (spinorOut != nullptr) { delete spinorOut; spinorOut = nullptr; }
415  if (spinorRef != nullptr) { delete spinorRef; spinorRef = nullptr; }
416  if (tmpCpu != nullptr) { delete tmpCpu; tmpCpu = nullptr; }
417 
418  freeGaugeQuda();
419 
420  if (cpuFat) { delete cpuFat; cpuFat = nullptr; }
421  if (cpuLong) { delete cpuLong; cpuLong = nullptr; }
423 
424  endQuda();
425 }
426 
427 struct DslashTime {
428  double event_time;
429  double cpu_time;
430  double cpu_min;
431  double cpu_max;
432 
433  DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
434 };
435 
437 
438  DslashTime dslash_time;
439  timeval tstart, tstop;
440 
441  cudaEvent_t start, end;
442  cudaEventCreate(&start);
443  cudaEventRecord(start, 0);
444  cudaEventSynchronize(start);
445 
446  comm_barrier();
447  cudaEventRecord(start, 0);
448 
449  for (int i = 0; i < niter; i++) {
450 
451  gettimeofday(&tstart, NULL);
452 
453  switch (test_type) {
454  case 0: dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); break;
455  case 1: dirac->M(*cudaSpinorOut, *cudaSpinor); break;
456  case 2: dirac->M(*cudaSpinorOut, *cudaSpinor); break;
457  }
458 
459  gettimeofday(&tstop, NULL);
460  long ds = tstop.tv_sec - tstart.tv_sec;
461  long dus = tstop.tv_usec - tstart.tv_usec;
462  double elapsed = ds + 0.000001*dus;
463 
464  dslash_time.cpu_time += elapsed;
465  // skip first and last iterations since they may skew these metrics if comms are not synchronous
466  if (i>0 && i<niter) {
467  if (elapsed < dslash_time.cpu_min) dslash_time.cpu_min = elapsed;
468  if (elapsed > dslash_time.cpu_max) dslash_time.cpu_max = elapsed;
469  }
470  }
471 
472  cudaEventCreate(&end);
473  cudaEventRecord(end, 0);
474  cudaEventSynchronize(end);
475  float runTime;
476  cudaEventElapsedTime(&runTime, start, end);
477  cudaEventDestroy(start);
478  cudaEventDestroy(end);
479 
480  dslash_time.event_time = runTime / 1000;
481 
482  // check for errors
483  cudaError_t stat = cudaGetLastError();
484  if (stat != cudaSuccess)
485  errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
486 
487  return dslash_time;
488 }
489 
491 {
492 
493  // compare to dslash reference implementation
494  // printfQuda("Calculating reference implementation...");
495  fflush(stdout);
496  switch (test_type) {
497  case 0:
499  parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec, dslash_type);
500  break;
501  case 1:
503  inv_param.cpu_prec, gaugeParam.cpu_prec, tmpCpu, parity, dslash_type);
504  break;
505  case 2:
506  // Not sure about the !dagger...
507  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Even()), qdp_fatlink_cpu, qdp_longlink_cpu,
508  ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Odd()),
509  QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gaugeParam.cpu_prec, dslash_type);
510  staggered_dslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Odd()), qdp_fatlink_cpu, qdp_longlink_cpu,
511  ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Even()),
512  QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gaugeParam.cpu_prec, dslash_type);
514  xpay(spinor->V(), kappa, spinorRef->V(), spinor->Length(), gaugeParam.cpu_prec);
515  } else {
516  axpy(2*mass, spinor->V(), spinorRef->V(), spinor->Length(), gaugeParam.cpu_prec);
517  }
518  break;
519  default:
520  errorQuda("Test type not defined");
521  }
522 }
523 
524 TEST(dslash, verify) {
525  double deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
526  double tol = getTolerance(prec);
527  ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
528 }
529 
530 static int dslashTest()
531 {
532 
533  for (int dir = 0; dir < 4; dir++) {
534  qdp_fatlink_cpu[dir] = nullptr;
535  qdp_longlink_cpu[dir] = nullptr;
536  }
537 
538  dirac = nullptr;
539  cudaSpinor = nullptr;
540  cudaSpinorOut = nullptr;
541  tmp = nullptr;
542 
543  spinor = nullptr;
544  spinorOut = nullptr;
545  spinorRef = nullptr;
546  tmpCpu = nullptr;
547 
548  bool failed = false;
549 
550  // return code for google test
551  int test_rc = 0;
552  init();
553 
554  int attempts = 1;
555 
556  for (int i=0; i<attempts; i++) {
557 
558  { // warm-up run
559  printfQuda("Tuning...\n");
560  dslashCUDA(1);
561  }
562  printfQuda("Executing %d kernel loops...", niter);
563 
564  // reset flop counter
565  dirac->Flops();
566 
567  DslashTime dslash_time = dslashCUDA(niter);
568 
569  *spinorOut = *cudaSpinorOut;
570 
571  printfQuda("%fus per kernel call\n", 1e6*dslash_time.event_time / niter);
573 
574  double spinor_ref_norm2 = blas::norm2(*spinorRef);
575  double spinor_out_norm2 = blas::norm2(*spinorOut);
576 
577  // Catching nans is weird.
578  if (std::isnan(spinor_ref_norm2)) { failed = true; }
579  if (std::isnan(spinor_out_norm2)) { failed = true; }
580 
581  unsigned long long flops = dirac->Flops();
582  printfQuda("GFLOPS = %f\n", 1.0e-9*flops/dslash_time.event_time);
583 
584  if (niter > 2) { // only print this if valid
585  printfQuda("Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for "
586  "aggregate message size %lu bytes\n",
587  1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.event_time,
588  1.0e-9 * 2 * cudaSpinor->GhostBytes() * niter / dslash_time.cpu_time,
589  1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_max,
590  1.0e-9 * 2 * cudaSpinor->GhostBytes() / dslash_time.cpu_min, 2 * cudaSpinor->GhostBytes());
591  }
592 
593  double cuda_spinor_out_norm2 = blas::norm2(*cudaSpinorOut);
594  printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
595 
596  if (verify_results) {
597  test_rc = RUN_ALL_TESTS();
598  if (test_rc != 0 || failed) warningQuda("Tests failed");
599  }
600  }
601  end();
602 
603  return test_rc;
604 }
605 
607 {
608  printfQuda("running the following test:\n");
609  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
610  printfQuda("%s %s %d %d %d/%d/%d %d \n", get_prec_str(prec), get_recon_str(link_recon),
612  printfQuda("Grid partition info: X Y Z T\n");
613  printfQuda(" %d %d %d %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
614  dimPartitioned(3));
615 
616  return ;
617 }
618 
619 void usage_extra(char **argv)
620 {
621  printfQuda("Extra options:\n");
622  printfQuda(" --test <0/1/2> # Test method\n");
623  printfQuda(" 0: Even destination spinor\n");
624  printfQuda(" 1: Odd destination spinor\n");
625  printfQuda(" 2: Full spinor\n");
626  return ;
627 }
628 
629 int main(int argc, char **argv)
630 {
631  // hack for loading gauge fields
632  argc_copy = argc;
633  argv_copy = argv;
634 
635  // initalize google test
636  ::testing::InitGoogleTest(&argc, argv);
637  for (int i=1 ;i < argc; i++){
638 
639  if (process_command_line_option(argc, argv, &i) == 0) { continue; }
640 
641  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
642  usage(argv);
643  }
644 
645  initComms(argc, argv, gridsize_from_cmdline);
646 
647  // Ensure that the default is improved staggered
651  warningQuda("The dslash_type %d isn't staggered, asqtad, or laplace. Defaulting to asqtad.\n", dslash_type);
653  }
654 
655  // Sanity check: if you pass in a gauge field, want to test the asqtad/hisq dslash,
656  // and don't ask to build the fat/long links... it doesn't make sense.
657  if (strcmp(latfile,"") && !compute_fatlong && dslash_type == QUDA_ASQTAD_DSLASH) {
658  errorQuda("Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n");
659  }
660 
661  // Set n_naiks to 2 if eps_naik != 0.0
663  if (eps_naik != 0.0) {
664  if (compute_fatlong) {
665  n_naiks = 2;
666  printfQuda("Note: epsilon-naik != 0, testing epsilon correction links.\n");
667  } else {
668  eps_naik = 0.0;
669  printfQuda("Not computing fat-long, ignoring epsilon correction.\n");
670  }
671  } else {
672  printfQuda("Note: epsilon-naik = 0, testing original HISQ links.\n");
673  }
674  }
675 
677  if (test_type != 2) {
678  errorQuda("Test type %d is not supported for the Laplace operator.\n", test_type);
679  }
680  }
681 
682  // If we're building fat/long links, there are some
683  // tests we have to skip.
685  if (prec == QUDA_HALF_PRECISION /* half */) {
686  errorQuda("Half precision unsupported in fat/long compute");
687  }
688  }
690  errorQuda("Half precision unsupported for Laplace operator.\n");
691  }
692 
694 
695  // return result of RUN_ALL_TESTS
696  int test_rc = dslashTest();
697 
698  finalizeComms();
699 
700  return test_rc;
701 }
void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik)
static size_t gSize
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
QudaDiracFieldOrder dirac_order
Definition: quda.h:219
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:747
QudaMassNormalization mass_normalization
Definition: quda.h:208
QudaDslashType dslash_type
Definition: test_util.cpp:1621
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53
double anisotropy
Definition: quda.h:38
cpuColorSpinorField * spinorOut
int device
Definition: test_util.cpp:1602
void setInvertParam(QudaInvertParam &inv_param)
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
static int dslashTest()
int Nsrc
Definition: test_util.cpp:1627
void endQuda(void)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
Definition: test_util.cpp:1047
enum QudaPrecision_s QudaPrecision
char ** argv_copy
int zdim
Definition: test_util.cpp:1617
int ga_pad
Definition: quda.h:63
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:187
int main(int argc, char **argv)
DslashTime dslashCUDA(int niter)
QudaGaugeFixed gauge_fix
Definition: quda.h:61
void ** ghost_fatlink_cpu
QudaLinkType type
Definition: quda.h:42
cpuColorSpinorField * spinorRef
void ** ghost_longlink_cpu
double kappa
Definition: quda.h:106
QudaDagType dagger
Definition: test_util.cpp:1620
double eps_naik
Definition: test_util.cpp:1652
QudaReconstructType link_recon
Definition: test_util.cpp:1605
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
QudaDslashType dslash_type
Definition: quda.h:102
QudaGaugeParam gaugeParam
QudaPrecision cuda_prec
Definition: quda.h:214
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
double cpu_min
QudaVerbosity verbosity
Definition: test_util.cpp:1614
QudaPrecision cpu_prec
Definition: quda.h:213
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
const ColorSpinorField & Even() const
int niter
Definition: test_util.cpp:1629
const ColorSpinorField & Odd() const
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71
int argc_copy
QudaDagType dagger
Definition: quda.h:207
void * qdp_fatlink_cpu[4]
void finalizeComms()
Definition: test_util.cpp:128
void usage(char **argv)
Definition: test_util.cpp:1783
cpuColorSpinorField * spinor
cudaColorSpinorField * cudaSpinor
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
unsigned long long Flops() const
Definition: dirac_quda.h:177
bool compute_fatlong
Definition: test_util.cpp:1655
QudaSiteSubset siteSubset
Definition: lattice_field.h:71
void setDims(int *)
Definition: test_util.cpp:151
QudaFieldLocation input_location
Definition: quda.h:99
void freeGaugeQuda(void)
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
int test_type
Definition: test_util.cpp:1636
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
Definition: quda.h:204
bool verify_results
Definition: test_util.cpp:1643
#define staggeredSpinorSiteSize
QudaPrecision prec
Definition: test_util.cpp:1608
Dirac * dirac
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
cpuColorSpinorField * tmpCpu
void usage_extra(char **argv)
double scale
Definition: quda.h:40
void initQuda(int device)
cudaColorSpinorField * tmp
double tol
Definition: test_util.cpp:1656
QudaFieldLocation output_location
Definition: quda.h:100
int xdim
Definition: test_util.cpp:1615
void * qdp_longlink_cpu[4]
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
QudaVerbosity verbosity
Definition: quda.h:244
void setSpinorSiteSize(int n)
Definition: test_util.cpp:211
char latfile[]
Definition: test_util.cpp:1623
ColorSpinorParam csParam
Definition: pack_test.cpp:24
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
#define MAX(a, b)
double event_time
double cpu_time
TEST(dslash, verify)
#define warningQuda(...)
Definition: util_quda.h:133
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:111
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaParity parity
QudaGammaBasis gamma_basis
Definition: quda.h:221
void staggered_dslash(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
cudaColorSpinorField * cudaSpinorOut
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52
const void ** Ghost() const
Definition: gauge_field.h:323
enum QudaDagType_s QudaDagType
QudaReconstructType link_recon_sloppy
Definition: test_util.cpp:1606
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int X[4]
Definition: quda.h:36
double mass
Definition: quda.h:105
QudaPrecision prec_sloppy
Definition: test_util.cpp:1609
int laplace3D
Definition: test_util.cpp:1622
int V
Definition: test_util.cpp:27
int X[4]
void setGaugeParam(QudaGaugeParam &gaugeParam)
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void * memset(void *s, int c, size_t n)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1062
void init()
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
cpuGaugeField * cpuLong
static int n_naiks
double tadpole_coeff
Definition: quda.h:39
double getTolerance(QudaPrecision prec)
int tdim
Definition: test_util.cpp:1618
int ydim
Definition: test_util.cpp:1616
void end()
void display_test_info()
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
#define printfQuda(...)
Definition: util_quda.h:115
QudaTboundary t_boundary
Definition: quda.h:45
unsigned long long flops
Definition: blas_quda.cu:22
double cpu_max
QudaInvertParam inv_param
enum QudaDslashType_s QudaDslashType
void staggeredDslashRef()
enum QudaVerbosity_s QudaVerbosity
void * qdp_inlink[4]
__device__ void axpy(real a, const real *x, Link &y)
#define checkCudaError()
Definition: util_quda.h:161
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:159
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
Definition: qio_field.h:14
double kappa
Definition: test_util.cpp:1647
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
cpuGaugeField * cpuFat
QudaMatPCType matpc_type
Definition: quda.h:206
ColorSpinorField * tmp1
Definition: dirac_quda.h:41
QudaPrecision cpu_prec
Definition: quda.h:47
#define gaugeSiteSize
Definition: face_gauge.cpp:34
QudaGaugeParam newQudaGaugeParam(void)
void comm_barrier(void)
Definition: comm_mpi.cpp:326
int Vh
Definition: test_util.cpp:28
QudaPrecision cpu_prec
double mass
Definition: test_util.cpp:1646