QUDA v1.1.0
A library for QCD on GPUs
staggered_dslash_test_utils.h
#pragma once

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h> // for DBL_MAX, used by DslashTime below
#include <cmath>   // for std::isnan in verify()
#include <algorithm>

#include <quda.h>
#include <gauge_field.h>
#include <dirac_quda.h>
#include <misc.h>
#include <host_utils.h>
#include <command_line_params.h>
#include <dslash_reference.h>
#include <staggered_dslash_reference.h>
#include <staggered_gauge_utils.h>

#include "dslash_test_helpers.h"
#include <assert.h>
#include <gtest/gtest.h>

using namespace quda;

CLI::TransformPairs<dslash_test_type> dtest_type_map {
  {"Dslash", dslash_test_type::Dslash},
  {"MatPC", dslash_test_type::MatPC},
  {"Mat", dslash_test_type::Mat}
  // left here for completeness but not supported in the staggered dslash test
  // {"MatPCDagMatPC", dslash_test_type::MatPCDagMatPC},
  // {"MatDagMat", dslash_test_type::MatDagMat},
  // {"M5", dslash_test_type::M5},
  // {"M5inv", dslash_test_type::M5inv},
  // {"Dslash4pre", dslash_test_type::Dslash4pre}
};
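
// Sketch (illustrative, not part of this header): how a CLI::TransformPairs map like the
// one above is typically wired into a CLI11-based option parser. The app object, helper
// name and option name below are hypothetical.
#if 0
void add_dtest_option(CLI::App &app)
{
  app.add_option("--test", dtest_type, "which staggered dslash test to run")
    ->transform(CLI::CheckedTransformer(dtest_type_map, CLI::ignore_case));
}
#endif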
35 
36 struct DslashTime {
37  double event_time;
38  double cpu_time;
39  double cpu_min;
40  double cpu_max;
41 
42  DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
43 };
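
// Usage sketch (illustrative, not part of the test itself): summarize a DslashTime returned
// by the benchmark loop further below. The helper name is hypothetical.
inline void printDslashTimeSummary(const DslashTime &t, int niter)
{
  // event_time spans the whole timed loop (CUDA events), cpu_time is the sum of the
  // per-iteration wall-clock times, and cpu_min / cpu_max bracket individual iterations
  printfQuda("dslash timing: %g s/call (GPU events), %g s/call (CPU), min = %g s, max = %g s\n",
             t.event_time / niter, t.cpu_time / niter, t.cpu_min, t.cpu_max);
}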

struct StaggeredDslashTestWrapper {

  bool is_ctest = false; // set to true by the ctest harness to distinguish it from the standalone dslash_test

  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};

  QudaGaugeParam gauge_param;
  QudaInvertParam inv_param;

  void *milc_fatlink_gpu;
  void *milc_longlink_gpu;

  cpuGaugeField *cpuFat = nullptr;
  cpuGaugeField *cpuLong = nullptr;

  cpuColorSpinorField *spinor = nullptr;
  cpuColorSpinorField *spinorOut = nullptr;
  cpuColorSpinorField *spinorRef = nullptr;
  cpuColorSpinorField *tmpCpu = nullptr;
  cudaColorSpinorField *cudaSpinor = nullptr;
  cudaColorSpinorField *cudaSpinorOut = nullptr;
  cudaColorSpinorField *tmp = nullptr;

  std::vector<cpuColorSpinorField *> vp_spinor;
  std::vector<cpuColorSpinorField *> vp_spinor_out;

  // In the HISQ case, we include building fat/long links in this unit test
  void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
  void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
  void **ghost_fatlink_cpu, **ghost_longlink_cpu;

  // To speed up the unit test, build the CPU field once per partition
#ifdef MULTI_GPU
  void *qdp_fatlink_cpu_backup[16][4];
  void *qdp_longlink_cpu_backup[16][4];
  void *qdp_inlink_backup[16][4];
#else
  void *qdp_fatlink_cpu_backup[1][4];
  void *qdp_longlink_cpu_backup[1][4];
  void *qdp_inlink_backup[1][4];
#endif

  QudaParity parity = QUDA_EVEN_PARITY;

  Dirac *dirac = nullptr;

  // For loading the gauge fields
  int argc_copy;
  char **argv_copy;

  // Split grid options
  int num_src;
  bool test_split_grid = false;

  void staggeredDslashRef()
  {

    // compare to the dslash reference implementation
    printfQuda("Calculating reference implementation...");
    switch (dtest_type) {
    case dslash_test_type::Dslash:
      staggeredDslash(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor,
                      parity, dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
      break;
    case dslash_test_type::MatPC:
      staggeredMatDagMat(spinorRef, qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor,
                         mass, 0, inv_param.cpu_prec, gauge_param.cpu_prec, tmpCpu, parity, dslash_type);
      break;
    case dslash_test_type::Mat:
      // the !dagger reconciles the QUDA convention D_stag = {{2m, -D_{eo}}, {-D_{oe}, 2m}} with the host
      // convention, which applies the hopping terms without the minus signs
      staggeredDslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Even()), qdp_fatlink_cpu, qdp_longlink_cpu,
                      ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Odd()),
                      QUDA_EVEN_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
      staggeredDslash(reinterpret_cast<cpuColorSpinorField *>(&spinorRef->Odd()), qdp_fatlink_cpu, qdp_longlink_cpu,
                      ghost_fatlink_cpu, ghost_longlink_cpu, reinterpret_cast<cpuColorSpinorField *>(&spinor->Even()),
                      QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
      if (dslash_type == QUDA_LAPLACE_DSLASH) {
        blas::xpay(*spinor, kappa, *spinorRef);
      } else {
        blas::axpy(2 * mass, *spinor, *spinorRef);
      }
      break;
    default: errorQuda("Test type not defined");
    }
  }
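
  // Restating the Mat branch above: the two host dslash calls fill the even and odd halves
  // of spinorRef with the off-diagonal hopping terms (with the sign convention fixed via
  // !dagger), and the final axpy adds the diagonal mass term, i.e.
  //
  //   D_stag * spinor = [  2m      -D_{eo} ] [ spinor_e ]
  //                     [ -D_{oe}    2m    ] [ spinor_o ]
  //
  // For the Laplace operator the diagonal piece is instead accumulated with weight kappa
  // via xpay.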

  void init_ctest_once()
  {
    static bool has_been_called = false;
    if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
    // initialize CPU field backup
    int pmax = 1;
#ifdef MULTI_GPU
    pmax = 16;
#endif
    for (int p = 0; p < pmax; p++) {
      for (int d = 0; d < 4; d++) {
        qdp_fatlink_cpu_backup[p][d] = nullptr;
        qdp_longlink_cpu_backup[p][d] = nullptr;
        qdp_inlink_backup[p][d] = nullptr;
      }
    }
    is_ctest = true; // is being used in dslash_ctest
    has_been_called = true;
  }

  void end_ctest_once()
  {
    static bool has_been_called = false;
    if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
    // Clean up per-partition backup
    int pmax = 1;
#ifdef MULTI_GPU
    pmax = 16;
#endif
    for (int p = 0; p < pmax; p++) {
      for (int d = 0; d < 4; d++) {
        if (qdp_inlink_backup[p][d] != nullptr) {
          free(qdp_inlink_backup[p][d]);
          qdp_inlink_backup[p][d] = nullptr;
        }
        if (qdp_fatlink_cpu_backup[p][d] != nullptr) {
          free(qdp_fatlink_cpu_backup[p][d]);
          qdp_fatlink_cpu_backup[p][d] = nullptr;
        }
        if (qdp_longlink_cpu_backup[p][d] != nullptr) {
          free(qdp_longlink_cpu_backup[p][d]);
          qdp_longlink_cpu_backup[p][d] = nullptr;
        }
      }
    }
    has_been_called = true;
  }

  void init_ctest(int precision, QudaReconstructType link_recon_, int partition)
  {
    gauge_param = newQudaGaugeParam();
    inv_param = newQudaInvertParam();

    setStaggeredGaugeParam(gauge_param);
    setStaggeredInvertParam(inv_param);

    auto prec = getPrecision(precision);

    gauge_param.cuda_prec = prec;
    gauge_param.cuda_prec_sloppy = prec;
    gauge_param.cuda_prec_precondition = prec;
    gauge_param.cuda_prec_refinement_sloppy = prec;

    inv_param.cuda_prec = prec;

    link_recon = link_recon_;

    init();
  }
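
  // Driver sketch (illustrative): in the ctest build, init_ctest() is typically invoked
  // from a value-parameterized gtest whose parameter tuple supplies
  // (precision, reconstruct, partition). The fixture name, iteration count and tolerance
  // below are hypothetical and would live in the test driver, not in this header.
#if 0
  class StaggeredDslashCtest : public ::testing::TestWithParam<std::tuple<int, QudaReconstructType, int>>
  {
  protected:
    StaggeredDslashTestWrapper wrapper;

    void SetUp() override
    {
      auto [prec, recon, partition] = GetParam();
      wrapper.init_ctest(prec, recon, partition);
    }
    void TearDown() override { wrapper.end(); }
  };

  TEST_P(StaggeredDslashCtest, verify)
  {
    wrapper.staggeredDslashRef();
    wrapper.run_test(2);
    EXPECT_LE(wrapper.verify(), 1e-3); // tolerance is illustrative
  }
  // INSTANTIATE_TEST_SUITE_P(...) with the desired parameter ranges is omitted here
#endif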

  void init_test()
  {
    gauge_param = newQudaGaugeParam();
    inv_param = newQudaInvertParam();

    setStaggeredGaugeParam(gauge_param);
    setStaggeredInvertParam(inv_param);

    init();
  }

  void init()
  {
    inv_param.split_grid[0] = grid_partition[0];
    inv_param.split_grid[1] = grid_partition[1];
    inv_param.split_grid[2] = grid_partition[2];
    inv_param.split_grid[3] = grid_partition[3];

    num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
    test_split_grid = num_src > 1;

    if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }

    inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;

    setDims(gauge_param.X);
    dw_setDims(gauge_param.X, 1);

    if (Nsrc != 1) {
      warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc);
      Nsrc = 1;
    }

    // Allocate host buffers for the MILC-ordered fat and long links used by the CPU and GPU code paths
    void *milc_fatlink_cpu = malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
    void *milc_longlink_cpu = malloc(4 * V * gauge_site_size * host_gauge_data_type_size);

    milc_fatlink_gpu = malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
    milc_longlink_gpu = malloc(4 * V * gauge_site_size * host_gauge_data_type_size);

    void *qdp_fatlink_gpu[4];
    void *qdp_longlink_gpu[4];

    for (int dir = 0; dir < 4; dir++) {
      qdp_fatlink_gpu[dir] = malloc(V * gauge_site_size * host_gauge_data_type_size);
      qdp_longlink_gpu[dir] = malloc(V * gauge_site_size * host_gauge_data_type_size);

      qdp_fatlink_cpu[dir] = malloc(V * gauge_site_size * host_gauge_data_type_size);
      qdp_longlink_cpu[dir] = malloc(V * gauge_site_size * host_gauge_data_type_size);

      if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL || qdp_fatlink_cpu[dir] == NULL
          || qdp_longlink_cpu[dir] == NULL) {
        errorQuda("ERROR: malloc failed for fatlink/longlink");
      }
    }

    // create a base field
    for (int dir = 0; dir < 4; dir++) {
      if (qdp_inlink[dir] == nullptr) { qdp_inlink[dir] = malloc(V * gauge_site_size * host_gauge_data_type_size); }
    }

    bool gauge_loaded = false;
    constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu,
                                           qdp_fatlink_gpu, gauge_param, argc_copy, argv_copy, gauge_loaded);

    // All the QDP-layout (void **) links now exist; reorder them into the single
    // MILC-layout (void *) buffers
    reorderQDPtoMILC(milc_fatlink_gpu, qdp_fatlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
    reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
    reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
    reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
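    // Layout sketch (illustrative): reorderQDPtoMILC interleaves the four QDP direction
    // arrays (one contiguous array per direction) into a single site-major MILC buffer,
    // roughly
    //
    //   for (int i = 0; i < V; i++)
    //     for (int dir = 0; dir < 4; dir++)
    //       milc[(i * 4 + dir) * gauge_site_size + k] = qdp[dir][i * gauge_site_size + k]   for all k
    //
    // with the copy performed at the requested precisions.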
    // Create ghost zones for CPU fields,
    // prepare and load the GPU fields

#ifdef MULTI_GPU
    GaugeFieldParam cpuFatParam(milc_fatlink_cpu, gauge_param);
    cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
    cpuFat = new cpuGaugeField(cpuFatParam);
    ghost_fatlink_cpu = cpuFat->Ghost();

    GaugeFieldParam cpuLongParam(milc_longlink_cpu, gauge_param);
    cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
    cpuLong = new cpuGaugeField(cpuLongParam);
    ghost_longlink_cpu = cpuLong->Ghost();
#endif

    gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
      QUDA_SU3_LINKS :
      QUDA_ASQTAD_FAT_LINKS;
    if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = link_recon;
    } else {
      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    }

    // set verbosity prior to loadGaugeQuda
    setVerbosity(verbosity);

    printfQuda("Sending fat links to GPU\n");
    loadGaugeQuda(milc_fatlink_gpu, &gauge_param);

    gauge_param.type = QUDA_ASQTAD_LONG_LINKS;

#ifdef MULTI_GPU
    gauge_param.ga_pad *= 3;
#endif

    if (dslash_type == QUDA_ASQTAD_DSLASH) {
      gauge_param.staggered_phase_type = QUDA_STAGGERED_PHASE_NO;
      gauge_param.reconstruct = gauge_param.reconstruct_sloppy = (link_recon == QUDA_RECONSTRUCT_12) ?
        QUDA_RECONSTRUCT_13 :
        (link_recon == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_9 : link_recon;
      printfQuda("Sending long links to GPU\n");
      loadGaugeQuda(milc_longlink_gpu, &gauge_param);
    }

    ColorSpinorParam csParam;
    csParam.nColor = 3;
    csParam.nSpin = 1;
    csParam.nDim = 5;
    for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
    csParam.x[4] = 1;

    csParam.setPrecision(inv_param.cpu_prec);
    // inv_param.solution_type = QUDA_MAT_SOLUTION;
    csParam.pad = 0;
    if (dtest_type != dslash_test_type::Mat) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
      inv_param.solution_type = QUDA_MATPC_SOLUTION;
    } else {
      csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
      inv_param.solution_type = QUDA_MAT_SOLUTION;
    }

    csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
    csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
    csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    spinor = new cpuColorSpinorField(csParam);
    spinorOut = new cpuColorSpinorField(csParam);
    spinorRef = new cpuColorSpinorField(csParam);
    tmpCpu = new cpuColorSpinorField(csParam);

    spinor->Source(QUDA_RANDOM_SOURCE);

    if (test_split_grid) {
      inv_param.num_src = num_src;
      inv_param.num_src_per_sub_partition = 1;
      for (int n = 0; n < num_src; n++) {
        vp_spinor.push_back(new cpuColorSpinorField(csParam));
        vp_spinor_out.push_back(new cpuColorSpinorField(csParam));
        *vp_spinor[n] = *spinor;
      }
    }

    csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    csParam.setPrecision(inv_param.cuda_prec);

    cudaSpinor = new cudaColorSpinorField(csParam);
    cudaSpinorOut = new cudaColorSpinorField(csParam);
    *cudaSpinor = *spinor;
    tmp = new cudaColorSpinorField(csParam);

    bool pc = (dtest_type == dslash_test_type::MatPC); // For test_type 0, can use either pc or not pc
                                                       // because both call the same "Dslash" directly.
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.tmp1 = tmp;
    dirac = Dirac::create(diracParam);

    for (int dir = 0; dir < 4; dir++) {
      free(qdp_fatlink_gpu[dir]);
      qdp_fatlink_gpu[dir] = nullptr;
      free(qdp_longlink_gpu[dir]);
      qdp_longlink_gpu[dir] = nullptr;
    }
    // free(milc_fatlink_gpu); milc_fatlink_gpu = nullptr;
    // free(milc_longlink_gpu); milc_longlink_gpu = nullptr;
    free(milc_fatlink_cpu);
    milc_fatlink_cpu = nullptr;
    free(milc_longlink_cpu);
    milc_longlink_cpu = nullptr;

    // gauge_param.reconstruct = link_recon;
  }

  void end()
  {
    for (int dir = 0; dir < 4; dir++) {
      if (qdp_fatlink_cpu[dir] != nullptr) {
        free(qdp_fatlink_cpu[dir]);
        qdp_fatlink_cpu[dir] = nullptr;
      }
      if (qdp_longlink_cpu[dir] != nullptr) {
        free(qdp_longlink_cpu[dir]);
        qdp_longlink_cpu[dir] = nullptr;
      }
    }

    if (dirac != nullptr) {
      delete dirac;
      dirac = nullptr;
    }
    if (cudaSpinor != nullptr) {
      delete cudaSpinor;
      cudaSpinor = nullptr;
    }
    if (cudaSpinorOut != nullptr) {
      delete cudaSpinorOut;
      cudaSpinorOut = nullptr;
    }
    if (tmp != nullptr) {
      delete tmp;
      tmp = nullptr;
    }

    if (spinor != nullptr) {
      delete spinor;
      spinor = nullptr;
    }
    if (spinorOut != nullptr) {
      delete spinorOut;
      spinorOut = nullptr;
    }
    if (spinorRef != nullptr) {
      delete spinorRef;
      spinorRef = nullptr;
    }
    if (tmpCpu != nullptr) {
      delete tmpCpu;
      tmpCpu = nullptr;
    }

    if (test_split_grid) {
      for (auto p : vp_spinor) { delete p; }
      for (auto p : vp_spinor_out) { delete p; }
      vp_spinor.clear();
      vp_spinor_out.clear();
    }

    free(milc_fatlink_gpu);
    milc_fatlink_gpu = nullptr;
    free(milc_longlink_gpu);
    milc_longlink_gpu = nullptr;

    freeGaugeQuda();

    if (cpuFat) {
      delete cpuFat;
      cpuFat = nullptr;
    }
    if (cpuLong) {
      delete cpuLong;
      cpuLong = nullptr;
    }
    commDimPartitionedReset();
  }

  DslashTime dslashCUDA(int niter)
  {
    DslashTime dslash_time;
    timeval tstart, tstop;

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);
    cudaEventSynchronize(start);

    comm_barrier();
    cudaEventRecord(start, 0);

    if (test_split_grid) {

      std::vector<void *> _hp_x(inv_param.num_src);
      std::vector<void *> _hp_b(inv_param.num_src);
      for (int i = 0; i < inv_param.num_src; i++) {
        _hp_x[i] = vp_spinor_out[i]->V();
        _hp_b[i] = vp_spinor[i]->V();
      }
      dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu,
                                  &gauge_param);

    } else {

      for (int i = 0; i < niter; i++) {

        gettimeofday(&tstart, NULL);

        switch (dtest_type) {
        case dslash_test_type::Dslash: dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); break;
        case dslash_test_type::MatPC:
        case dslash_test_type::Mat: dirac->M(*cudaSpinorOut, *cudaSpinor); break;
        default: errorQuda("Test type %d not defined for staggered dslash\n", static_cast<int>(dtest_type));
        }

        gettimeofday(&tstop, NULL);
        long ds = tstop.tv_sec - tstart.tv_sec;
        long dus = tstop.tv_usec - tstart.tv_usec;
        double elapsed = ds + 0.000001 * dus;

        dslash_time.cpu_time += elapsed;
        // skip the first and last iterations since they may skew these metrics if comms are not synchronous
        if (i > 0 && i < niter - 1) {
          if (elapsed < dslash_time.cpu_min) dslash_time.cpu_min = elapsed;
          if (elapsed > dslash_time.cpu_max) dslash_time.cpu_max = elapsed;
        }
      }
    }

    cudaEventCreate(&end);
    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);
    float runTime;
    cudaEventElapsedTime(&runTime, start, end);
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    dslash_time.event_time = runTime / 1000;

    return dslash_time;
  }

  void run_test(int niter, bool print_metrics = false)
  {
    printfQuda("Tuning...\n");
    dslashCUDA(1);

    // reset flop counter
    dirac->Flops();

    DslashTime dslash_time = dslashCUDA(niter);
    *spinorOut = *cudaSpinorOut;

    if (print_metrics) {
      printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);

      unsigned long long flops = dirac->Flops();
      double gflops = 1.0e-9 * flops / dslash_time.event_time;
      printfQuda("GFLOPS = %f\n", gflops);
      ::testing::Test::RecordProperty("Gflops", std::to_string(gflops));

      size_t ghost_bytes = cudaSpinor->GhostBytes();

      ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU",
                                      1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time);
      ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU",
                                      1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time);
      ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max);
      ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min);
      ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes);

      printfQuda(
        "Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate "
        "message size %lu bytes\n",
        1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time,
        1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time, 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max,
        1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min, 2 * ghost_bytes);
    }
  }

  double verify()
  {
    double deviation = 0.0;

    if (test_split_grid) {
      for (int n = 0; n < num_src; n++) {
        double spinor_ref_norm2 = blas::norm2(*spinorRef);
        double spinor_out_norm2 = blas::norm2(*vp_spinor_out[n]);

        bool failed = false;
        // guard against NaNs, which would otherwise slip through the comparison below
        if (std::isnan(spinor_ref_norm2)) { failed = true; }
        if (std::isnan(spinor_out_norm2)) { failed = true; }

        printfQuda("Results: CPU=%f, CPU-CUDA=%f\n", spinor_ref_norm2, spinor_out_norm2);
        deviation = std::max(deviation, pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *vp_spinor_out[n]))));
        if (failed) { deviation = 1.0; }
      }
    } else {
      double spinor_ref_norm2 = blas::norm2(*spinorRef);
      double spinor_out_norm2 = blas::norm2(*spinorOut);

      bool failed = false;
      // guard against NaNs, which would otherwise slip through the comparison below
      if (std::isnan(spinor_ref_norm2)) { failed = true; }
      if (std::isnan(spinor_out_norm2)) { failed = true; }

      double cuda_spinor_out_norm2 = blas::norm2(*cudaSpinorOut);
      printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
      deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
      if (failed) { deviation = 1.0; }
    }

    return deviation;
  }
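
  // The deviation returned above is built from cpuColorSpinorField::Compare, which performs
  // a component-by-component comparison of the two fields; the integer it returns is used
  // here as the number of matching decimal places, so
  //
  //   deviation = 10^(-Compare(spinorRef, spinorOut))
  //
  // and a NaN in either norm forces deviation = 1.0 so the test fails outright.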
};
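
// Usage sketch (illustrative): a standalone driver such as staggered_dslash_test would
// exercise the wrapper roughly as follows. Option parsing, communications setup and the
// tolerance are only sketched here; the real drivers live alongside this header in the
// tests directory.
#if 0
int main(int argc, char **argv)
{
  // parse the QUDA test options (--dim, --prec, --recon, --test, ...) and set up
  // communications before this point
  initQuda(0); // device 0; real drivers take the device from a command-line option

  StaggeredDslashTestWrapper wrapper;
  wrapper.argc_copy = argc;
  wrapper.argv_copy = argv;

  wrapper.init_test();           // build links, spinors and the Dirac operator
  wrapper.staggeredDslashRef();  // CPU reference
  wrapper.run_test(niter, true); // GPU benchmark plus metrics
  double deviation = wrapper.verify();
  wrapper.end();

  endQuda();
  return deviation > 1e-3 ? 1 : 0; // tolerance is illustrative
}
#endif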