QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
blas_test.cu
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 
4 #include <quda_internal.h>
5 #include <color_spinor_field.h>
6 #include <blas_quda.h>
7 
8 #include <test_util.h>
9 
10 // include because of nasty globals used in the tests
11 #include <dslash_util.h>
12 
13 // google test
14 #include <gtest/gtest.h>
15 
16 extern int test_type;
17 extern QudaPrecision prec;
20 extern int nvec[QUDA_MAX_MG_LEVEL];
21 extern int device;
22 extern int xdim;
23 extern int ydim;
24 extern int zdim;
25 extern int tdim;
26 extern int gridsize_from_cmdline[];
27 extern int niter;
28 
29 extern int Nsrc;
30 extern int Msrc;
33 
34 extern void usage(char** );
35 
36 const int Nkernels = 40;
37 
38 using namespace quda;
39 
40 ColorSpinorField *xH, *yH, *zH, *wH, *vH, *hH, *mH, *lH;
41 ColorSpinorField *xD, *yD, *zD, *wD, *vD, *hD, *mD, *lD, *xmD, *ymD, *zmD;
42 std::vector<cpuColorSpinorField*> xmH;
43 std::vector<cpuColorSpinorField*> ymH;
44 std::vector<cpuColorSpinorField*> zmH;
45 int Nspin;
46 int Ncolor;
47 
49 {
50  param.setPrecision(precision);
51  if (Nspin == 1 || Nspin == 2 || precision == QUDA_DOUBLE_PRECISION) {
53  } else {
55  }
56 }
57 
59 {
60  printfQuda("running the following test:\n");
61  printfQuda("S_dimension T_dimension Nspin Ncolor\n");
62  printfQuda("%3d /%3d / %3d %3d %d %d\n", xdim, ydim, zdim, tdim, Nspin, Ncolor);
63  printfQuda("Grid partition info: X Y Z T\n");
64  printfQuda(" %d %d %d %d\n",
65  dimPartitioned(0),
66  dimPartitioned(1),
67  dimPartitioned(2),
68  dimPartitioned(3));
69  return;
70 }
71 
72 int Nprec = 4;
73 
74 bool skip_kernel(int precision, int kernel) {
75  if ((QUDA_PRECISION & getPrecision(precision)) == 0) return true;
76 
77  // if we've selected a given kernel then make sure we only run that
78  if (test_type != -1 && kernel != test_type) return true;
79 
80  // if we've selected a given precision then make sure we only run that
81  auto this_prec = getPrecision(precision);
82  if (prec != QUDA_INVALID_PRECISION && this_prec != prec) return true;
83 
84  if ( Nspin == 2 && ( precision == 0 || precision ==1 ) ) {
85  // avoid quarter, half precision tests if doing coarse fields
86  return true;
87  } else if (Nspin == 2 && (kernel == 1 || kernel == 2)) {
88  // avoid low-precision copy if doing coarse fields
89  return true;
90  } else if (Ncolor != 3 && (kernel == 31 || kernel == 32)) {
91  // only benchmark heavy-quark norm if doing 3 colors
92  return true;
93  } else if ((Nprec < 4) && (kernel == 0)) {
94  // only benchmark high-precision copy() if double is supported
95  return true;
96  }
97 
98  return false;
99 }
100 
101 void initFields(int prec)
102 {
103  // precisions used for the source field in the copyCuda() benchmark
104  QudaPrecision high_aux_prec = QUDA_INVALID_PRECISION;
105  QudaPrecision mid_aux_prec = QUDA_INVALID_PRECISION;
106  QudaPrecision low_aux_prec = QUDA_INVALID_PRECISION;
107 
109  param.nColor = Ncolor;
110  param.nSpin = Nspin;
111  param.nDim = 4; // number of spacetime dimensions
112 
113  param.pad = 0; // padding must be zero for cpu fields
114 
115  switch (solve_type) {
118  case QUDA_DIRECT_SOLVE:
120  default: errorQuda("Unexpected solve_type=%d\n", solve_type);
121  }
122 
123  if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) param.x[0] = xdim/2;
124  else param.x[0] = xdim;
125  param.x[1] = ydim;
126  param.x[2] = zdim;
127  param.x[3] = tdim;
128 
133 
135 
136  vH = new cpuColorSpinorField(param);
137  wH = new cpuColorSpinorField(param);
138  xH = new cpuColorSpinorField(param);
139  yH = new cpuColorSpinorField(param);
140  zH = new cpuColorSpinorField(param);
141  hH = new cpuColorSpinorField(param);
142  mH = new cpuColorSpinorField(param);
143  lH = new cpuColorSpinorField(param);
144 
145 // create composite fields
146 
147  // xmH = new cpuColorSpinorField(param);
148  // ymH = new cpuColorSpinorField(param);
149 
150 
151 
152  xmH.reserve(Nsrc);
153  for (int cid = 0; cid < Nsrc; cid++) xmH.push_back(new cpuColorSpinorField(param));
154  ymH.reserve(Msrc);
155  for (int cid = 0; cid < Msrc; cid++) ymH.push_back(new cpuColorSpinorField(param));
156  zmH.reserve(Nsrc);
157  for (int cid = 0; cid < Nsrc; cid++) zmH.push_back(new cpuColorSpinorField(param));
158 
159 
160  static_cast<cpuColorSpinorField*>(vH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
161  static_cast<cpuColorSpinorField*>(wH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
162  static_cast<cpuColorSpinorField*>(xH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
163  static_cast<cpuColorSpinorField*>(yH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
164  static_cast<cpuColorSpinorField*>(zH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
165  static_cast<cpuColorSpinorField*>(hH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
166  static_cast<cpuColorSpinorField*>(mH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
167  static_cast<cpuColorSpinorField*>(lH)->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
168  for(int i=0; i<Nsrc; i++){
169  static_cast<cpuColorSpinorField*>(xmH[i])->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
170  }
171  for(int i=0; i<Msrc; i++){
172  static_cast<cpuColorSpinorField*>(ymH[i])->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
173  }
174  // Now set the parameters for the cuda fields
175  //param.pad = xdim*ydim*zdim/2;
176 
177  if (param.nSpin == 4) param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
179 
180  switch(prec) {
181  case 0:
183  high_aux_prec = QUDA_DOUBLE_PRECISION;
184  mid_aux_prec = QUDA_SINGLE_PRECISION;
185  low_aux_prec = QUDA_HALF_PRECISION;
186  break;
187  case 1:
189  high_aux_prec = QUDA_DOUBLE_PRECISION;
190  mid_aux_prec = QUDA_SINGLE_PRECISION;
191  low_aux_prec = QUDA_QUARTER_PRECISION;
192  break;
193  case 2:
195  high_aux_prec = QUDA_DOUBLE_PRECISION;
196  mid_aux_prec = QUDA_HALF_PRECISION;
197  low_aux_prec = QUDA_QUARTER_PRECISION;
198  break;
199  case 3:
201  high_aux_prec = QUDA_SINGLE_PRECISION;
202  mid_aux_prec = QUDA_HALF_PRECISION;
203  low_aux_prec = QUDA_QUARTER_PRECISION;
204  break;
205  default:
206  errorQuda("Precision option not defined");
207  }
208 
209  checkCudaError();
210 
211  vD = new cudaColorSpinorField(param);
212  wD = new cudaColorSpinorField(param);
213  xD = new cudaColorSpinorField(param);
214  yD = new cudaColorSpinorField(param);
215  zD = new cudaColorSpinorField(param);
216 
217  param.is_composite = true;
218  param.is_component = false;
219 
220 // create composite fields
221  param.composite_dim = Nsrc;
222  xmD = new cudaColorSpinorField(param);
223 
224  param.composite_dim = Msrc;
225  ymD = new cudaColorSpinorField(param);
226 
227  param.composite_dim = Nsrc;
228  zmD = new cudaColorSpinorField(param);
229 
230  param.is_composite = false;
231  param.is_component = false;
232  param.composite_dim = 1;
233 
234  setPrec(param, high_aux_prec);
235  hD = new cudaColorSpinorField(param);
236 
237  setPrec(param, mid_aux_prec);
238  mD = new cudaColorSpinorField(param);
239 
240  setPrec(param, low_aux_prec);
241  lD = new cudaColorSpinorField(param);
242 
243  // check for successful allocation
244  checkCudaError();
245 
246  // only do copy if not doing half precision with mg
247  bool flag = !(param.nSpin == 2 &&
248  (prec == 0 || prec == 1 || low_aux_prec == QUDA_HALF_PRECISION || mid_aux_prec == QUDA_HALF_PRECISION ||
249  low_aux_prec == QUDA_QUARTER_PRECISION || mid_aux_prec == QUDA_QUARTER_PRECISION) );
250 
251  if ( flag ) {
252  *vD = *vH;
253  *wD = *wH;
254  *xD = *xH;
255  *yD = *yH;
256  *zD = *zH;
257  *hD = *hH;
258  *mD = *mH;
259  *lD = *lH;
260  // for (int i=0; i < Nsrc; i++){
261  // xmD->Component(i) = *(xmH[i]);
262  // ymD->Component(i) = *(ymH[i]);
263  // }
264  // *ymD = *ymH;
265  }
266 }
267 
268 
270 {
271 
272  // release memory
273  delete vD;
274  delete wD;
275  delete xD;
276  delete yD;
277  delete zD;
278  delete hD;
279  delete mD;
280  delete lD;
281  delete xmD;
282  delete ymD;
283  delete zmD;
284 
285  // release memory
286  delete vH;
287  delete wH;
288  delete xH;
289  delete yH;
290  delete zH;
291  delete hH;
292  delete mH;
293  delete lH;
294  for (int i=0; i < Nsrc; i++) delete xmH[i];
295  for (int i=0; i < Msrc; i++) delete ymH[i];
296  for (int i=0; i < Nsrc; i++) delete zmH[i];
297  xmH.clear();
298  ymH.clear();
299  zmH.clear();
300 }
301 
302 
303 double benchmark(int kernel, const int niter) {
304 
305  double a = 1.0, b = 2.0, c = 3.0;
306  quda::Complex a2, b2;
310  quda::Complex * A2 = new quda::Complex[Nsrc*Nsrc]; // for the block cDotProductNorm test
311 
312  cudaEvent_t start, end;
313  cudaEventCreate(&start);
314  cudaEventCreate(&end);
315  cudaEventRecord(start, 0);
316 
317  {
318  switch (kernel) {
319 
320  case 0:
321  for (int i=0; i < niter; ++i) blas::copy(*yD, *hD);
322  break;
323 
324  case 1:
325  for (int i=0; i < niter; ++i) blas::copy(*yD, *mD);
326  break;
327 
328  case 2:
329  for (int i=0; i < niter; ++i) blas::copy(*yD, *lD);
330  break;
331 
332  case 3:
333  for (int i=0; i < niter; ++i) blas::axpby(a, *xD, b, *yD);
334  break;
335 
336  case 4:
337  for (int i=0; i < niter; ++i) blas::xpy(*xD, *yD);
338  break;
339 
340  case 5:
341  for (int i=0; i < niter; ++i) blas::axpy(a, *xD, *yD);
342  break;
343 
344  case 6:
345  for (int i=0; i < niter; ++i) blas::xpay(*xD, a, *yD);
346  break;
347 
348  case 7:
349  for (int i=0; i < niter; ++i) blas::mxpy(*xD, *yD);
350  break;
351 
352  case 8:
353  for (int i=0; i < niter; ++i) blas::ax(a, *xD);
354  break;
355 
356  case 9:
357  for (int i=0; i < niter; ++i) blas::caxpy(a2, *xD, *yD);
358  break;
359 
360  case 10:
361  for (int i=0; i < niter; ++i) blas::caxpby(a2, *xD, b2, *yD);
362  break;
363 
364  case 11:
365  for (int i=0; i < niter; ++i) blas::cxpaypbz(*xD, a2, *yD, b2, *zD);
366  break;
367 
368  case 12:
369  for (int i=0; i < niter; ++i) blas::axpyBzpcx(a, *xD, *yD, b, *zD, c);
370  break;
371 
372  case 13:
373  for (int i=0; i < niter; ++i) blas::axpyZpbx(a, *xD, *yD, *zD, b);
374  break;
375 
376  case 14:
377  for (int i=0; i < niter; ++i) blas::caxpbypzYmbw(a2, *xD, b2, *yD, *zD, *wD);
378  break;
379 
380  case 15:
381  for (int i=0; i < niter; ++i) blas::cabxpyAx(a, b2, *xD, *yD);
382  break;
383 
384  case 16:
385  for (int i=0; i < niter; ++i) blas::caxpyXmaz(a2, *xD, *yD, *zD);
386  break;
387 
388  // double
389  case 17:
390  for (int i=0; i < niter; ++i) blas::norm2(*xD);
391  break;
392 
393  case 18:
394  for (int i=0; i < niter; ++i) blas::reDotProduct(*xD, *yD);
395  break;
396 
397  case 19:
398  for (int i=0; i < niter; ++i) blas::axpyNorm(a, *xD, *yD);
399  break;
400 
401  case 20:
402  for (int i=0; i < niter; ++i) blas::xmyNorm(*xD, *yD);
403  break;
404 
405  case 21:
406  for (int i=0; i < niter; ++i) blas::caxpyNorm(a2, *xD, *yD);
407  break;
408 
409  case 22:
410  for (int i=0; i < niter; ++i) blas::caxpyXmazNormX(a2, *xD, *yD, *zD);
411  break;
412 
413  case 23:
414  for (int i=0; i < niter; ++i) blas::cabxpyzAxNorm(a, b2, *xD, *yD, *yD);
415  break;
416 
417  // double2
418  case 24:
419  for (int i=0; i < niter; ++i) blas::cDotProduct(*xD, *yD);
420  break;
421 
422  case 25:
423  for (int i=0; i < niter; ++i) blas::caxpyDotzy(a2, *xD, *yD, *zD);
424  break;
425 
426  // double3
427  case 26:
428  for (int i=0; i < niter; ++i) blas::cDotProductNormA(*xD, *yD);
429  break;
430 
431  case 27:
432  for (int i=0; i < niter; ++i) blas::cDotProductNormB(*xD, *yD);
433  break;
434 
435  case 28:
436  for (int i=0; i < niter; ++i) blas::caxpbypzYmbwcDotProductUYNormY(a2, *xD, b2, *yD, *zD, *wD, *vD);
437  break;
438 
439  case 29:
440  for (int i=0; i < niter; ++i) blas::HeavyQuarkResidualNorm(*xD, *yD);
441  break;
442 
443  case 30:
444  for (int i=0; i < niter; ++i) blas::xpyHeavyQuarkResidualNorm(*xD, *yD, *zD);
445  break;
446 
447  case 31:
448  for (int i=0; i < niter; ++i) blas::tripleCGReduction(*xD, *yD, *zD);
449  break;
450 
451  case 32:
452  for (int i=0; i < niter; ++i) blas::tripleCGUpdate(a, b, *xD, *yD, *zD, *wD);
453  break;
454 
455  case 33:
456  for (int i=0; i < niter; ++i) blas::axpyReDot(a, *xD, *yD);
457  break;
458 
459  case 34:
460  for (int i=0; i < niter; ++i) blas::caxpy(A, *xmD,* ymD);
461  break;
462 
463  case 35:
464  for (int i=0; i < niter; ++i) blas::axpyBzpcx((double*)A, xmD->Components(), zmD->Components(), (double*)B, *yD, (double*)C);
465  break;
466 
467  case 36:
468  for (int i=0; i < niter; ++i) blas::caxpyBxpz(a2, *xD, *yD, b2, *zD);
469  break;
470 
471  case 37:
472  for (int i=0; i < niter; ++i) blas::caxpyBzpx(a2, *xD, *yD, b2, *zD);
473  break;
474 
475  case 38:
476  for (int i=0; i < niter; ++i) blas::cDotProduct(A2, xmD->Components(), xmD->Components());
477  break;
478 
479  case 39:
480  for (int i=0; i < niter; ++i) blas::cDotProduct(A, xmD->Components(), ymD->Components());
481  break;
482 
483  default:
484  errorQuda("Undefined blas kernel %d\n", kernel);
485  }
486  }
487 
488  cudaEventRecord(end, 0);
489  cudaEventSynchronize(end);
490  float runTime;
491  cudaEventElapsedTime(&runTime, start, end);
492  cudaEventDestroy(start);
493  cudaEventDestroy(end);
494  delete[] A;
495  delete[] B;
496  delete[] C;
497  delete[] A2;
498  double secs = runTime / 1000;
499  return secs;
500 }
501 
502 #define ERROR(a) fabs(blas::norm2(*a##D) - blas::norm2(*a##H)) / blas::norm2(*a##H)
503 
504 double test(int kernel) {
505 
506  double a = M_PI, b = M_PI*exp(1.0), c = sqrt(M_PI);
507  quda::Complex a2(a, b), b2(b, -c), c2(a+b, c*a);
508  double error = 0;
512  quda::Complex * A2 = new quda::Complex[Nsrc*Nsrc]; // for the block cDotProductNorm test
513  quda::Complex * B2 = new quda::Complex[Nsrc*Nsrc]; // for the block cDotProductNorm test
514  for(int i=0; i < Nsrc*Msrc; i++){
515  A[i] = a2* (1.0*((i/Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(Nsrc*Msrc/2-i));
516  B[i] = a2* (1.0*((i/Nsrc) + i)) - b2 * (M_PI*i) + c2 *(1.0*(Nsrc*Msrc/2-i));
517  C[i] = a2* (1.0*((M_PI/Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(Nsrc*Msrc/2-i));
518  }
519  for(int i=0; i < Nsrc*Nsrc; i++){
520  A2[i] = a2* (1.0*((i/Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(Nsrc*Nsrc/2-i));
521  B2[i] = a2* (1.0*((i/Nsrc) + i)) - b2 * (M_PI*i) + c2 *(1.0*(Nsrc*Nsrc/2-i));
522  }
523  // A[0] = a2;
524  // A[1] = 0.;
525  // A[2] = 0.;
526  // A[3] = 0.;
527 
528  switch (kernel) {
529 
530  case 0:
531  *hD = *hH;
532  blas::copy(*yD, *hD);
533  blas::copy(*yH, *hH);
534  error = ERROR(y);
535  break;
536 
537  case 1:
538  *mD = *mH;
539  blas::copy(*yD, *mD);
540  blas::copy(*yH, *mH);
541  error = ERROR(y);
542  break;
543 
544  case 2:
545  *lD = *lH;
546  blas::copy(*yD, *lD);
547  blas::copy(*yH, *lH);
548  error = ERROR(y);
549  break;
550 
551  case 3:
552  *xD = *xH;
553  *yD = *yH;
554  blas::axpby(a, *xD, b, *yD);
555  blas::axpby(a, *xH, b, *yH);
556  error = ERROR(y);
557  break;
558 
559  case 4:
560  *xD = *xH;
561  *yD = *yH;
562  blas::xpy(*xD, *yD);
563  blas::xpy(*xH, *yH);
564  error = ERROR(y);
565  break;
566 
567  case 5:
568  *xD = *xH;
569  *yD = *yH;
570  blas::axpy(a, *xD, *yD);
571  blas::axpy(a, *xH, *yH);
572  *zH = *yD;
573  error = ERROR(y);
574  break;
575 
576  case 6:
577  *xD = *xH;
578  *yD = *yH;
579  blas::xpay(*xD, a, *yD);
580  blas::xpay(*xH, a, *yH);
581  error = ERROR(y);
582  break;
583 
584  case 7:
585  *xD = *xH;
586  *yD = *yH;
587  blas::mxpy(*xD, *yD);
588  blas::mxpy(*xH, *yH);
589  error = ERROR(y);
590  break;
591 
592  case 8:
593  *xD = *xH;
594  blas::ax(a, *xD);
595  blas::ax(a, *xH);
596  error = ERROR(x);
597  break;
598 
599  case 9:
600  *xD = *xH;
601  *yD = *yH;
602  blas::caxpy(a2, *xD, *yD);
603  blas::caxpy(a2, *xH, *yH);
604  error = ERROR(y);
605  break;
606 
607  case 10:
608  *xD = *xH;
609  *yD = *yH;
610  blas::caxpby(a2, *xD, b2, *yD);
611  blas::caxpby(a2, *xH, b2, *yH);
612  error = ERROR(y);
613  break;
614 
615  case 11:
616  *xD = *xH;
617  *yD = *yH;
618  *zD = *zH;
619  blas::cxpaypbz(*xD, a2, *yD, b2, *zD);
620  blas::cxpaypbz(*xH, a2, *yH, b2, *zH);
621  error = ERROR(z);
622  break;
623 
624  case 12:
625  *xD = *xH;
626  *yD = *yH;
627  *zD = *zH;
628  blas::axpyBzpcx(a, *xD, *yD, b, *zD, c);
629  blas::axpyBzpcx(a, *xH, *yH, b, *zH, c);
630  error = ERROR(x) + ERROR(y);
631  break;
632 
633  case 13:
634  *xD = *xH;
635  *yD = *yH;
636  *zD = *zH;
637  blas::axpyZpbx(a, *xD, *yD, *zD, b);
638  blas::axpyZpbx(a, *xH, *yH, *zH, b);
639  error = ERROR(x) + ERROR(y);
640  break;
641 
642  case 14:
643  *xD = *xH;
644  *yD = *yH;
645  *zD = *zH;
646  *wD = *wH;
647  blas::caxpbypzYmbw(a2, *xD, b2, *yD, *zD, *wD);
648  blas::caxpbypzYmbw(a2, *xH, b2, *yH, *zH, *wH);
649  error = ERROR(z) + ERROR(y);
650  break;
651 
652  case 15:
653  *xD = *xH;
654  *yD = *yH;
655  blas::cabxpyAx(a, b2, *xD, *yD);
656  blas::cabxpyAx(a, b2, *xH, *yH);
657  error = ERROR(y) + ERROR(x);
658  break;
659 
660  case 16:
661  *xD = *xH;
662  *yD = *yH;
663  *zD = *zH;
664  {blas::caxpyXmaz(a, *xD, *yD, *zD);
665  blas::caxpyXmaz(a, *xH, *yH, *zH);
666  error = ERROR(y) + ERROR(x);}
667  break;
668 
669  case 17:
670  *xD = *xH;
671  *yH = *xD;
672  error = fabs(blas::norm2(*xD) - blas::norm2(*xH)) / blas::norm2(*xH);
673  break;
674 
675  case 18:
676  *xD = *xH;
677  *yD = *yH;
678  error = fabs(blas::reDotProduct(*xD, *yD) - blas::reDotProduct(*xH, *yH)) / fabs(blas::reDotProduct(*xH, *yH));
679  break;
680 
681  case 19:
682  *xD = *xH;
683  *yD = *yH;
684  {double d = blas::axpyNorm(a, *xD, *yD);
685  double h = blas::axpyNorm(a, *xH, *yH);
686  error = ERROR(y) + fabs(d-h)/fabs(h);}
687  break;
688 
689  case 20:
690  *xD = *xH;
691  *yD = *yH;
692  {double d = blas::xmyNorm(*xD, *yD);
693  double h = blas::xmyNorm(*xH, *yH);
694  error = ERROR(y) + fabs(d-h)/fabs(h);}
695  break;
696 
697  case 21:
698  *xD = *xH;
699  *yD = *yH;
700  {double d = blas::caxpyNorm(a, *xD, *yD);
701  double h = blas::caxpyNorm(a, *xH, *yH);
702  error = ERROR(y) + fabs(d-h)/fabs(h);}
703  break;
704 
705  case 22:
706  *xD = *xH;
707  *yD = *yH;
708  *zD = *zH;
709  {double d = blas::caxpyXmazNormX(a, *xD, *yD, *zD);
710  double h = blas::caxpyXmazNormX(a, *xH, *yH, *zH);
711  error = ERROR(y) + ERROR(x) + fabs(d-h)/fabs(h);}
712  break;
713 
714  case 23:
715  *xD = *xH;
716  *yD = *yH;
717  {double d = blas::cabxpyzAxNorm(a, b2, *xD, *yD, *yD);
718  double h = blas::cabxpyzAxNorm(a, b2, *xH, *yH, *yH);
719  error = ERROR(x) + ERROR(y) + fabs(d-h)/fabs(h);}
720  break;
721 
722  case 24:
723  *xD = *xH;
724  *yD = *yH;
725  error = abs(blas::cDotProduct(*xD, *yD) - blas::cDotProduct(*xH, *yH)) / abs(blas::cDotProduct(*xH, *yH));
726  break;
727 
728  case 25:
729  *xD = *xH;
730  *yD = *yH;
731  *zD = *zH;
732  {quda::Complex d = blas::caxpyDotzy(a, *xD, *yD, *zD);
733  quda::Complex h = blas::caxpyDotzy(a, *xH, *yH, *zH);
734  error = ERROR(y) + abs(d-h)/abs(h);}
735  break;
736 
737  case 26:
738  *xD = *xH;
739  *yD = *yH;
740  { double3 d = blas::cDotProductNormA(*xD, *yD);
741  double3 h = blas::cDotProductNormA(*xH, *yH);
742  error = abs(Complex(d.x - h.x, d.y - h.y)) / abs(Complex(h.x, h.y)) + fabs(d.z - h.z) / fabs(h.z);
743  }
744  break;
745 
746  case 27:
747  *xD = *xH;
748  *yD = *yH;
749  { double3 d = blas::cDotProductNormB(*xD, *yD);
750  double3 h = blas::cDotProductNormB(*xH, *yH);
751  error = abs(Complex(d.x - h.x, d.y - h.y)) / abs(Complex(h.x, h.y)) + fabs(d.z - h.z) / fabs(h.z);
752  }
753  break;
754 
755  case 28:
756  *xD = *xH;
757  *yD = *yH;
758  *zD = *zH;
759  *wD = *wH;
760  *vD = *vH;
761  { double3 d = blas::caxpbypzYmbwcDotProductUYNormY(a2, *xD, b2, *yD, *zD, *wD, *vD);
762  double3 h = blas::caxpbypzYmbwcDotProductUYNormY(a2, *xH, b2, *yH, *zH, *wH, *vH);
763  error = ERROR(z) + ERROR(y) + abs(Complex(d.x - h.x, d.y - h.y)) / abs(Complex(h.x, h.y))
764  + fabs(d.z - h.z) / fabs(h.z);
765  }
766  break;
767 
768  case 29:
769  *xD = *xH;
770  *yD = *yH;
771  { double3 d = blas::HeavyQuarkResidualNorm(*xD, *yD);
772  double3 h = blas::HeavyQuarkResidualNorm(*xH, *yH);
773  error = fabs(d.x - h.x) / fabs(h.x) +
774  fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
775  break;
776 
777  case 30:
778  *xD = *xH;
779  *yD = *yH;
780  *zD = *zH;
781  { double3 d = blas::xpyHeavyQuarkResidualNorm(*xD, *yD, *zD);
782  double3 h = blas::xpyHeavyQuarkResidualNorm(*xH, *yH, *zH);
783  error = ERROR(y) + fabs(d.x - h.x) / fabs(h.x) +
784  fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
785  break;
786 
787  case 31:
788  *xD = *xH;
789  *yD = *yH;
790  *zD = *zH;
791  { double3 d = blas::tripleCGReduction(*xD, *yD, *zD);
792  double3 h = make_double3(blas::norm2(*xH), blas::norm2(*yH), blas::reDotProduct(*yH, *zH));
793  error = fabs(d.x - h.x) / fabs(h.x) +
794  fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
795  break;
796 
797  case 32:
798  *xD = *xH;
799  *yD = *yH;
800  *zD = *zH;
801  *wD = *wH;
802  { blas::tripleCGUpdate(a, b, *xD, *yD, *zD, *wD);
803  blas::tripleCGUpdate(a, b, *xH, *yH, *zH, *wH);
804  error = ERROR(y) + ERROR(z) + ERROR(w); }
805  break;
806 
807  case 33:
808  *xD = *xH;
809  *yD = *yH;
810  { double d = blas::axpyReDot(a, *xD, *yD);
811  double h = blas::axpyReDot(a, *xH, *yH);
812  error = ERROR(y) + fabs(d-h)/fabs(h); }
813  break;
814 
815  case 34:
816  for (int i=0; i < Nsrc; i++) xmD->Component(i) = *(xmH[i]);
817  for (int i=0; i < Msrc; i++) ymD->Component(i) = *(ymH[i]);
818 
819  blas::caxpy(A, *xmD, *ymD);
820  for (int i=0; i < Nsrc; i++){
821  for(int j=0; j < Msrc; j++){
822  blas::caxpy(A[Msrc*i+j], *(xmH[i]), *(ymH[j]));
823  }
824  }
825  error = 0;
826  for (int i=0; i < Msrc; i++){
827  error+= fabs(blas::norm2((ymD->Component(i))) - blas::norm2(*(ymH[i]))) / blas::norm2(*(ymH[i]));
828  }
829  error/= Msrc;
830  break;
831 
832  case 35:
833  for (int i=0; i < Nsrc; i++) {
834  xmD->Component(i) = *(xmH[i]);
835  zmD->Component(i) = *(zmH[i]);
836  }
837  *yD = *yH;
838 
839  blas::axpyBzpcx((double*)A, xmD->Components(), zmD->Components(), (double*)B, *yD, (const double*)C);
840 
841  for (int i=0; i<Nsrc; i++) {
842  blas::axpyBzpcx(((double*)A)[i], *xmH[i], *zmH[i], ((double*)B)[i], *yH, ((double*)C)[i]);
843  }
844 
845  error = 0;
846  for (int i=0; i < Nsrc; i++){
847  error+= fabs(blas::norm2((xmD->Component(i))) - blas::norm2(*(xmH[i]))) / blas::norm2(*(xmH[i]));
848  //error+= fabs(blas::norm2((zmD->Component(i))) - blas::norm2(*(zmH[i]))) / blas::norm2(*(zmH[i]));
849  }
850  error/= Nsrc;
851  break;
852 
853  case 36:
854  *xD = *xH;
855  *yD = *yH;
856  *zD = *zH;
857  {blas::caxpyBxpz(a, *xD, *yD, b2, *zD);
858  blas::caxpyBxpz(a, *xH, *yH, b2, *zH);
859  error = ERROR(x) + ERROR(z);}
860  break;
861 
862  case 37:
863  *xD = *xH;
864  *yD = *yH;
865  *zD = *zH;
866  {blas::caxpyBzpx(a, *xD, *yD, b2, *zD);
867  blas::caxpyBzpx(a, *xH, *yH, b2, *zH);
868  error = ERROR(x) + ERROR(z);}
869  break;
870 
871  case 38:
872  for (int i=0; i < Nsrc; i++) xmD->Component(i) = *(xmH[i]);
873  blas::cDotProduct(A2, xmD->Components(), xmD->Components());
874  error = 0.0;
875  for (int i = 0; i < Nsrc; i++) {
876  for (int j = 0; j < Nsrc; j++) {
877  B2[i*Nsrc+j] = blas::cDotProduct(xmD->Component(i), xmD->Component(j));
878  error += std::abs(A2[i*Nsrc+j] - B2[i*Nsrc+j])/std::abs(B2[i*Nsrc+j]);
879  }
880  }
881  error /= Nsrc*Nsrc;
882  break;
883 
884  case 39:
885  for (int i=0; i < Nsrc; i++) xmD->Component(i) = *(xmH[i]);
886  for (int i=0; i < Msrc; i++) ymD->Component(i) = *(ymH[i]);
887  blas::cDotProduct(A, xmD->Components(), ymD->Components());
888  error = 0.0;
889  for (int i = 0; i < Nsrc; i++) {
890  for (int j = 0; j < Msrc; j++) {
891  B[i*Msrc+j] = blas::cDotProduct(xmD->Component(i), ymD->Component(j));
892  error += std::abs(A[i*Msrc+j] - B[i*Msrc+j])/std::abs(B[i*Msrc+j]);
893  }
894  }
895  error /= Nsrc*Msrc;
896  break;
897 
898  default:
899  errorQuda("Undefined blas kernel %d\n", kernel);
900  }
901  delete[] A;
902  delete[] B;
903  delete[] C;
904  delete[] A2;
905  delete[] B2;
906  return error;
907 }
908 
909 const char *prec_str[] = {"quarter", "half", "single", "double"};
910 
911 
912 // For googletest names must be non-empty, unique, and may only contain ASCII
913 // alphanumeric characters or underscore
914 const char *names[] = {
915  "copyHS",
916  "copyMS",
917  "copyLS",
918  "axpby",
919  "xpy",
920  "axpy",
921  "xpay",
922  "mxpy",
923  "ax",
924  "caxpy",
925  "caxpby",
926  "cxpaypbz",
927  "axpyBzpcx",
928  "axpyZpbx",
929  "caxpbypzYmbw",
930  "cabxpyAx",
931  "caxpyXmaz",
932  "norm",
933  "reDotProduct",
934  "axpyNorm",
935  "xmyNorm",
936  "caxpyNorm",
937  "caxpyXmazNormX",
938  "cabxpyzAxNorm",
939  "cDotProduct",
940  "caxpyDotzy",
941  "cDotProductNormA",
942  "cDotProductNormB",
943  "caxpbypzYmbwcDotProductUYNormY",
944  "HeavyQuarkResidualNorm",
945  "xpyHeavyQuarkResidualNorm",
946  "tripleCGReduction",
947  "tripleCGUpdate",
948  "axpyReDot",
949  "caxpy_block",
950  "axpyBzpcx_block",
951  "caxpyBxpz",
952  "caxpyBzpx",
953  "cDotProductNorm_block",
954  "cDotProduct_block",
955 };
956 
957 int main(int argc, char** argv)
958 {
959 
960  ::testing::InitGoogleTest(&argc, argv);
961  int result = 0;
962 
964  test_type = -1;
965 
966  for (int i = 1; i < argc; i++){
967  if(process_command_line_option(argc, argv, &i) == 0){
968  continue;
969  }
970  printfQuda("ERROR: Invalid option:%s\n", argv[i]);
971  usage(argv);
972  }
973 
974  // override spin setting if mg solver is set to test coarse grids
975  if (inv_type == QUDA_MG_INVERTER) {
976  Nspin = 2;
977  Ncolor = nvec[0];
978  if (Ncolor == 0) Ncolor = 24;
979  } else {
980  // set spin according to the type of dslash
983  Ncolor = 3;
984  }
985 
986  setSpinorSiteSize(24);
987  initComms(argc, argv, gridsize_from_cmdline);
989  initQuda(device);
990 
992 
993  // clear the error state
994  cudaGetLastError();
995 
996  // lastly check for correctness
997  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
998  if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
999  result = RUN_ALL_TESTS();
1000 
1001  endQuda();
1002 
1003  finalizeComms();
1004  return result;
1005 }
1006 
1007 // The following tests each kernel at each precision using the google testing framework
1008 
1009 using ::testing::TestWithParam;
1010 using ::testing::Bool;
1011 using ::testing::Values;
1012 using ::testing::Range;
1013 using ::testing::Combine;
1014 
1015 class BlasTest : public ::testing::TestWithParam<::testing::tuple<int, int>> {
1016 protected:
1017  ::testing::tuple<int, int> param;
1018 
1019 public:
1020  virtual ~BlasTest() { }
1021  virtual void SetUp() {
1022  param = GetParam();
1023  initFields(::testing::get<0>(GetParam()));
1024  }
1025  virtual void TearDown() { freeFields(); }
1026 
1027 };
1028 
1029 
1030 TEST_P(BlasTest, verify) {
1031  int prec = ::testing::get<0>(GetParam());
1032  int kernel = ::testing::get<1>(GetParam());
1033  if (skip_kernel(prec, kernel)) GTEST_SKIP();
1034 
1035  // certain tests will fail to run for coarse grids so mark these as
1036  // failed without running
1037  double deviation = test(kernel);
1038  // printfQuda("%-35s error = %e\n", names[kernel], deviation);
1039  double tol = (prec == 3 ? 1e-12 : (prec == 2 ? 1e-6 : (prec == 1 ? 1e-4 : 1e-2)));
1040  tol = (kernel < 4) ? 5e-2 : tol; // use different tolerance for copy
1041  EXPECT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
1042 }
1043 
1045  int prec = ::testing::get<0>(GetParam());
1046  int kernel = ::testing::get<1>(GetParam());
1047  if (skip_kernel(prec, kernel)) GTEST_SKIP();
1048 
1049  // do the initial tune
1050  benchmark(kernel, 1);
1051 
1052  // now rerun with more iterations to get accurate speed measurements
1053  quda::blas::flops = 0;
1054  quda::blas::bytes = 0;
1055 
1056  double secs = benchmark(kernel, niter);
1057 
1058  double gflops = (quda::blas::flops*1e-9)/(secs);
1059  double gbytes = quda::blas::bytes/(secs*1e9);
1060  RecordProperty("Gflops", std::to_string(gflops));
1061  RecordProperty("GBs", std::to_string(gbytes));
1062  printfQuda("%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n", names[kernel], gflops, gbytes);
1063 }
1064 
1065 std::string getblasname(testing::TestParamInfo<::testing::tuple<int, int>> param){
1066  int prec = ::testing::get<0>(param.param);
1067  int kernel = ::testing::get<1>(param.param);
1068  std::string str(names[kernel]);
1069  str += std::string("_");
1070  str += std::string(prec_str[prec]);
1071  return str; // names[kernel] + "_" + prec_str[prec];
1072 }
1073 
1074 // instantiate all test cases
1075 INSTANTIATE_TEST_SUITE_P(QUDA, BlasTest, Combine(Range(0, 4), Range(0, Nkernels)), getblasname);
QudaDslashType dslash_type
Definition: test_util.cpp:1621
ColorSpinorField * ymD
Definition: blas_test.cu:41
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
void ax(double a, ColorSpinorField &x)
Definition: blas_quda.cu:508
int comm_rank(void)
Definition: comm_mpi.cpp:82
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void endQuda(void)
enum QudaPrecision_s QudaPrecision
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:778
double test(int kernel)
Definition: blas_test.cu:504
int Msrc
Definition: test_util.cpp:1628
double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:746
void axpyZpbx(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, double b)
Definition: blas_quda.cu:552
void display_test_info()
Definition: blas_test.cu:58
void end(void)
Definition: blas_quda.cu:489
__host__ __device__ ValueType exp(ValueType x)
Definition: complex_quda.h:96
#define errorQuda(...)
Definition: util_quda.h:121
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:721
QudaInverterType inv_type
Definition: test_util.cpp:1640
ColorSpinorField * mD
Definition: blas_test.cu:41
int xdim
Definition: test_util.cpp:1615
enum QudaSolveType_s QudaSolveType
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:764
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
ColorSpinorField * yH
Definition: blas_test.cu:40
::testing::tuple< int, int > param
Definition: blas_test.cu:1017
ColorSpinorField * vH
Definition: blas_test.cu:40
virtual ~BlasTest()
Definition: blas_test.cu:1020
void cabxpyAx(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:591
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
Definition: reduce_quda.cu:818
std::vector< cpuColorSpinorField * > ymH
Definition: blas_test.cu:43
CompositeColorSpinorField & Components()
ColorSpinorField * yD
Definition: blas_test.cu:41
int Nprec
Definition: blas_test.cu:72
bool skip_kernel(int precision, int kernel)
Definition: blas_test.cu:74
std::vector< cpuColorSpinorField * > xmH
Definition: blas_test.cu:42
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:728
int zdim
Definition: test_util.cpp:1617
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:355
void finalizeComms()
Definition: test_util.cpp:128
ColorSpinorField & Component(const int idx) const
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:75
void caxpyBzpx(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:563
int Nspin
Definition: blas_test.cu:45
QudaSiteSubset siteSubset
Definition: lattice_field.h:71
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:574
ColorSpinorField * xD
Definition: blas_test.cu:41
ColorSpinorField * wD
Definition: blas_test.cu:41
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
Definition: blas_quda.h:37
int tdim
ColorSpinorField * mH
Definition: blas_test.cu:40
ColorSpinorField * wH
Definition: blas_test.cu:40
QudaGaugeParam param
Definition: pack_test.cpp:17
void usage(char **)
Definition: test_util.cpp:1783
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
const char * names[]
Definition: blas_test.cu:914
bool is_composite
for deflation solvers:
void initQuda(int device)
double tol
Definition: test_util.cpp:1656
QudaPrecision getPrecision(int i)
Definition: test_util.h:129
double benchmark(int kernel, const int niter)
Definition: blas_test.cu:303
void initFields(int prec)
Definition: blas_test.cu:101
virtual void SetUp()
Definition: blas_test.cu:1021
ColorSpinorField * zD
Definition: blas_test.cu:41
virtual void TearDown()
Definition: blas_test.cu:1025
ColorSpinorField * xmD
Definition: blas_test.cu:41
void setSpinorSiteSize(int n)
Definition: test_util.cpp:211
ColorSpinorField * vD
Definition: blas_test.cu:41
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:35
int nvec[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1637
void caxpbypzYmbw(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &, ColorSpinorField &)
Definition: blas_quda.cu:585
const char * prec_str[]
Definition: blas_test.cu:909
void axpyBzpcx(double a, ColorSpinorField &x, ColorSpinorField &y, double b, ColorSpinorField &z, double c)
Definition: blas_quda.cu:541
int niter
Definition: test_util.cpp:1629
double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:758
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:809
ColorSpinorField * xH
Definition: blas_test.cu:40
std::complex< double > Complex
Definition: quda_internal.h:46
double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:752
void tripleCGUpdate(double alpha, double beta, ColorSpinorField &q, ColorSpinorField &r, ColorSpinorField &x, ColorSpinorField &p)
Definition: blas_quda.cu:614
double axpyReDot(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:740
const int Nkernels
Definition: blas_test.cu:36
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:771
ColorSpinorField * lH
Definition: blas_test.cu:40
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:512
QudaSolveType solve_type
Definition: test_util.cpp:1663
ColorSpinorField * hD
Definition: blas_test.cu:41
double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &u)
Definition: reduce_quda.cu:783
ColorSpinorField * lD
Definition: blas_test.cu:41
int Ncolor
Definition: blas_test.cu:46
int Nsrc
Definition: test_util.cpp:1627
int ydim
Definition: test_util.cpp:1616
void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: blas_quda.cu:597
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
ColorSpinorField * zmD
Definition: blas_test.cu:41
void setPrec(ColorSpinorParam &param, const QudaPrecision precision)
Definition: blas_test.cu:48
#define printfQuda(...)
Definition: util_quda.h:115
unsigned long long flops
Definition: blas_quda.cu:22
void xpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:33
std::string getblasname(testing::TestParamInfo<::testing::tuple< int, int >> param)
Definition: blas_test.cu:1065
void axpby(double a, ColorSpinorField &x, double b, ColorSpinorField &y)
Definition: blas_quda.h:36
enum QudaDslashType_s QudaDslashType
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
Definition: blas_quda.cu:523
double axpyNorm(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:74
int device
Definition: test_util.cpp:1602
ColorSpinorField * zH
Definition: blas_test.cu:40
void mxpy(ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:34
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
enum QudaVerbosity_s QudaVerbosity
int test_type
Definition: test_util.cpp:1636
void cxpaypbz(ColorSpinorField &, const Complex &b, ColorSpinorField &y, const Complex &c, ColorSpinorField &z)
Definition: blas_quda.cu:535
#define checkCudaError()
Definition: util_quda.h:161
TEST_P(BlasTest, verify)
Definition: blas_test.cu:1030
INSTANTIATE_TEST_SUITE_P(QUDA, BlasTest, Combine(Range(0, 4), Range(0, Nkernels)), getblasname)
std::vector< cpuColorSpinorField * > zmH
Definition: blas_test.cu:44
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
double3 cDotProductNormB(ColorSpinorField &a, ColorSpinorField &b)
Return (a,b) and ||b||^2 - implemented using cDotProductNormA.
Definition: blas_quda.h:83
ColorSpinorField * hH
Definition: blas_test.cu:40
QudaPrecision prec
Definition: test_util.cpp:1608
void freeFields()
Definition: blas_test.cu:269
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
enum QudaInverterType_s QudaInverterType
unsigned long long bytes
Definition: blas_quda.cu:23
int main(int argc, char **argv)
Definition: blas_test.cu:957
double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:828
#define ERROR(a)
Definition: blas_test.cu:502
QudaVerbosity verbosity
Definition: test_util.cpp:1614