QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
blas_test.cu
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 
4 #include <quda_internal.h>
5 #include <color_spinor_field.h>
6 #include <blas_quda.h>
7 
8 #include <test_util.h>
9 #include <face_quda.h>
10 
11 // include because of nasty globals used in the tests
12 #include <dslash_util.h>
13 
14 // Wilson, clover-improved Wilson, and twisted mass are supported.
16 extern bool tune;
17 extern int device;
18 extern int xdim;
19 extern int ydim;
20 extern int zdim;
21 extern int tdim;
22 extern int gridsize_from_cmdline[];
23 extern int niter;
24 
25 extern bool tune;
26 
27 extern void usage(char** );
28 
29 #if (__COMPUTE_CAPABILITY__ >= 200)
30 const int Nkernels = 32;
31 #else // exclude Heavy Quark Norm if on Tesla architecture
32 const int Nkernels = 31;
33 #endif
34 
35 using namespace quda;
36 
39 int Nspin;
40 
42 {
43  param.precision = precision;
44  if (Nspin == 1 || precision == QUDA_DOUBLE_PRECISION) {
46  } else {
48  }
49 }
50 
51 void
53 {
54  printfQuda("running the following test:\n");
55 
56  printfQuda("S_dimension T_dimension Nspin\n");
57  printfQuda("%d/%d/%d %d %d\n", xdim, ydim, zdim, tdim, Nspin);
58 
59  printfQuda("Grid partition info: X Y Z T\n");
60  printfQuda(" %d %d %d %d\n",
61  dimPartitioned(0),
62  dimPartitioned(1),
63  dimPartitioned(2),
64  dimPartitioned(3));
65 
66  return;
67 }
68 
69 void initFields(int prec)
70 {
71  // precisions used for the source field in the copyCuda() benchmark
72  QudaPrecision high_aux_prec;
73  QudaPrecision low_aux_prec;
74 
76  param.nColor = 3;
77  // set spin according to the type of dslash
78  Nspin = (dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
79  param.nSpin = Nspin;
80  param.nDim = 4; // number of spacetime dimensions
81 
82  param.pad = 0; // padding must be zero for cpu fields
84  if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) param.x[0] = xdim/2;
85  else param.x[0] = xdim;
86  param.x[1] = ydim;
87  param.x[2] = zdim;
88  param.x[3] = tdim;
89 
94 
96 
97  vH = new cpuColorSpinorField(param);
98  wH = new cpuColorSpinorField(param);
99  xH = new cpuColorSpinorField(param);
100  yH = new cpuColorSpinorField(param);
101  zH = new cpuColorSpinorField(param);
102  hH = new cpuColorSpinorField(param);
103  lH = new cpuColorSpinorField(param);
104 
105  vH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
106  wH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
107  xH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
108  yH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
109  zH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
110  hH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
111  lH->Source(QUDA_RANDOM_SOURCE, 0, 0, 0);
112 
113  // Now set the parameters for the cuda fields
114  //param.pad = xdim*ydim*zdim/2;
115 
116  if (param.nSpin == 4) param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
118 
119  switch(prec) {
120  case 0:
122  high_aux_prec = QUDA_DOUBLE_PRECISION;
123  low_aux_prec = QUDA_SINGLE_PRECISION;
124  break;
125  case 1:
127  high_aux_prec = QUDA_DOUBLE_PRECISION;
128  low_aux_prec = QUDA_HALF_PRECISION;
129  break;
130  case 2:
132  high_aux_prec = QUDA_SINGLE_PRECISION;
133  low_aux_prec = QUDA_HALF_PRECISION;
134  break;
135  }
136 
137  checkCudaError();
138 
139  vD = new cudaColorSpinorField(param);
140  wD = new cudaColorSpinorField(param);
141  xD = new cudaColorSpinorField(param);
142  yD = new cudaColorSpinorField(param);
143  zD = new cudaColorSpinorField(param);
144 
145  setPrec(param, high_aux_prec);
146  hD = new cudaColorSpinorField(param);
147 
148  setPrec(param, low_aux_prec);
149  lD = new cudaColorSpinorField(param);
150 
151  // check for successful allocation
152  checkCudaError();
153 
154  *vD = *vH;
155  *wD = *wH;
156  *xD = *xH;
157  *yD = *yH;
158  *zD = *zH;
159  *hD = *hH;
160  *lD = *lH;
161 }
162 
163 
165 {
166 
167  // release memory
168  delete vD;
169  delete wD;
170  delete xD;
171  delete yD;
172  delete zD;
173  delete hD;
174  delete lD;
175 
176  // release memory
177  delete vH;
178  delete wH;
179  delete xH;
180  delete yH;
181  delete zH;
182  delete hH;
183  delete lH;
184 }
185 
186 
187 double benchmark(int kernel, const int niter) {
188 
189  double a, b, c;
190  quda::Complex a2, b2, c2;
191 
192  cudaEvent_t start, end;
193  cudaEventCreate(&start);
194  cudaEventCreate(&end);
195  cudaEventRecord(start, 0);
196 
197  for (int i=0; i < niter; ++i) {
198 
199  switch (kernel) {
200 
201  case 0:
202  copyCuda(*yD, *hD);
203  break;
204 
205  case 1:
206  copyCuda(*yD, *lD);
207  break;
208 
209  case 2:
210  axpbyCuda(a, *xD, b, *yD);
211  break;
212 
213  case 3:
214  xpyCuda(*xD, *yD);
215  break;
216 
217  case 4:
218  axpyCuda(a, *xD, *yD);
219  break;
220 
221  case 5:
222  xpayCuda(*xD, a, *yD);
223  break;
224 
225  case 6:
226  mxpyCuda(*xD, *yD);
227  break;
228 
229  case 7:
230  axCuda(a, *xD);
231  break;
232 
233  case 8:
234  caxpyCuda(a2, *xD, *yD);
235  break;
236 
237  case 9:
238  caxpbyCuda(a2, *xD, b2, *yD);
239  break;
240 
241  case 10:
242  cxpaypbzCuda(*xD, a2, *yD, b2, *zD);
243  break;
244 
245  case 11:
246  axpyBzpcxCuda(a, *xD, *yD, b, *zD, c);
247  break;
248 
249  case 12:
250  axpyZpbxCuda(a, *xD, *yD, *zD, b);
251  break;
252 
253  case 13:
254  caxpbypzYmbwCuda(a2, *xD, b2, *yD, *zD, *wD);
255  break;
256 
257  case 14:
258  cabxpyAxCuda(a, b2, *xD, *yD);
259  break;
260 
261  case 15:
262  caxpbypzCuda(a2, *xD, b2, *yD, *zD);
263  break;
264 
265  case 16:
266  caxpbypczpwCuda(a2, *xD, b2, *yD, c2, *zD, *wD);
267  break;
268 
269  case 17:
270  caxpyXmazCuda(a2, *xD, *yD, *zD);
271  break;
272 
273  // double
274  case 18:
275  normCuda(*xD);
276  break;
277 
278  case 19:
279  reDotProductCuda(*xD, *yD);
280  break;
281 
282  case 20:
283  axpyNormCuda(a, *xD, *yD);
284  break;
285 
286  case 21:
287  xmyNormCuda(*xD, *yD);
288  break;
289 
290  case 22:
291  caxpyNormCuda(a2, *xD, *yD);
292  break;
293 
294  case 23:
295  caxpyXmazNormXCuda(a2, *xD, *yD, *zD);
296  break;
297 
298  case 24:
299  cabxpyAxNormCuda(a, b2, *xD, *yD);
300  break;
301 
302  // double2
303  case 25:
304  cDotProductCuda(*xD, *yD);
305  break;
306 
307  case 26:
308  xpaycDotzyCuda(*xD, a, *yD, *zD);
309  break;
310 
311  case 27:
312  caxpyDotzyCuda(a2, *xD, *yD, *zD);
313  break;
314 
315  // double3
316  case 28:
318  break;
319 
320  case 29:
322  break;
323 
324  case 30:
325  caxpbypzYmbwcDotProductUYNormYCuda(a2, *xD, b2, *yD, *zD, *wD, *vD);
326  break;
327 
328  case 31:
330  break;
331 
332  default:
333  errorQuda("Undefined blas kernel %d\n", kernel);
334  }
335  }
336 
337  cudaEventRecord(end, 0);
338  cudaEventSynchronize(end);
339  float runTime;
340  cudaEventElapsedTime(&runTime, start, end);
341  cudaEventDestroy(start);
342  cudaEventDestroy(end);
343 
344  double secs = runTime / 1000;
345  return secs;
346 }
347 
348 #define ERROR(a) fabs(norm2(*a##D) - norm2(*a##H)) / norm2(*a##H)
349 
350 double test(int kernel) {
351 
352  double a = 1.5, b = 2.5, c = 3.5;
353  quda::Complex a2(a, b), b2(b, -c), c2(a+b, c*a);
354  double error = 0;
355 
356  switch (kernel) {
357 
358  case 0:
359  *hD = *hH;
360  copyCuda(*yD, *hD);
361  yH->copy(*hH);
362  error = ERROR(y);
363  break;
364 
365  case 1:
366  *lD = *lH;
367  copyCuda(*yD, *lD);
368  yH->copy(*lH);
369  error = ERROR(y);
370  break;
371 
372  case 2:
373  *xD = *xH;
374  *yD = *yH;
375  axpbyCuda(a, *xD, b, *yD);
376  axpbyCpu(a, *xH, b, *yH);
377  error = ERROR(y);
378  break;
379 
380  case 3:
381  *xD = *xH;
382  *yD = *yH;
383  xpyCuda(*xD, *yD);
384  xpyCpu(*xH, *yH);
385  error = ERROR(y);
386  break;
387 
388  case 4:
389  *xD = *xH;
390  *yD = *yH;
391  axpyCuda(a, *xD, *yD);
392  axpyCpu(a, *xH, *yH);
393  error = ERROR(y);
394  break;
395 
396  case 5:
397  *xD = *xH;
398  *yD = *yH;
399  xpayCuda(*xD, a, *yD);
400  xpayCpu(*xH, a, *yH);
401  error = ERROR(y);
402  break;
403 
404  case 6:
405  *xD = *xH;
406  *yD = *yH;
407  mxpyCuda(*xD, *yD);
408  mxpyCpu(*xH, *yH);
409  error = ERROR(y);
410  break;
411 
412  case 7:
413  *xD = *xH;
414  axCuda(a, *xD);
415  axCpu(a, *xH);
416  error = ERROR(x);
417  break;
418 
419  case 8:
420  *xD = *xH;
421  *yD = *yH;
422  caxpyCuda(a2, *xD, *yD);
423  caxpyCpu(a2, *xH, *yH);
424  error = ERROR(y);
425  break;
426 
427  case 9:
428  *xD = *xH;
429  *yD = *yH;
430  caxpbyCuda(a2, *xD, b2, *yD);
431  caxpbyCpu(a2, *xH, b2, *yH);
432  error = ERROR(y);
433  break;
434 
435  case 10:
436  *xD = *xH;
437  *yD = *yH;
438  *zD = *zH;
439  cxpaypbzCuda(*xD, a2, *yD, b2, *zD);
440  cxpaypbzCpu(*xH, a2, *yH, b2, *zH);
441  error = ERROR(z);
442  break;
443 
444  case 11:
445  *xD = *xH;
446  *yD = *yH;
447  *zD = *zH;
448  axpyBzpcxCuda(a, *xD, *yD, b, *zD, c);
449  axpyBzpcxCpu(a, *xH, *yH, b, *zH, c);
450  error = ERROR(x) + ERROR(y);
451  break;
452 
453  case 12:
454  *xD = *xH;
455  *yD = *yH;
456  *zD = *zH;
457  axpyZpbxCuda(a, *xD, *yD, *zD, b);
458  axpyZpbxCpu(a, *xH, *yH, *zH, b);
459  error = ERROR(x) + ERROR(y);
460  break;
461 
462  case 13:
463  *xD = *xH;
464  *yD = *yH;
465  *zD = *zH;
466  *wD = *wH;
467  caxpbypzYmbwCuda(a2, *xD, b2, *yD, *zD, *wD);
468  caxpbypzYmbwCpu(a2, *xH, b2, *yH, *zH, *wH);
469  error = ERROR(z) + ERROR(y);
470  break;
471 
472  case 14:
473  *xD = *xH;
474  *yD = *yH;
475  cabxpyAxCuda(a, b2, *xD, *yD);
476  cabxpyAxCpu(a, b2, *xH, *yH);
477  error = ERROR(y) + ERROR(x);
478  break;
479 
480  case 15:
481  *xD = *xH;
482  *yD = *yH;
483  *zD = *zH;
484  {caxpbypzCuda(a2, *xD, b2, *yD, *zD);
485  caxpbypzCpu(a2, *xH, b2, *yH, *zH);
486  error = ERROR(z); }
487  break;
488 
489  case 16:
490  *xD = *xH;
491  *yD = *yH;
492  *zD = *zH;
493  *wD = *wH;
494  {caxpbypczpwCuda(a2, *xD, b2, *yD, c2, *zD, *wD);
495  caxpbypczpwCpu(a2, *xH, b2, *yH, c2, *zH, *wH);
496  error = ERROR(w); }
497  break;
498 
499  case 17:
500  *xD = *xH;
501  *yD = *yH;
502  *zD = *zH;
503  {caxpyXmazCuda(a, *xD, *yD, *zD);
504  caxpyXmazCpu(a, *xH, *yH, *zH);
505  error = ERROR(y) + ERROR(x);}
506  break;
507 
508  // double
509  case 18:
510  *xD = *xH;
511  error = fabs(normCuda(*xD) - normCpu(*xH)) / normCpu(*xH);
512  break;
513 
514  case 19:
515  *xD = *xH;
516  *yD = *yH;
517  error = fabs(reDotProductCuda(*xD, *yD) - reDotProductCpu(*xH, *yH)) / fabs(reDotProductCpu(*xH, *yH));
518  break;
519 
520  case 20:
521  *xD = *xH;
522  *yD = *yH;
523  {double d = axpyNormCuda(a, *xD, *yD);
524  double h = axpyNormCpu(a, *xH, *yH);
525  error = ERROR(y) + fabs(d-h)/fabs(h);}
526  break;
527 
528  case 21:
529  *xD = *xH;
530  *yD = *yH;
531  {double d = xmyNormCuda(*xD, *yD);
532  double h = xmyNormCpu(*xH, *yH);
533  error = ERROR(y) + fabs(d-h)/fabs(h);}
534  break;
535 
536  case 22:
537  *xD = *xH;
538  *yD = *yH;
539  {double d = caxpyNormCuda(a, *xD, *yD);
540  double h = caxpyNormCpu(a, *xH, *yH);
541  error = ERROR(y) + fabs(d-h)/fabs(h);}
542  break;
543 
544  case 23:
545  *xD = *xH;
546  *yD = *yH;
547  *zD = *zH;
548  {double d = caxpyXmazNormXCuda(a, *xD, *yD, *zD);
549  double h = caxpyXmazNormXCpu(a, *xH, *yH, *zH);
550  error = ERROR(y) + ERROR(x) + fabs(d-h)/fabs(h);}
551  break;
552 
553  case 24:
554  *xD = *xH;
555  *yD = *yH;
556  {double d = cabxpyAxNormCuda(a, b2, *xD, *yD);
557  double h = cabxpyAxNormCpu(a, b2, *xH, *yH);
558  error = ERROR(x) + ERROR(y) + fabs(d-h)/fabs(h);}
559  break;
560 
561  // double2
562  case 25:
563  *xD = *xH;
564  *yD = *yH;
565  error = abs(cDotProductCuda(*xD, *yD) - cDotProductCpu(*xH, *yH)) / abs(cDotProductCpu(*xH, *yH));
566  break;
567 
568  case 26:
569  *xD = *xH;
570  *yD = *yH;
571  *zD = *zH;
572  { quda::Complex d = xpaycDotzyCuda(*xD, a, *yD, *zD);
573  quda::Complex h = xpaycDotzyCpu(*xH, a, *yH, *zH);
574  error = fabs(norm2(*yD) - norm2(*yH)) / norm2(*yH) + abs(d-h)/abs(h);
575  }
576  break;
577 
578  case 27:
579  *xD = *xH;
580  *yD = *yH;
581  *zD = *zH;
582  {quda::Complex d = caxpyDotzyCuda(a, *xD, *yD, *zD);
583  quda::Complex h = caxpyDotzyCpu(a, *xH, *yH, *zH);
584  error = ERROR(y) + abs(d-h)/abs(h);}
585  break;
586 
587  // double3
588  case 28:
589  *xD = *xH;
590  *yD = *yH;
591  { double3 d = cDotProductNormACuda(*xD, *yD);
592  double3 h = cDotProductNormACpu(*xH, *yH);
593  error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
594  break;
595 
596  case 29:
597  *xD = *xH;
598  *yD = *yH;
599  { double3 d = cDotProductNormBCuda(*xD, *yD);
600  double3 h = cDotProductNormBCpu(*xH, *yH);
601  error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
602  break;
603 
604  case 30:
605  *xD = *xH;
606  *yD = *yH;
607  *zD = *zH;
608  *wD = *wH;
609  *vD = *vH;
610  { double3 d = caxpbypzYmbwcDotProductUYNormYCuda(a2, *xD, b2, *yD, *zD, *wD, *vD);
611  double3 h = caxpbypzYmbwcDotProductUYNormYCpu(a2, *xH, b2, *yH, *zH, *wH, *vH);
612  error = ERROR(z) + ERROR(y) + fabs(d.x - h.x) / fabs(h.x) +
613  fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
614  break;
615 
616  case 31:
617  *xD = *xH;
618  *yD = *yH;
619  { double3 d = HeavyQuarkResidualNormCuda(*xD, *yD);
620  double3 h = HeavyQuarkResidualNormCpu(*xH, *yH);
621  error = fabs(d.x - h.x) / fabs(h.x) +
622  fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
623  break;
624 
625  default:
626  errorQuda("Undefined blas kernel %d\n", kernel);
627  }
628 
629  return error;
630 }
631 
632 int main(int argc, char** argv)
633 {
634  for (int i = 1; i < argc; i++){
635  if(process_command_line_option(argc, argv, &i) == 0){
636  continue;
637  }
638  printfQuda("ERROR: Invalid option:%s\n", argv[i]);
639  usage(argv);
640  }
641 
642  setSpinorSiteSize(24);
643  initComms(argc, argv, gridsize_from_cmdline);
645  initQuda(device);
646 
647  char *names[] = {
648  "copyHS",
649  "copyLS",
650  "axpby",
651  "xpy",
652  "axpy",
653  "xpay",
654  "mxpy",
655  "ax",
656  "caxpy",
657  "caxpby",
658  "cxpaypbz",
659  "axpyBzpcx",
660  "axpyZpbx",
661  "caxpbypzYmbw",
662  "cabxpyAx",
663  "caxpbypz",
664  "caxpbypczpw",
665  "caxpyXmaz",
666  "norm",
667  "reDotProduct",
668  "axpyNorm",
669  "xmyNorm",
670  "caxpyNorm",
671  "caxpyXmazNormX",
672  "cabxpyAxNorm",
673  "cDotProduct",
674  "xpaycDotzy",
675  "caxpyDotzy",
676  "cDotProductNormA",
677  "cDotProductNormB",
678  "caxpbypzYmbwcDotProductWYNormY",
679  "HeavyQuarkResidualNorm"
680  };
681 
682  char *prec_str[] = {"half", "single", "double"};
683 
684  // Only benchmark double precision if supported
685 #if (__COMPUTE_CAPABILITY__ >= 130)
686  int Nprec = 3;
687 #else
688  int Nprec = 2;
689 #endif
690 
691  // enable the tuning
693 
694  for (int prec = 0; prec < Nprec; prec++) {
695 
696  printfQuda("\nBenchmarking %s precision with %d iterations...\n\n", prec_str[prec], niter);
697  initFields(prec);
698 
699  for (int kernel = 0; kernel < Nkernels; kernel++) {
700  // only benchmark "high precision" copyCuda() if double is supported
701  if ((Nprec < 3) && (kernel == 0)) continue;
702 
703  // do the initial tune
704  benchmark(kernel, 1);
705 
706  // now rerun with more iterations to get accurate speed measurements
707  quda::blas_flops = 0;
708  quda::blas_bytes = 0;
709 
710  double secs = benchmark(kernel, niter);
711 
712  double gflops = (quda::blas_flops*1e-9)/(secs);
713  double gbytes = quda::blas_bytes/(secs*1e9);
714 
715  printfQuda("%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n", names[kernel], gflops, gbytes);
716  }
717  freeFields();
718  }
719 
720  // clear the error state
721  cudaGetLastError();
722 
723  // lastly check for correctness
724  for (int prec = 0; prec < Nprec; prec++) {
725  printfQuda("\nTesting %s precision...\n\n", prec_str[prec]);
726  initFields(prec);
727 
728  for (int kernel = 0; kernel < Nkernels; kernel++) {
729  // only benchmark "high precision" copyCuda() if double is supported
730  if ((Nprec < 3) && (kernel == 0)) continue;
731  double error = test(kernel);
732  printfQuda("%-35s error = %e, \n", names[kernel], error);
733  }
734  freeFields();
735  }
736 
737  endQuda();
738 
739  finalizeComms();
740 }