QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <test_util.h>
15 #include <dslash_util.h>
18 #include "misc.h"
19 
20 #include <gauge_qio.h>
21 
22 #define MAX(a,b) ((a)>(b)?(a):(b))
23 
24 using namespace quda;
25 
26 const QudaParity parity = QUDA_EVEN_PARITY; // even or odd?
27 const int transfer = 0; // include transfer time in the benchmark?
28 
29 double kappa5;
30 
33 
36 
39 
41 
43 
44 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat, 3 = MatPCDagMatPC, 4 = MatDagMat)
45 extern int test_type;
46 
47 // Dirac operator type
49 
50 extern bool tune;
51 
52 extern int device;
53 extern int xdim;
54 extern int ydim;
55 extern int zdim;
56 extern int tdim;
57 extern int Lsdim;
58 extern int gridsize_from_cmdline[];
60 extern QudaPrecision prec;
61 extern QudaDagType dagger;
62 
63 extern int niter;
64 extern char latfile[];
65 
66 void init(int argc, char **argv) {
67 
68  cuda_prec = prec;
69 
72 
73  gauge_param.X[0] = xdim;
74  gauge_param.X[1] = ydim;
75  gauge_param.X[2] = zdim;
76  gauge_param.X[3] = tdim;
77 
79  errorQuda("Asqtad not supported. Please try staggered_dslash_test instead");
80  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
82  setKernelPackT(true);
83  } else {
86  {
87  Ls = 2;
88  setKernelPackT(true);
90  //setKernelPackT(false);
91  }
92  else
93  {
94  Ls = 1;
95  setKernelPackT(false);
96  }
97  }
98 
100 
101  gauge_param.anisotropy = 1.0;
102 
106 
113 
114  inv_param.kappa = 0.1;
115 
117  inv_param.mu = 0.01;
118  inv_param.epsilon = 0.01;
121  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
122  inv_param.mass = 0.01;
123  inv_param.m5 = -1.5;
124  kappa5 = 0.5/(5 + inv_param.m5);
125  }
126 
127  inv_param.Ls = Ls;
128 
131 
134  errorQuda("Gauge and spinor CPU precisions must match");
135  }
137 
140 
141 #ifndef MULTI_GPU // free parameter for single GPU
142  gauge_param.ga_pad = 0;
143 #else // must be this one c/b face for multi gpu
144  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
145  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
146  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
147  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
148  int pad_size =MAX(x_face_size, y_face_size);
149  pad_size = MAX(pad_size, z_face_size);
150  pad_size = MAX(pad_size, t_face_size);
151  gauge_param.ga_pad = pad_size;
152 #endif
153  inv_param.sp_pad = 0;
154  inv_param.cl_pad = 0;
155 
156  //inv_param.sp_pad = xdim*ydim*zdim/2;
157  //inv_param.cl_pad = 24*24*24;
158 
159  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
161 
162  switch(test_type) {
163  case 0:
164  case 1:
166  break;
167  case 2:
169  break;
170  case 3:
172  break;
173  case 4:
175  break;
176  default:
177  errorQuda("Test type %d not defined\n", test_type);
178  }
179 
181 
187  //if (test_type > 0) {
189  hostCloverInv = hostClover; // fake it
190  /*} else {
191  hostClover = NULL;
192  hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
193  }*/
194  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
195 
196  }
197 
198  //inv_param.verbosity = QUDA_VERBOSE;
199 
200  // construct input fields
201  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
202 
204 
205  csParam.nColor = 3;
206  csParam.nSpin = 4;
209  }
210  csParam.nDim = 4;
211  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
213  csParam.nDim = 5;
214  csParam.x[4] = Ls;
215  }
216 
217 //ndeg_tm
221  csParam.x[4] = Ls;
222  }
223 
224 
225  csParam.precision = inv_param.cpu_prec;
226  csParam.pad = 0;
227  if (test_type < 2 || test_type ==3) {
229  csParam.x[0] /= 2;
230  } else {
232  }
235  csParam.gammaBasis = inv_param.gamma_basis;
236  csParam.create = QUDA_ZERO_FIELD_CREATE;
237 
238  //csParam.verbose = QUDA_DEBUG_VERBOSE;
239 
240  spinor = new cpuColorSpinorField(csParam);
241  spinorOut = new cpuColorSpinorField(csParam);
242  spinorRef = new cpuColorSpinorField(csParam);
243  spinorTmp = new cpuColorSpinorField(csParam);
244 
246  csParam.x[0] = gauge_param.X[0];
247 
248  printfQuda("Randomizing fields... ");
249 
250  if (strcmp(latfile,"")) { // load in the command line supplied gauge field
253  } else { // else generate a random SU(3) field
255  }
256 
258 
260  double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
261  double diag = 1.0; // constant added to the diagonal
262 
263  if (test_type == 2 || test_type == 4) {
265  } else {
267  }
268  }
269  printfQuda("done.\n"); fflush(stdout);
270 
271  initQuda(device);
272 
273  printfQuda("Sending gauge field to GPU\n");
275 
277  printfQuda("Sending clover field to GPU\n");
279  //clover = cudaCloverPrecise;
280  }
281 
282  if (!transfer) {
284  csParam.pad = inv_param.sp_pad;
285  csParam.precision = inv_param.cuda_prec;
286  if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
288  } else {
289  /* Single and half */
291  }
292 
293  if (test_type < 2 || test_type == 3) {
295  csParam.x[0] /= 2;
296  }
297 
298  printfQuda("Creating cudaSpinor\n");
299  cudaSpinor = new cudaColorSpinorField(csParam);
300  printfQuda("Creating cudaSpinorOut\n");
301  cudaSpinorOut = new cudaColorSpinorField(csParam);
302 
303  tmp1 = new cudaColorSpinorField(csParam);
304 
305  if (test_type == 2 || test_type == 4) csParam.x[0] /= 2;
306 
308  tmp2 = new cudaColorSpinorField(csParam);
309 
310  printfQuda("Sending spinor field to GPU\n");
311  *cudaSpinor = *spinor;
312 
313  double cpu_norm = norm2(*spinor);
314  double cuda_norm = norm2(*cudaSpinor);
315  printfQuda("Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm);
316 
317  bool pc = (test_type != 2 && test_type != 4);
318  DiracParam diracParam;
319  setDiracParam(diracParam, &inv_param, pc);
320  diracParam.verbose = QUDA_VERBOSE;
321  diracParam.tmp1 = tmp1;
322  diracParam.tmp2 = tmp2;
323 
324  dirac = Dirac::create(diracParam);
325  } else {
326  double cpu_norm = norm2(*spinor);
327  printfQuda("Source: CPU = %e\n", cpu_norm);
328  }
329 
330 }
331 
332 void end() {
333  if (!transfer) {
334  delete dirac;
335  delete cudaSpinor;
336  delete cudaSpinorOut;
337  delete tmp1;
338  delete tmp2;
339  }
340 
341  // release memory
342  delete spinor;
343  delete spinorOut;
344  delete spinorRef;
345  delete spinorTmp;
346 
347  for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
350  free(hostCloverInv);
351  }
352  endQuda();
353 
354 }
355 
356 // execute kernel
357 double dslashCUDA(int niter) {
358 
359  cudaEvent_t start, end;
360  cudaEventCreate(&start);
361  cudaEventCreate(&end);
362  cudaEventRecord(start, 0);
363 
364  for (int i = 0; i < niter; i++) {
365  switch (test_type) {
366  case 0:
367  if (transfer) {
369  } else {
371  }
372  break;
373  case 1:
374  case 2:
375  if (transfer) {
376  MatQuda(spinorOut->V(), spinor->V(), &inv_param);
377  } else {
379  }
380  break;
381  case 3:
382  case 4:
383  if (transfer) {
385  } else {
387  }
388  break;
389  }
390  }
391 
392  cudaEventRecord(end, 0);
393  cudaEventSynchronize(end);
394  float runTime;
395  cudaEventElapsedTime(&runTime, start, end);
396  cudaEventDestroy(start);
397  cudaEventDestroy(end);
398 
399  double secs = runTime / 1000; //stopwatchReadSeconds();
400 
401  // check for errors
402  cudaError_t stat = cudaGetLastError();
403  if (stat != cudaSuccess)
404  printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));
405 
406  return secs;
407 }
408 
409 void dslashRef() {
410 
411  // compare to dslash reference implementation
412  printfQuda("Calculating reference implementation...");
413  fflush(stdout);
414 
417  switch (test_type) {
418  case 0:
420  break;
421  case 1:
424  break;
425  case 2:
427  break;
428  case 3:
433  break;
434  case 4:
437  break;
438  default:
439  printfQuda("Test type not defined\n");
440  exit(-1);
441  }
442  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
443  switch (test_type) {
444  case 0:
447  else
448  {
449  int tm_offset = 12*spinorRef->Volume();
450 
451  void *ref1 = spinorRef->V();
452  void *ref2 = cpu_prec == sizeof(double) ? (void*)((double*)ref1 + tm_offset): (void*)((float*)ref1 + tm_offset);
453 
454  void *flv1 = spinor->V();
455  void *flv2 = cpu_prec == sizeof(double) ? (void*)((double*)flv1 + tm_offset): (void*)((float*)flv1 + tm_offset);
456 
457  tm_ndeg_dslash(ref1, ref2, hostGauge, flv1, flv2, inv_param.kappa, inv_param.mu, inv_param.epsilon,
459  }
460  break;
461  case 1:
464  else
465  {
466  int tm_offset = 12*spinorRef->Volume();
467 
468  void *ref1 = spinorRef->V();
469  void *ref2 = cpu_prec == sizeof(double) ? (void*)((double*)ref1 + tm_offset): (void*)((float*)ref1 + tm_offset);
470 
471  void *flv1 = spinor->V();
472  void *flv2 = cpu_prec == sizeof(double) ? (void*)((double*)flv1 + tm_offset): (void*)((float*)flv1 + tm_offset);
473 
475  }
476  break;
477  case 2:
480  else
481  {
482  int tm_offset = 12*spinorRef->Volume();
483 
484  void *evenOut = spinorRef->V();
485  void *oddOut = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset);
486 
487  void *evenIn = spinor->V();
488  void *oddIn = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset);
489 
490  tm_ndeg_mat(evenOut, oddOut, hostGauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, dagger, inv_param.cpu_prec, gauge_param);
491  }
492  break;
493  case 3:
499  }
500  else
501  {
502  }
503  break;
504  case 4:
510  }
511  else
512  {
513  }
514  break;
515  default:
516  printfQuda("Test type not defined\n");
517  exit(-1);
518  }
519  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
520  switch (test_type) {
521  case 0:
523  break;
524  case 1:
526  break;
527  case 2:
529  break;
530  case 3:
533  break;
534  case 4:
536  break;
537  default:
538  printf("Test type not supported for domain wall\n");
539  exit(-1);
540  }
541  } else {
542  printfQuda("Unsupported dslash_type\n");
543  exit(-1);
544  }
545 
546  printfQuda("done.\n");
547 }
548 
549 
551 {
552  printfQuda("running the following test:\n");
553 
554  printfQuda("prec recon test_type dagger S_dim T_dimension Ls_dimension dslash_type niter\n");
555  printfQuda("%s %s %d %d %d/%d/%d %d %d %s %d\n",
559  printfQuda("Grid partition info: X Y Z T\n");
560  printfQuda(" %d %d %d %d\n",
561  dimPartitioned(0),
562  dimPartitioned(1),
563  dimPartitioned(2),
564  dimPartitioned(3));
565 
566  return ;
567 
568 }
569 
570 extern void usage(char**);
571 
572 
573 int main(int argc, char **argv)
574 {
575 
576  for (int i =1;i < argc; i++){
577  if(process_command_line_option(argc, argv, &i) == 0){
578  continue;
579  }
580 
581  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
582  usage(argv);
583  }
584 
585  initComms(argc, argv, gridsize_from_cmdline);
586 
588 
589  init(argc, argv);
590 
591  float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30);
592  printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB);
593  printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
594 
595  int attempts = 1;
596  dslashRef();
597  for (int i=0; i<attempts; i++) {
598 
599  if (tune) { // warm-up run
600  printfQuda("Tuning...\n");
602  dslashCUDA(1);
603  }
604  printfQuda("Executing %d kernel loops...\n", niter);
605  if (!transfer) dirac->Flops();
606  double secs = dslashCUDA(niter);
607  printfQuda("done.\n\n");
608 
609 #ifdef DSLASH_PROFILING
610  printDslashProfile();
611 #endif
612 
613  if (!transfer) *spinorOut = *cudaSpinorOut;
614 
615  // print timing information
616  printfQuda("%fus per kernel call\n", 1e6*secs / niter);
617 
618  unsigned long long flops = 0;
619  if (!transfer) flops = dirac->Flops();
620  int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24;
622  spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short
623  int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;
625  gauge_floats += test_type ? 72*2 : 72;
626  }
627  printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
628  printfQuda("GB/s = %f\n\n",
629  (double)Vh*(Ls*spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/niter)*1e+9));
630 
631  if (!transfer) {
632  double norm2_cpu = norm2(*spinorRef);
633  double norm2_cuda= norm2(*cudaSpinorOut);
634  double norm2_cpu_cuda= norm2(*spinorOut);
635  printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
636  } else {
637  double norm2_cpu = norm2(*spinorRef);
638  double norm2_cpu_cuda= norm2(*spinorOut);
639  printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
640  }
641 
643  }
644  end();
645 
646  finalizeComms();
647 }