QUDA  0.9.0
staggered_dslash_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <gauge_field.h>
19 
20 #include <assert.h>
21 #include <gtest.h>
22 
23 using namespace quda;
24 
25 #define MAX(a,b) ((a)>(b)?(a):(b))
26 #define staggeredSpinorSiteSize 6
27 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
28 
29 extern void usage(char** argv );
30 
32 
33 extern int test_type;
34 
37 
40 
43 
45 
46 void *hostGauge[4];
47 void *fatlink[4], *longlink[4];
48 
49 #ifdef MULTI_GPU
50 void **ghost_fatlink, **ghost_longlink;
51 #endif
52 
54 extern QudaDagType dagger;
55 int transfer = 0; // include transfer time in the benchmark?
56 extern int xdim;
57 extern int ydim;
58 extern int zdim;
59 extern int tdim;
60 extern int gridsize_from_cmdline[];
62 extern QudaPrecision prec;
63 
64 extern int device;
65 extern bool verify_results;
66 extern int niter;
67 
68 extern bool kernel_pack_t;
69 
70 extern double mass; // the mass of the Dirac operator
71 
72 int X[4];
73 extern int Nsrc; // number of spinors to apply to simultaneously
74 
76 
77 void init()
78 {
79 
81 
83 
85 
88 
89  gaugeParam.X[0] = X[0] = xdim;
90  gaugeParam.X[1] = X[1] = ydim;
91  gaugeParam.X[2] = X[2] = zdim;
92  gaugeParam.X[3] = X[3] = tdim;
93 
95  dw_setDims(gaugeParam.X,Nsrc); // so we can use 5-d indexing from dwf
97 
103 
104  // ensure that the default is improved staggered
108 
109  gaugeParam.anisotropy = 1.0;
115  gaugeParam.gaugeGiB = 0;
116 
124  inv_param.mass = mass;
125 
126  // ensure that the default is improved staggered
130 
133 
134  int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
135  tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
136  tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
137 
138 
139  gaugeParam.ga_pad = tmpint;
140  inv_param.sp_pad = tmpint;
141 
143  csParam.nColor=3;
144  csParam.nSpin=1;
145  csParam.nDim=5;
146  for(int d = 0; d < 4; d++) {
147  csParam.x[d] = gaugeParam.X[d];
148  }
149  csParam.x[4] = Nsrc; // number of sources becomes the fifth dimension
150 
152  csParam.pad = 0;
153  if (test_type < 2) {
156  csParam.x[0] /= 2;
157  } else {
160  }
161 
164  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
166 
171 
173  csParam.x[0] = gaugeParam.X[0];
174 
175  printfQuda("Randomizing fields ...\n");
176 
178 
179  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
180 
181  for (int dir = 0; dir < 4; dir++) {
184 
185  if (fatlink[dir] == NULL || longlink[dir] == NULL){
186  errorQuda("ERROR: malloc failed for fatlink/longlink");
187  }
188  }
189 
191 
192 #ifdef MULTI_GPU
195  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
197  cpuFat = new cpuGaugeField(cpuFatParam);
198  ghost_fatlink = cpuFat->Ghost();
199 
201  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
202  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
203  cpuLong = new cpuGaugeField(cpuLongParam);
204  ghost_longlink = cpuLong->Ghost();
205 
206  int x_face_size = X[1]*X[2]*X[3]/2;
207  int y_face_size = X[0]*X[2]*X[3]/2;
208  int z_face_size = X[0]*X[1]*X[3]/2;
209  int t_face_size = X[0]*X[1]*X[2]/2;
210  int pad_size = MAX(x_face_size, y_face_size);
211  pad_size = MAX(pad_size, z_face_size);
212  pad_size = MAX(pad_size, t_face_size);
213  gaugeParam.ga_pad = pad_size;
214 #endif
215 
219  } else {
221  }
222 
223  printfQuda("Fat links sending...");
225  printfQuda("Fat links sent\n");
226 
228 
229 #ifdef MULTI_GPU
230  gaugeParam.ga_pad = 3*pad_size;
231 #endif
232 
235  printfQuda("Long links sending...");
237  printfQuda("Long links sent...\n");
238  }
239 
240  printfQuda("Sending fields to GPU...");
241 
242  if (!transfer) {
243 
247  if (test_type < 2){
249  csParam.x[0] /=2;
250  }
251 
252  printfQuda("Creating cudaSpinor\n");
254 
255  printfQuda("Creating cudaSpinorOut\n");
257 
258  printfQuda("Sending spinor field to GPU\n");
259  *cudaSpinor = *spinor;
260 
261  cudaDeviceSynchronize();
262  checkCudaError();
263 
264  double spinor_norm2 = blas::norm2(*spinor);
265  double cuda_spinor_norm2= blas::norm2(*cudaSpinor);
266  printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2);
267 
268  if(test_type == 2) csParam.x[0] /=2;
269 
272 
273  bool pc = (test_type != 2);
274  DiracParam diracParam;
275  setDiracParam(diracParam, &inv_param, pc);
276 
277  diracParam.tmp1=tmp;
278 
279  dirac = Dirac::create(diracParam);
280 
281  } else {
282  errorQuda("Error not suppported");
283  }
284 
285  return;
286 }
287 
288 void end(void)
289 {
290  for (int dir = 0; dir < 4; dir++) {
291  free(fatlink[dir]);
292  free(longlink[dir]);
293  }
294 
295  if (!transfer){
296  delete dirac;
297  delete cudaSpinor;
298  delete cudaSpinorOut;
299  delete tmp;
300  }
301 
302  delete spinor;
303  delete spinorOut;
304  delete spinorRef;
305  delete tmpCpu;
306 
307  if (cpuFat) delete cpuFat;
308  if (cpuLong) delete cpuLong;
309 
310  endQuda();
311 }
312 
313 struct DslashTime {
314  double event_time;
315  double cpu_time;
316  double cpu_min;
317  double cpu_max;
318 
319  DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
320 };
321 
323 
324  DslashTime dslash_time;
325  timeval tstart, tstop;
326 
327  cudaEvent_t start, end;
328  cudaEventCreate(&start);
329  cudaEventRecord(start, 0);
330  cudaEventSynchronize(start);
331 
332  comm_barrier();
333  cudaEventRecord(start, 0);
334 
335  for (int i = 0; i < niter; i++) {
336 
337  gettimeofday(&tstart, NULL);
338 
339  switch (test_type) {
340  case 0:
341  if (transfer){
342  //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
343  } else {
345  }
346  break;
347  case 1:
348  if (transfer){
349  //MatPCDagMatPcQuda(spinorOdd, spinorEven, &inv_param);
350  } else {
352  }
353  break;
354  case 2:
355  errorQuda("Staggered operator acting on full-site not supported");
356  if (transfer){
357  //MatQuda(spinorGPU, spinor, &inv_param);
358  } else {
360  }
361  }
362 
363  gettimeofday(&tstop, NULL);
364  long ds = tstop.tv_sec - tstart.tv_sec;
365  long dus = tstop.tv_usec - tstart.tv_usec;
366  double elapsed = ds + 0.000001*dus;
367 
368  dslash_time.cpu_time += elapsed;
369  // skip first and last iterations since they may skew these metrics if comms are not synchronous
370  if (i>0 && i<niter) {
371  if (elapsed < dslash_time.cpu_min) dslash_time.cpu_min = elapsed;
372  if (elapsed > dslash_time.cpu_max) dslash_time.cpu_max = elapsed;
373  }
374  }
375 
376  cudaEventCreate(&end);
377  cudaEventRecord(end, 0);
378  cudaEventSynchronize(end);
379  float runTime;
380  cudaEventElapsedTime(&runTime, start, end);
381  cudaEventDestroy(start);
382  cudaEventDestroy(end);
383 
384  dslash_time.event_time = runTime / 1000;
385 
386  // check for errors
387  cudaError_t stat = cudaGetLastError();
388  if (stat != cudaSuccess)
389  errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
390 
391  return dslash_time;
392 }
393 
395 {
396 
397  // compare to dslash reference implementation
398  printfQuda("Calculating reference implementation...");
399  fflush(stdout);
400  switch (test_type) {
401  case 0:
402 #ifdef MULTI_GPU
403  staggered_dslash_mg4dir(spinorRef, fatlink, longlink, ghost_fatlink, ghost_longlink,
405 #else
407 #endif
408  break;
409  case 1:
410 #ifdef MULTI_GPU
411  matdagmat_mg4dir(spinorRef, fatlink, longlink, ghost_fatlink, ghost_longlink,
413 #else
415 #endif
416  break;
417  case 2:
418  //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger,
419  //inv_param.cpu_prec, gaugeParam.cpu_prec);
420  break;
421  default:
422  errorQuda("Test type not defined");
423  }
424 
425  printfQuda("done.\n");
426 
427 }
428 
429 TEST(dslash, verify) {
430  double deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
431  double tol = (inv_param.cuda_prec == QUDA_DOUBLE_PRECISION ? 1e-12 :
432  (inv_param.cuda_prec == QUDA_SINGLE_PRECISION ? 1e-3 : 1e-1));
433  ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
434 }
435 
436 static int dslashTest()
437 {
438  // return code for google test
439  int test_rc = 0;
440  init();
441 
442  int attempts = 1;
443 
444  for (int i=0; i<attempts; i++) {
445 
446  { // warm-up run
447  printfQuda("Tuning...\n");
448  dslashCUDA(1);
449  }
450  printfQuda("Executing %d kernel loops...", niter);
451 
452  // reset flop counter
453  dirac->Flops();
454 
455  DslashTime dslash_time = dslashCUDA(niter);
456 
457  if (!transfer) *spinorOut = *cudaSpinorOut;
458 
459  printfQuda("%fus per kernel call\n", 1e6*dslash_time.event_time / niter);
461 
462  unsigned long long flops = dirac->Flops();
463  printfQuda("GFLOPS = %f\n", 1.0e-9*flops/dslash_time.event_time);
464 
465  printfQuda("Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate message size %lu bytes\n",
466  1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.event_time, 1.0e-9*2*cudaSpinor->GhostBytes()*niter/dslash_time.cpu_time,
467  1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_max, 1.0e-9*2*cudaSpinor->GhostBytes()/dslash_time.cpu_min,
468  2*cudaSpinor->GhostBytes());
469 
470  double spinor_ref_norm2 = blas::norm2(*spinorRef);
471  double spinor_out_norm2 = blas::norm2(*spinorOut);
472 
473  if (!transfer) {
474  double cuda_spinor_out_norm2 = blas::norm2(*cudaSpinorOut);
475  printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2,
476  spinor_out_norm2);
477  } else {
478  printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2);
479  }
480 
481  if (verify_results) {
482  test_rc = RUN_ALL_TESTS();
483  if (test_rc != 0) warningQuda("Tests failed");
484  }
485  }
486  end();
487 
488  return test_rc;
489 }
490 
491 
493 {
494  printfQuda("running the following test:\n");
495 
496  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
497  printfQuda("%s %s %d %d %d/%d/%d %d \n",
500  printfQuda("Grid partition info: X Y Z T\n");
501  printfQuda(" %d %d %d %d\n",
502  dimPartitioned(0),
503  dimPartitioned(1),
504  dimPartitioned(2),
505  dimPartitioned(3));
506 
507  return ;
508 
509 }
510 
511 
512  void
513 usage_extra(char** argv )
514 {
515  printfQuda("Extra options:\n");
516  printfQuda(" --test <0/1> # Test method\n");
517  printfQuda(" 0: Even destination spinor\n");
518  printfQuda(" 1: Odd destination spinor\n");
519  return ;
520 }
521 
522 int main(int argc, char **argv)
523 {
524  // initalize google test
525  ::testing::InitGoogleTest(&argc, argv);
526  for (int i=1 ;i < argc; i++){
527 
528  if(process_command_line_option(argc, argv, &i) == 0){
529  continue;
530  }
531 
532  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
533  usage(argv);
534  }
535 
536  initComms(argc, argv, gridsize_from_cmdline);
537 
539 
540  // return result of RUN_ALL_TESTS
541  int test_rc = dslashTest();
542 
543  finalizeComms();
544 
545  return test_rc;
546 }
547 
int dimPartitioned(int dim)
Definition: test_util.cpp:1686
QudaDiracFieldOrder dirac_order
Definition: quda.h:195
QudaDslashType dslash_type
Definition: test_util.cpp:1626
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
double anisotropy
Definition: quda.h:31
cpuColorSpinorField * spinorOut
void * hostGauge[4]
QudaGhostExchange ghostExchange
Definition: lattice_field.h:60
static int dslashTest()
int Nsrc
Definition: test_util.cpp:1628
void endQuda(void)
void free(void *)
enum QudaPrecision_s QudaPrecision
int zdim
Definition: test_util.cpp:1622
int ga_pad
Definition: quda.h:53
void dw_setDims(int *X, const int L5)
Definition: test_util.cpp:167
int main(int argc, char **argv)
DslashTime dslashCUDA(int niter)
QudaGaugeFixed gauge_fix
Definition: quda.h:51
__darwin_time_t tv_sec
QudaLinkType type
Definition: quda.h:35
int fflush(FILE *)
cpuColorSpinorField * spinorRef
QudaDagType dagger
QudaReconstructType link_recon
Definition: test_util.cpp:1612
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
QudaDslashType dslash_type
Definition: quda.h:93
cudaEvent_t start
QudaGaugeParam gaugeParam
QudaPrecision cuda_prec
Definition: quda.h:191
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
double cpu_min
void staggered_dslash(void *res, void **fatlink, void **longlink, void *spinorField, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaPrecision cpu_prec
Definition: quda.h:190
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1795
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
int niter
Definition: test_util.cpp:1630
QudaPrecision precision
Definition: lattice_field.h:54
QudaDagType dagger
Definition: quda.h:184
void matdagmat_mg4dir(cpuColorSpinorField *out, void **link, void **ghostLink, cpuColorSpinorField *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, cpuColorSpinorField *tmp, QudaParity parity)
void finalizeComms()
Definition: test_util.cpp:107
void usage(char **argv)
Definition: test_util.cpp:1693
cpuColorSpinorField * spinor
cudaColorSpinorField * cudaSpinor
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:704
unsigned long long Flops() const
Definition: dirac_quda.h:148
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaSiteSubset siteSubset
Definition: lattice_field.h:55
void setDims(int *)
Definition: test_util.cpp:130
QudaFieldLocation input_location
Definition: quda.h:90
void * longlink[4]
__darwin_suseconds_t tv_usec
static size_t gSize
Definition: llfat_test.cpp:36
int test_type
Definition: test_util.cpp:1634
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
Definition: quda.h:181
else return(__swbuf(_c, _p))
bool verify_results
Definition: test_util.cpp:1641
QudaPrecision prec
Definition: test_util.cpp:1615
Dirac * dirac
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:50
cpuColorSpinorField * tmpCpu
void usage_extra(char **argv)
double scale
Definition: quda.h:33
void initQuda(int device)
cudaColorSpinorField * tmp
double tol
Definition: test_util.cpp:1647
QudaFieldLocation output_location
Definition: quda.h:91
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
int xdim
Definition: test_util.cpp:1620
void setSpinorSiteSize(int n)
Definition: test_util.cpp:192
ColorSpinorParam csParam
Definition: pack_test.cpp:24
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:770
#define MAX(a, b)
double event_time
int V
Definition: test_util.cpp:28
#define gaugeSiteSize
Definition: test_util.h:6
double cpu_time
TEST(dslash, verify)
#define warningQuda(...)
Definition: util_quda.h:101
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaParity parity
QudaGammaBasis gamma_basis
Definition: quda.h:197
cudaColorSpinorField * cudaSpinorOut
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
const void ** Ghost() const
Definition: gauge_field.h:254
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
Definition: quda.h:43
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
double mass
Definition: quda.h:96
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
int X[4]
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1069
void init()
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
cpuGaugeField * cpuLong
int transfer
double tadpole_coeff
Definition: quda.h:32
double gaugeGiB
Definition: quda.h:60
int tdim
Definition: test_util.cpp:1623
int ydim
Definition: test_util.cpp:1621
void display_test_info()
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
#define printfQuda(...)
Definition: util_quda.h:84
QudaTboundary t_boundary
Definition: quda.h:38
unsigned long long flops
Definition: blas_quda.cu:42
double cpu_max
QudaInvertParam inv_param
void staggered_dslash_mg4dir(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision)
enum QudaDslashType_s QudaDslashType
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
void staggeredDslashRef()
void end(void)
bool kernel_pack_t
Definition: test_util.cpp:1650
#define checkCudaError()
Definition: util_quda.h:129
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:142
static __inline__ size_t size_t d
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:72
int gridsize_from_cmdline[]
Definition: test_util.cpp:50
cpuGaugeField * cpuFat
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:183
ColorSpinorField * tmp1
Definition: dirac_quda.h:40
QudaPrecision cpu_prec
Definition: quda.h:40
QudaGaugeParam newQudaGaugeParam(void)
void comm_barrier(void)
Definition: comm_mpi.cpp:328
void * fatlink[4]
double mass
Definition: test_util.cpp:1642