QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
staggered_dslash_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <gauge_field.h>
19 
20 #include <face_quda.h>
21 
22 #include <assert.h>
23 #include <gtest.h>
24 
25 using namespace quda;
26 
27 #define MAX(a,b) ((a)>(b)?(a):(b))
28 #define staggeredSpinorSiteSize 6
29 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
30 
31 extern void usage(char** argv );
32 
34 
35 extern int test_type;
36 
37 extern bool tune;
38 
41 
44 
47 
49 
50 void *hostGauge[4];
51 void *fatlink[4], *longlink[4];
52 
53 #ifdef MULTI_GPU
54 const void **ghost_fatlink, **ghost_longlink;
55 #endif
56 
57 const int loops = 100;
58 
60 extern QudaDagType dagger;
61 int transfer = 0; // include transfer time in the benchmark?
62 extern int xdim;
63 extern int ydim;
64 extern int zdim;
65 extern int tdim;
66 extern int gridsize_from_cmdline[];
68 extern QudaPrecision prec;
69 
70 extern int device;
71 extern bool verify_results;
72 
73 extern bool kernel_pack_t;
74 
75 int X[4];
76 
78 
79 void init()
80 {
81 
83 
85 
87 
90 
91  gaugeParam.X[0] = X[0] = xdim;
92  gaugeParam.X[1] = X[1] = ydim;
93  gaugeParam.X[2] = X[2] = zdim;
94  gaugeParam.X[3] = X[3] = tdim;
95 
98 
104 
105  gaugeParam.anisotropy = 1.0;
111  gaugeParam.gaugeGiB = 0;
112 
120 
121  // ensure that the default is improved staggered
125 
128 
129  int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
130  tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
131  tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
132 
133 
134  gaugeParam.ga_pad = tmpint;
135  inv_param.sp_pad = tmpint;
136 
138  csParam.nColor=3;
139  csParam.nSpin=1;
140  csParam.nDim=4;
141  for(int d = 0; d < 4; d++) {
142  csParam.x[d] = gaugeParam.X[d];
143  }
144  csParam.precision = inv_param.cpu_prec;
145  csParam.pad = 0;
146  if (test_type < 2) {
149  csParam.x[0] /= 2;
150  } else {
153  }
154 
157  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
158  csParam.create = QUDA_ZERO_FIELD_CREATE;
159 
160  spinor = new cpuColorSpinorField(csParam);
161  spinorOut = new cpuColorSpinorField(csParam);
162  spinorRef = new cpuColorSpinorField(csParam);
163 
165  csParam.x[0] = gaugeParam.X[0];
166 
167  printfQuda("Randomizing fields ...\n");
168 
170 
171  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
172 
173  for (int dir = 0; dir < 4; dir++) {
174  fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
175  longlink[dir] = malloc(V*gaugeSiteSize*gSize);
176  }
177  if (fatlink == NULL || longlink == NULL){
178  errorQuda("ERROR: malloc failed for fatlink/longlink");
179  }
181 
182  if(link_recon == QUDA_RECONSTRUCT_9 || link_recon == QUDA_RECONSTRUCT_13){ // incorporate non-trivial phase into long links
183  const double cos_pi_3 = 0.5; // Cos(pi/3)
184  const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)
185  for(int dir=0; dir<4; ++dir){
186  for(int i=0; i<V; ++i){
187  for(int j=0; j<gaugeSiteSize; j+=2){
189  const double real = ((double*)longlink[dir])[i*gaugeSiteSize + j];
190  const double imag = ((double*)longlink[dir])[i*gaugeSiteSize + j + 1];
191  ((double*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
192  ((double*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
193  }else{
194  const float real = ((float*)longlink[dir])[i*gaugeSiteSize + j];
195  const float imag = ((float*)longlink[dir])[i*gaugeSiteSize + j + 1];
196  ((float*)longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
197  ((float*)longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
198  }
199  }
200  }
201  }
202  }
203 
204 
205 
206 #ifdef MULTI_GPU
209  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
210  cpuFat = new cpuGaugeField(cpuFatParam);
211  ghost_fatlink = cpuFat->Ghost();
212 
214  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
215  cpuLong = new cpuGaugeField(cpuLongParam);
216  ghost_longlink = cpuLong->Ghost();
217 
218  int x_face_size = X[1]*X[2]*X[3]/2;
219  int y_face_size = X[0]*X[2]*X[3]/2;
220  int z_face_size = X[0]*X[1]*X[3]/2;
221  int t_face_size = X[0]*X[1]*X[2]/2;
222  int pad_size =MAX(x_face_size, y_face_size);
223  pad_size = MAX(pad_size, z_face_size);
224  pad_size = MAX(pad_size, t_face_size);
225  gaugeParam.ga_pad = pad_size;
226 #endif
227 
230 
231  printfQuda("Fat links sending...");
233  printfQuda("Fat links sent\n");
234 
236 
237 #ifdef MULTI_GPU
238  gaugeParam.ga_pad = 3*pad_size;
239 #endif
240 
242  printfQuda("Long links sending...");
244  printfQuda("Long links sent...\n");
245 
246  printfQuda("Sending fields to GPU...");
247 
248  if (!transfer) {
249 
251  csParam.pad = inv_param.sp_pad;
252  csParam.precision = inv_param.cuda_prec;
253  if (test_type < 2){
255  csParam.x[0] /=2;
256  }
257 
258  printfQuda("Creating cudaSpinor\n");
259  cudaSpinor = new cudaColorSpinorField(csParam);
260 
261  printfQuda("Creating cudaSpinorOut\n");
262  cudaSpinorOut = new cudaColorSpinorField(csParam);
263 
264  printfQuda("Sending spinor field to GPU\n");
265  *cudaSpinor = *spinor;
266 
267  cudaDeviceSynchronize();
268  checkCudaError();
269 
270  double spinor_norm2 = norm2(*spinor);
271  double cuda_spinor_norm2= norm2(*cudaSpinor);
272  printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2);
273 
274  if(test_type == 2){
275  csParam.x[0] /=2;
276  }
278  tmp = new cudaColorSpinorField(csParam);
279 
280  bool pc = (test_type != 2);
281  DiracParam diracParam;
282  setDiracParam(diracParam, &inv_param, pc);
283 
284  diracParam.tmp1=tmp;
285 
286  dirac = Dirac::create(diracParam);
287 
288  } else {
289  errorQuda("Error not suppported");
290  }
291 
292  return;
293 }
294 
295 void end(void)
296 {
297  for (int dir = 0; dir < 4; dir++) {
298  free(fatlink[dir]);
299  free(longlink[dir]);
300  }
301 
302  if (!transfer){
303  delete dirac;
304  delete cudaSpinor;
305  delete cudaSpinorOut;
306  delete tmp;
307  }
308 
309  delete spinor;
310  delete spinorOut;
311  delete spinorRef;
312 
313  if (cpuFat) delete cpuFat;
314  if (cpuLong) delete cpuLong;
315 
316  endQuda();
317 }
318 
319 double dslashCUDA(int niter) {
320 
321  cudaEvent_t start, end;
322  cudaEventCreate(&start);
323  cudaEventRecord(start, 0);
324  cudaEventSynchronize(start);
325 
326  for (int i = 0; i < niter; i++) {
327  switch (test_type) {
328  case 0:
330  if (transfer){
331  //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
332  } else {
334  }
335  break;
336  case 1:
338  if (transfer){
339  //MatPCQuda(spinorOdd, spinorEven, &inv_param);
340  } else {
342  }
343  break;
344  case 2:
345  errorQuda("Staggered operator acting on full-site not supported");
346  if (transfer){
347  //MatQuda(spinorGPU, spinor, &inv_param);
348  } else {
350  }
351  }
352  }
353 
354  cudaEventCreate(&end);
355  cudaEventRecord(end, 0);
356  cudaEventSynchronize(end);
357  float runTime;
358  cudaEventElapsedTime(&runTime, start, end);
359  cudaEventDestroy(start);
360  cudaEventDestroy(end);
361 
362  double secs = runTime / 1000; //stopwatchReadSeconds();
363 
364  // check for errors
365  cudaError_t stat = cudaGetLastError();
366  if (stat != cudaSuccess)
367  errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
368 
369  return secs;
370 }
371 
373 {
374 #ifndef MULTI_GPU
375  int cpu_parity = 0;
376 #endif
377 
378  // compare to dslash reference implementation
379  printfQuda("Calculating reference implementation...");
380  fflush(stdout);
381  switch (test_type) {
382  case 0:
383 #ifdef MULTI_GPU
384 
385  staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
387 #else
388  cpu_parity = 0; //EVEN
389  staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
391 
392 #endif
393 
394 
395  break;
396  case 1:
397 #ifdef MULTI_GPU
398  staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
400 
401 #else
402  cpu_parity=1; //ODD
403  staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
405 #endif
406  break;
407  case 2:
408  //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger,
409  //inv_param.cpu_prec, gaugeParam.cpu_prec);
410  break;
411  default:
412  errorQuda("Test type not defined");
413  }
414 
415  printfQuda("done.\n");
416 
417 }
418 
419 TEST(dslash, verify) {
420  double deviation = pow(10, -(double)(cpuColorSpinorField::Compare(*spinorRef, *spinorOut)));
421  double tol = (inv_param.cuda_prec == QUDA_DOUBLE_PRECISION ? 1e-12 :
422  (inv_param.cuda_prec == QUDA_SINGLE_PRECISION ? 1e-3 : 1e-1));
423  ASSERT_LE(deviation, tol) << "CPU and CUDA implementations do not agree";
424 }
425 
426 static int dslashTest(int argc, char **argv)
427 {
428  int accuracy_level = 0;
429 
430  init();
431 
432  int attempts = 1;
433 
434  for (int i=0; i<attempts; i++) {
435 
436  if (tune) { // warm-up run
437  printfQuda("Tuning...\n");
439  dslashCUDA(1);
440  }
441  printfQuda("Executing %d kernel loops...", loops);
442  double secs = dslashCUDA(loops);
443 
444  if (!transfer) *spinorOut = *cudaSpinorOut;
445 
446  printfQuda("\n%fms per loop\n", 1000*secs);
448 
449  unsigned long long flops = dirac->Flops();
450  int link_floats = 8*gaugeParam.reconstruct+8*18;
451  int spinor_floats = 8*6*2 + 6;
452  int link_float_size = prec;
453  int spinor_float_size = 0;
454 
455  link_floats = test_type ? (2*link_floats) : link_floats;
456  spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;
457 
458  int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
459  if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4;
460 
461  printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
462  printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));
463 
464  double norm2_cpu = norm2(*spinorRef);
465  double norm2_cpu_cuda= norm2(*spinorOut);
466  if (!transfer) {
467  double norm2_cuda= norm2(*cudaSpinorOut);
468  printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
469  } else {
470  printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
471  }
472 
473  if (verify_results) {
474  ::testing::InitGoogleTest(&argc, argv);
475  if (RUN_ALL_TESTS() != 0) warningQuda("Tests failed");
476  }
477  }
478  end();
479 
480  return accuracy_level;
481 }
482 
483 
485 {
486  printfQuda("running the following test:\n");
487 
488  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
489  printfQuda("%s %s %d %d %d/%d/%d %d \n",
492  printfQuda("Grid partition info: X Y Z T\n");
493  printfQuda(" %d %d %d %d\n",
494  dimPartitioned(0),
495  dimPartitioned(1),
496  dimPartitioned(2),
497  dimPartitioned(3));
498 
499  return ;
500 
501 }
502 
503 
504  void
505 usage_extra(char** argv )
506 {
507  printfQuda("Extra options:\n");
508  printfQuda(" --test <0/1> # Test method\n");
509  printfQuda(" 0: Even destination spinor\n");
510  printfQuda(" 1: Odd destination spinor\n");
511  return ;
512 }
513 
514 int main(int argc, char **argv)
515 {
516 
517  int i;
518  for (i =1;i < argc; i++){
519 
520  if(process_command_line_option(argc, argv, &i) == 0){
521  continue;
522  }
523 
524  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
525  usage(argv);
526  }
527 
528  initComms(argc, argv, gridsize_from_cmdline);
529 
531 
532  int ret =1;
533  int accuracy_level = dslashTest(argc, argv);
534 
535  printfQuda("accuracy_level =%d\n", accuracy_level);
536 
537  if (accuracy_level >= 1) ret = 0; //probably no error, -1 means no matching
538 
539  finalizeComms();
540 
541  return ret;
542 }
543 
int dimPartitioned(int dim)
Definition: test_util.cpp:1577
QudaDiracFieldOrder dirac_order
Definition: quda.h:156
QudaDslashType dslash_type
Definition: test_util.cpp:1560
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
double anisotropy
Definition: quda.h:31
cpuColorSpinorField * spinorOut
__constant__ int Vh
int device
Definition: test_util.cpp:1546
void * hostGauge[4]
cudaColorSpinorField * tmp1
Definition: dirac_quda.h:39
void endQuda(void)
const void ** Ghost() const
Definition: gauge_field.h:209
enum QudaPrecision_s QudaPrecision
int V
Definition: test_util.cpp:29
int attempts
int zdim
Definition: test_util.cpp:1555
int ga_pad
Definition: quda.h:53
#define ASSERT_LE(val1, val2)
Definition: gtest.h:19789
int main(int argc, char **argv)
QudaGaugeFixed gauge_fix
Definition: quda.h:51
QudaLinkType type
Definition: quda.h:35
cpuColorSpinorField * spinorRef
QudaDagType dagger
Definition: test_util.cpp:1558
QudaReconstructType link_recon
Definition: test_util.cpp:1549
#define errorQuda(...)
Definition: util_quda.h:73
QudaDslashType dslash_type
Definition: quda.h:85
QudaGaugeParam gaugeParam
QudaPrecision cuda_prec
Definition: quda.h:152
void setDims(int *)
Definition: test_util.cpp:88
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void staggered_dslash(void *res, void **fatlink, void **longlink, void *spinorField, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision)
QudaPrecision cpu_prec
Definition: quda.h:151
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1635
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
QudaPrecision precision
Definition: lattice_field.h:41
#define gaugeSiteSize
QudaDagType dagger
Definition: quda.h:145
void finalizeComms()
Definition: test_util.cpp:65
void usage(char **argv)
Definition: test_util.cpp:1584
cpuColorSpinorField * spinor
cudaColorSpinorField * cudaSpinor
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:658
QudaSiteSubset siteSubset
Definition: lattice_field.h:42
QudaFieldLocation input_location
Definition: quda.h:82
void * longlink[4]
int test_type
Definition: test_util.cpp:1564
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
Definition: quda.h:142
bool verify_results
Definition: test_util.cpp:1568
QudaPrecision prec
Definition: test_util.cpp:1551
Dirac * dirac
double dslashCUDA(int niter)
virtual void Dslash(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const =0
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:38
void setTuning(QudaTune tune)
Definition: util_quda.cpp:33
void usage_extra(char **argv)
double scale
Definition: quda.h:33
void initQuda(int device)
cudaColorSpinorField * tmp
QudaFieldLocation output_location
Definition: quda.h:83
int xdim
Definition: test_util.cpp:1553
unsigned long long Flops() const
Definition: dirac_quda.h:136
void setSpinorSiteSize(int n)
Definition: test_util.cpp:150
ColorSpinorParam csParam
Definition: pack_test.cpp:24
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:724
#define MAX(a, b)
GTEST_API_ void InitGoogleTest(int *argc, char **argv)
TEST(dslash, verify)
#define warningQuda(...)
Definition: util_quda.h:84
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
QudaParity parity
QudaGammaBasis gamma_basis
Definition: quda.h:158
int niter
Definition: test_util.cpp:1563
cudaColorSpinorField * cudaSpinorOut
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
Definition: quda.h:43
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
int X[4]
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
Definition: test_util.cpp:1018
void init()
cpuGaugeField * cpuLong
int transfer
double tadpole_coeff
Definition: quda.h:32
double gaugeGiB
Definition: quda.h:60
int tdim
Definition: test_util.cpp:1556
int ydim
Definition: test_util.cpp:1554
bool tune
Definition: test_util.cpp:1562
void display_test_info()
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
virtual void M(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0
#define printfQuda(...)
Definition: util_quda.h:67
QudaTboundary t_boundary
Definition: quda.h:38
QudaInvertParam inv_param
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_
Definition: gtest.h:20057
void staggered_dslash_mg4dir(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision)
enum QudaDslashType_s QudaDslashType
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:82
void staggeredDslashRef()
void end(void)
bool kernel_pack_t
Definition: test_util.cpp:1571
#define checkCudaError()
Definition: util_quda.h:110
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:134
double norm2(const ColorSpinorField &)
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:48
int gridsize_from_cmdline[]
Definition: test_util.cpp:1559
cpuGaugeField * cpuFat
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:144
const int loops
QudaPrecision cpu_prec
Definition: quda.h:40
QudaGaugeParam newQudaGaugeParam(void)
void * fatlink[4]