QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
staggered_invert_test.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <time.h>
4 #include <math.h>
5 
6 #include <test_util.h>
7 #include <dslash_util.h>
8 #include <blas_reference.h>
10 #include <quda.h>
11 #include <string.h>
12 #include <face_quda.h>
13 #include "misc.h"
14 #include <gauge_field.h>
15 #include <blas_quda.h>
16 
17 #if defined(QMP_COMMS)
18 #include <qmp.h>
19 #elif defined(MPI_COMMS)
20 #include <mpi.h>
21 #endif
22 
23 #ifdef MULTI_GPU
24 #include <face_quda.h>
25 #endif
26 
27 #define MAX(a,b) ((a)>(b)?(a):(b))
28 #define mySpinorSiteSize 6
29 
30 extern void usage(char** argv);
31 void *fatlink[4];
32 void *longlink[4];
33 
34 #ifdef MULTI_GPU
35 void** ghost_fatlink, **ghost_longlink;
36 #endif
37 
38 extern int device;
39 extern bool tune;
40 
42 extern QudaPrecision prec;
44 
51 
54 
55 static double tol = 1e-7;
56 
57 extern int test_type;
58 extern int xdim;
59 extern int ydim;
60 extern int zdim;
61 extern int tdim;
62 extern int gridsize_from_cmdline[];
63 
64 static void end();
65 
66 template<typename Float>
68  for(int i = 0; i < Vh; i++) {
69  for (int s = 0; s < 1; s++) {
70  for (int m = 0; m < 3; m++) {
71  res[i*(1*3*2) + s*(3*2) + m*(2) + 0] = rand() / (Float)RAND_MAX;
72  res[i*(1*3*2) + s*(3*2) + m*(2) + 1] = rand() / (Float)RAND_MAX;
73  }
74  }
75  }
76 }
77 
78 
79 static void
81  int X1, int X2, int X3, int X4,
84  double mass, double tol, int maxiter, double reliable_delta,
85  double tadpole_coeff
86  )
87 {
88  gaugeParam->X[0] = X1;
89  gaugeParam->X[1] = X2;
90  gaugeParam->X[2] = X3;
91  gaugeParam->X[3] = X4;
92 
93  gaugeParam->cpu_prec = cpu_prec;
94  gaugeParam->cuda_prec = prec;
95  gaugeParam->reconstruct = link_recon;
96  gaugeParam->cuda_prec_sloppy = prec_sloppy;
98  gaugeParam->gauge_fix = QUDA_GAUGE_FIXED_NO;
99  gaugeParam->anisotropy = 1.0;
100  gaugeParam->tadpole_coeff = tadpole_coeff;
101  gaugeParam->t_boundary = QUDA_ANTI_PERIODIC_T;
102  gaugeParam->gauge_order = QUDA_QDP_GAUGE_ORDER;
103  gaugeParam->ga_pad = X1*X2*X3/2;
104 
105  inv_param->verbosity = QUDA_VERBOSE;
106  inv_param->mass = mass;
107 
108  // outer solver parameters
109  inv_param->inv_type = QUDA_CG_INVERTER;
110  inv_param->tol = tol;
111  inv_param->maxiter = 500000;
112  inv_param->reliable_delta = 1e-1;
113 
114 #if __COMPUTE_CAPABILITY__ >= 200
115  // require both L2 relative and heavy quark residual to determine convergence
117  inv_param->tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual
118 #else
119  // Pre Fermi architecture only supports L2 relative residual norm
121 #endif
122 
123  //inv_param->inv_type = QUDA_GCR_INVERTER;
124  //inv_param->gcrNkrylov = 10;
125 
126  // domain decomposition preconditioner parameters
127  //inv_param->inv_type_precondition = QUDA_MR_INVERTER;
128  //inv_param->tol_precondition = 1e-1;
129  //inv_param->maxiter_precondition = 100;
130  //inv_param->verbosity_precondition = QUDA_SILENT;
131  //inv_param->prec_precondition = prec_sloppy;
132 
134  inv_param->solve_type = QUDA_NORMOP_PC_SOLVE;
135  inv_param->matpc_type = QUDA_MATPC_EVEN_EVEN;
136  inv_param->dagger = QUDA_DAG_NO;
138 
139  inv_param->cpu_prec = cpu_prec;
140  inv_param->cuda_prec = prec;
141  inv_param->cuda_prec_sloppy = prec_sloppy;
143  inv_param->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // this is meaningless, but must be thus set
144  inv_param->dirac_order = QUDA_DIRAC_ORDER;
145  inv_param->dslash_type = QUDA_ASQTAD_DSLASH;
146  inv_param->tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
147  inv_param->sp_pad = X1*X2*X3/2;
149 
152 }
153 
154 
155 int
157 {
160 
161  double mass = 0.005;
162 
163  set_params(&gaugeParam, &inv_param,
164  xdim, ydim, zdim, tdim,
166  link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
167  0.8);
168 
169  // declare the dimensions of the communication grid
171 
172  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
173  initQuda(device);
174 
175  setDims(gaugeParam.X);
177 
178  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
179  for (int dir = 0; dir < 4; dir++) {
180  fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
181  longlink[dir] = malloc(V*gaugeSiteSize*gSize);
182  }
183 
184  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
185 
186  for (int dir = 0; dir < 4; dir++) {
187  for(int i = 0;i < V*gaugeSiteSize;i++){
188  if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
189  ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
190  }else{
191  ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
192  }
193  }
194  }
195 
197  csParam.nColor=3;
198  csParam.nSpin=1;
199  csParam.nDim=4;
200  for(int d = 0; d < 4; d++) {
201  csParam.x[d] = gaugeParam.X[d];
202  }
203  csParam.x[0] /= 2;
204 
205  csParam.precision = inv_param.cpu_prec;
206  csParam.pad = 0;
210  csParam.gammaBasis = inv_param.gamma_basis;
211  csParam.create = QUDA_ZERO_FIELD_CREATE;
212  in = new cpuColorSpinorField(csParam);
213  out = new cpuColorSpinorField(csParam);
214  ref = new cpuColorSpinorField(csParam);
215  tmp = new cpuColorSpinorField(csParam);
216 
217  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
218  constructSpinorField((float*)in->V());
219  }else{
220  constructSpinorField((double*)in->V());
221  }
222 
223 #ifdef MULTI_GPU
224  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
225  tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
226  tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);
227 
228  int fat_pad = tmp_value;
229  int link_pad = 3*tmp_value;
230 
231  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
232  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
233  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
234  cpuFat = new cpuGaugeField(cpuFatParam);
235  cpuFat->exchangeGhost();
236  ghost_fatlink = (void**)cpuFat->Ghost();
237 
238  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
239  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
240  cpuLong = new cpuGaugeField(cpuLongParam);
241  cpuLong->exchangeGhost();
242  ghost_longlink = (void**)cpuLong->Ghost();
243 
244  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
245  gaugeParam.ga_pad = fat_pad;
246  gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
247  loadGaugeQuda(fatlink, &gaugeParam);
248 
249  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
250  gaugeParam.ga_pad = link_pad;
251  gaugeParam.reconstruct= link_recon;
253  loadGaugeQuda(longlink, &gaugeParam);
254 #else
255  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
256  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
257  loadGaugeQuda(fatlink, &gaugeParam);
258 
259  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
260  gaugeParam.reconstruct = link_recon;
262  loadGaugeQuda(longlink, &gaugeParam);
263 #endif
264 
265  double time0 = -((double)clock()); // Start the timer
266 
267  double nrm2=0;
268  double src2=0;
269  int ret = 0;
270 
271  switch(test_type){
272  case 0: //even
273  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
274 
275  invertQuda(out->V(), in->V(), &inv_param);
276 
277  time0 += clock();
278  time0 /= CLOCKS_PER_SEC;
279 
280 #ifdef MULTI_GPU
281  matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
282  out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
283 #else
284  matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
285 #endif
286 
287  mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
288  nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
289  src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
290 
291  break;
292 
293  case 1: //odd
294 
295  inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
296  invertQuda(out->V(), in->V(), &inv_param);
297  time0 += clock(); // stop the timer
298  time0 /= CLOCKS_PER_SEC;
299 
300 #ifdef MULTI_GPU
301  matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
302  out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
303 #else
304  matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);
305 #endif
306  mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
307  nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
308  src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
309 
310  break;
311 
312  case 2: //full spinor
313 
314  errorQuda("full spinor not supported\n");
315  break;
316 
317  case 3: //multi mass CG, even
318  case 4:
319 
320 #define NUM_OFFSETS 12
321 
322  {
323  double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205};
324  inv_param.num_offset = NUM_OFFSETS;
325  // these can be set independently
326  for (int i=0; i<inv_param.num_offset; i++) {
327  inv_param.tol_offset[i] = inv_param.tol;
328  inv_param.tol_hq_offset[i] = inv_param.tol_hq;
329  }
330  void* outArray[NUM_OFFSETS];
331  int len;
332 
333  cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
334  spinorOutArray[0] = out;
335  for(int i=1;i < inv_param.num_offset; i++){
336  spinorOutArray[i] = new cpuColorSpinorField(csParam);
337  }
338 
339  for(int i=0;i < inv_param.num_offset; i++){
340  outArray[i] = spinorOutArray[i]->V();
341  inv_param.offset[i] = 4*masses[i]*masses[i];
342  }
343 
344  len=Vh;
345 
346  if (test_type == 3) {
347  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
348  } else {
349  inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
350  }
351 
352  invertMultiShiftQuda(outArray, in->V(), &inv_param);
353 
354  cudaDeviceSynchronize();
355  time0 += clock(); // stop the timer
356  time0 /= CLOCKS_PER_SEC;
357 
358  printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n",
359  time0, inv_param.secs, inv_param.iter, inv_param.secs,
360  inv_param.gflops/inv_param.secs);
361 
362 
363  printfQuda("checking the solution\n");
365  if (inv_param.solve_type == QUDA_NORMOP_SOLVE){
366  //parity = QUDA_EVENODD_PARITY;
367  errorQuda("full parity not supported\n");
368  }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
369  parity = QUDA_EVEN_PARITY;
370  }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
371  parity = QUDA_ODD_PARITY;
372  }else{
373  errorQuda("ERROR: invalid spinor parity \n");
374  exit(1);
375  }
376  for(int i=0;i < inv_param.num_offset;i++){
377  printfQuda("%dth solution: mass=%f, ", i, masses[i]);
378 #ifdef MULTI_GPU
379  matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
380  spinorOutArray[i], masses[i], 0, inv_param.cpu_prec,
381  gaugeParam.cpu_prec, tmp, parity);
382 #else
383  matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
384 #endif
385  mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
386  double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
387  double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
388  double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z);
389  double l2r = sqrt(nrm2/src2);
390 
391  printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
392  i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
393  inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);
394 
395  //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge
396  if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){
397  ret |=1;
398  }
399  }
400 
401  for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i];
402  }
403  break;
404 
405  default:
406  errorQuda("Unsupported test type");
407 
408  }//switch
409 
410  if (test_type <=2){
411 
412  double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z);
413  double l2r = sqrt(nrm2/src2);
414 
415  printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
416  inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
417 
418  printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n",
419  time0, inv_param.secs, inv_param.iter, inv_param.secs,
420  inv_param.gflops/inv_param.secs);
421  }
422 
423  end();
424  return ret;
425 }
426 
427 
428 
429 static void
430 end(void)
431 {
432  for(int i=0;i < 4;i++){
433  free(fatlink[i]);
434  free(longlink[i]);
435  }
436 
437  delete in;
438  delete out;
439  delete ref;
440  delete tmp;
441 
442  if (cpuFat) delete cpuFat;
443  if (cpuLong) delete cpuLong;
444 
445  endQuda();
446 }
447 
448 
449 void
451 {
452  printfQuda("running the following test:\n");
453 
454  printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n");
455  printfQuda("%s %s %s %s %s %d/%d/%d %d \n",
459 
460  printfQuda("Grid partition info: X Y Z T\n");
461  printfQuda(" %d %d %d %d\n",
462  dimPartitioned(0),
463  dimPartitioned(1),
464  dimPartitioned(2),
465  dimPartitioned(3));
466 
467  return ;
468 
469 }
470 
471 void
472 usage_extra(char** argv )
473 {
474  printfQuda("Extra options:\n");
475  printfQuda(" --tol <resid_tol> # Set residual tolerance\n");
476  printfQuda(" --test <0/1> # Test method\n");
477  printfQuda(" 0: Even even spinor CG inverter\n");
478  printfQuda(" 1: Odd odd spinor CG inverter\n");
479  printfQuda(" 3: Even even spinor multishift CG inverter\n");
480  printfQuda(" 4: Odd odd spinor multishift CG inverter\n");
481  printfQuda(" --cpu_prec <double/single/half> # Set CPU precision\n");
482 
483  return ;
484 }
485 int main(int argc, char** argv)
486 {
487  for (int i = 1; i < argc; i++) {
488 
489  if(process_command_line_option(argc, argv, &i) == 0){
490  continue;
491  }
492 
493  if( strcmp(argv[i], "--tol") == 0){
494  float tmpf;
495  if (i+1 >= argc){
496  usage(argv);
497  }
498  sscanf(argv[i+1], "%f", &tmpf);
499  if (tmpf <= 0){
500  printf("ERROR: invalid tol(%f)\n", tmpf);
501  usage(argv);
502  }
503  tol = tmpf;
504  i++;
505  continue;
506  }
507 
508  if( strcmp(argv[i], "--cpu_prec") == 0){
509  if (i+1 >= argc){
510  usage(argv);
511  }
512  cpu_prec= get_prec(argv[i+1]);
513  i++;
514  continue;
515  }
516 
517  printf("ERROR: Invalid option:%s\n", argv[i]);
518  usage(argv);
519  }
520 
522  prec_sloppy = prec;
523  }
526  }
527 
528  // initialize QMP or MPI
529 #if defined(QMP_COMMS)
530  QMP_thread_level_t tl;
531  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
532 #elif defined(MPI_COMMS)
533  MPI_Init(&argc, &argv);
534 #endif
535 
536  // call srand() with a rank-dependent seed
537  initRand();
538 
540 
541  int ret = invert_test();
542 
543  // finalize the communications layer
544 #if defined(QMP_COMMS)
545  QMP_finalize_msg_passing();
546 #elif defined(MPI_COMMS)
547  MPI_Finalize();
548 #endif
549 
550  return ret;
551 }