QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
invert_test.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <time.h>
4 #include <math.h>
5 #include <string.h>
6 
7 #include <util_quda.h>
8 #include <test_util.h>
9 #include <dslash_util.h>
10 #include <blas_reference.h>
13 #include "misc.h"
14 
15 #include "face_quda.h"
16 
17 #if defined(QMP_COMMS)
18 #include <qmp.h>
19 #elif defined(MPI_COMMS)
20 #include <mpi.h>
21 #endif
22 
23 #include <gauge_qio.h>
24 
25 #define MAX(a,b) ((a)>(b)?(a):(b))
26 
27 // In a typical application, quda.h is the only QUDA header required.
28 #include <quda.h>
29 
30 // Wilson, clover-improved Wilson, twisted mass, and domain wall are supported.
32 extern bool tune;
33 extern int device;
34 extern int xdim;
35 extern int ydim;
36 extern int zdim;
37 extern int tdim;
38 extern int Lsdim;
39 extern int gridsize_from_cmdline[];
41 extern QudaPrecision prec;
44 
45 extern char latfile[];
46 
47 extern void usage(char** );
48 
49 void
51 {
52  printfQuda("running the following test:\n");
53 
54  printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension Ls_dimension\n");
55  printfQuda("%s %s %s %s %d/%d/%d %d %d\n",
59 
60  printfQuda("Grid partition info: X Y Z T\n");
61  printfQuda(" %d %d %d %d\n",
62  dimPartitioned(0),
63  dimPartitioned(1),
64  dimPartitioned(2),
65  dimPartitioned(3));
66 
67  return ;
68 
69 }
70 
71 int main(int argc, char **argv)
72 {
73 
74  for (int i = 1; i < argc; i++){
75  if(process_command_line_option(argc, argv, &i) == 0){
76  continue;
77  }
78  printfQuda("ERROR: Invalid option:%s\n", argv[i]);
79  usage(argv);
80  }
81 
83  prec_sloppy = prec;
84  }
87  }
88 
89  // initialize QMP or MPI
90 #if defined(QMP_COMMS)
91  QMP_thread_level_t tl;
92  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
93 #elif defined(MPI_COMMS)
94  MPI_Init(&argc, &argv);
95 #endif
96 
97  // call srand() with a rank-dependent seed
98  initRand();
99 
101 
102  // *** QUDA parameters begin here.
103 
104  int multi_shift = 0; // whether to test multi-shift or standard solver
105 
110  printfQuda("dslash_type %d not supported\n", dslash_type);
111  exit(0);
112  }
113 
116  QudaPrecision cuda_prec_sloppy = prec_sloppy;
117  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
118 
121 
122  double kappa5;
123 
124  gauge_param.X[0] = xdim;
125  gauge_param.X[1] = ydim;
126  gauge_param.X[2] = zdim;
127  gauge_param.X[3] = tdim;
128  inv_param.Ls = 1;
129 
130  gauge_param.anisotropy = 1.0;
131  gauge_param.type = QUDA_WILSON_LINKS;
132  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
133  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
134 
135  gauge_param.cpu_prec = cpu_prec;
136  gauge_param.cuda_prec = cuda_prec;
137  gauge_param.reconstruct = link_recon;
138  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
140  gauge_param.cuda_prec_precondition = cuda_prec_precondition;
142  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
143 
144  inv_param.dslash_type = dslash_type;
145 
146  double mass = -0.4125;
147  inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass));
148 
150  inv_param.mu = 0.12;
151  inv_param.epsilon = 0.1385;
153  //inv_param.twist_flavor = QUDA_TWIST_MINUS;
154  inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1;
155  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
156  inv_param.mass = 0.02;
157  inv_param.m5 = -1.8;
158  kappa5 = 0.5/(5 + inv_param.m5);
159  inv_param.Ls = Lsdim;
160  }
161 
162  // offsets used only by multi-shift solver
163  inv_param.num_offset = 4;
164  double offset[4] = {0.01, 0.02, 0.03, 0.04};
165  for (int i=0; i<inv_param.num_offset; i++) inv_param.offset[i] = offset[i];
166 
168  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
169  inv_param.dagger = QUDA_DAG_NO;
171 
173  inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
174  inv_param.inv_type = QUDA_CG_INVERTER;
175  } else {
176  inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
177  inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
178  }
179 
180  inv_param.gcrNkrylov = 10;
181  inv_param.tol = 1e-7;
182 #if __COMPUTE_CAPABILITY__ >= 200
183  // require both L2 relative and heavy quark residual to determine convergence
185  inv_param.tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual
186 #else
187  // Pre Fermi architecture only supports L2 relative residual norm
189 #endif
190  // these can be set individually
191  for (int i=0; i<inv_param.num_offset; i++) {
192  inv_param.tol_offset[i] = inv_param.tol;
193  inv_param.tol_hq_offset[i] = inv_param.tol_hq;
194  }
195  inv_param.maxiter = 25000;
196  inv_param.reliable_delta = 1e-2; // ignored by multi-shift solver
197 
198  // domain decomposition preconditioner parameters
201  inv_param.precondition_cycle = 1;
202  inv_param.tol_precondition = 1e-1;
203  inv_param.maxiter_precondition = 10;
205  inv_param.cuda_prec_precondition = cuda_prec_precondition;
206  inv_param.omega = 1.0;
207 
208  inv_param.cpu_prec = cpu_prec;
209  inv_param.cuda_prec = cuda_prec;
210  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
213  inv_param.dirac_order = QUDA_DIRAC_ORDER;
214 
217 
218  inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
219 
220  gauge_param.ga_pad = 0; // 24*24*24/2;
221  inv_param.sp_pad = 0; // 24*24*24/2;
222  inv_param.cl_pad = 0; // 24*24*24/2;
223 
224  // For multi-GPU, ga_pad must be large enough to store a time-slice
225 #ifdef MULTI_GPU
226  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
227  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
228  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
229  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
230  int pad_size =MAX(x_face_size, y_face_size);
231  pad_size = MAX(pad_size, z_face_size);
232  pad_size = MAX(pad_size, t_face_size);
233  gauge_param.ga_pad = pad_size;
234 #endif
235 
237  inv_param.clover_cpu_prec = cpu_prec;
238  inv_param.clover_cuda_prec = cuda_prec;
239  inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
240  inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
242  }
243 
244  inv_param.verbosity = QUDA_VERBOSE;
245 
246  // declare the dimensions of the communication grid
248 
249 
250  // *** Everything between here and the call to initQuda() is
251  // *** application-specific.
252 
253  // set parameters for the reference Dslash, and prepare fields to be loaded
255  dw_setDims(gauge_param.X, inv_param.Ls);
256  } else {
257  setDims(gauge_param.X);
258  }
259 
260  setSpinorSiteSize(24);
261 
262  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
263  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
264 
265  void *gauge[4], *clover_inv=0, *clover=0;
266 
267  for (int dir = 0; dir < 4; dir++) {
268  gauge[dir] = malloc(V*gaugeSiteSize*gSize);
269  }
270 
271  if (strcmp(latfile,"")) { // load in the command line supplied gauge field
272  read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
273  construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param);
274  } else { // else generate a random SU(3) field
275  construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);
276  }
277 
279  double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
280  double diag = 1.0; // constant added to the diagonal
281 
282  size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
283  clover_inv = malloc(V*cloverSiteSize*cSize);
284  construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);
285 
286  // The uninverted clover term is only needed when solving the unpreconditioned
287  // system or when using "asymmetric" even/odd preconditioning.
288  int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
289  inv_param.solve_type == QUDA_NORMOP_PC_SOLVE);
290  int asymmetric = preconditioned &&
293  if (!preconditioned) {
294  clover = clover_inv;
295  clover_inv = NULL;
296  } else if (asymmetric) { // fake it by using the same random matrix
297  clover = clover_inv; // for both clover and clover_inv
298  } else {
299  clover = NULL;
300  }
301  }
302 
303  void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
304  void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
305 
306  void *spinorOut = NULL, **spinorOutMulti = NULL;
307  if (multi_shift) {
308  spinorOutMulti = (void**)malloc(inv_param.num_offset*sizeof(void *));
309  for (int i=0; i<inv_param.num_offset; i++) {
310  spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
311  }
312  } else {
313  spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
314  }
315 
316  // create a point source at 0 (in each subvolume... FIXME)
317 
318  // create a point source at 0 (in each subvolume... FIXME)
319 
320  memset(spinorIn, 0, Ls*V*spinorSiteSize*sSize);
321  memset(spinorCheck, 0, Ls*V*spinorSiteSize*sSize);
322  if (multi_shift) {
323  for (int i=0; i<inv_param.num_offset; i++) memset(spinorOutMulti[i], 0, Ls*V*spinorSiteSize*sSize);
324  } else {
325  memset(spinorOut, 0, Ls*V*spinorSiteSize*sSize);
326  }
327 
328  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION)
329  {
330  ((float*)spinorIn)[0] = 1.0;
331  }
332  else
333  {
334  ((double*)spinorIn)[0] = 1.0;
335  }
336 
337  // start the timer
338  double time0 = -((double)clock());
339 
340  // initialize the QUDA library
341  initQuda(device);
342 
343  // load the gauge field
344  loadGaugeQuda((void*)gauge, &gauge_param);
345 
346  // load the clover term, if desired
347  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
348 
349  // perform the inversion
350  if (multi_shift) {
351  invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param);
352  } else {
353  invertQuda(spinorOut, spinorIn, &inv_param);
354  }
355 
356  // stop the timer
357  time0 += clock();
358  time0 /= CLOCKS_PER_SEC;
359 
360  printfQuda("Device memory used:\n Spinor: %f GiB\n Gauge: %f GiB\n",
361  inv_param.spinorGiB, gauge_param.gaugeGiB);
362  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) printfQuda(" Clover: %f GiB\n", inv_param.cloverGiB);
363  printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n",
364  inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);
365 
366  if (multi_shift) {
367 
368  void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
369 
370  printfQuda("Host residuum checks: \n");
371  for(int i=0; i < inv_param.num_offset; i++) {
372  ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
373 
375  tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
376  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
377  tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
378  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
380  wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0,
381  inv_param.cpu_prec, gauge_param);
382  wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
383  inv_param.cpu_prec, gauge_param);
384  } else {
385  printfQuda("Domain wall not supported for multi-shift\n");
386  exit(-1);
387  }
388 
389  axpy(inv_param.offset[i], spinorOutMulti[i], spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
390  mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
391  double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
392  double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec);
393  double l2r = sqrt(nrm2 / src2);
394 
395  printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
396  i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
397  inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i]);
398  }
399  free(spinorTmp);
400 
401  } else {
402 
403  if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
404 
406  if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)
407  tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, inv_param.cpu_prec, gauge_param);
408  else
409  {
410  int tm_offset = V*spinorSiteSize; //12*spinorRef->Volume();
411  void *evenOut = spinorCheck;
412  void *oddOut = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset);
413 
414  void *evenIn = spinorOut;
415  void *oddIn = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset);
416 
417  tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param);
418  }
420  wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param);
421  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
422  dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
423  } else {
424  printfQuda("Unsupported dslash_type\n");
425  exit(-1);
426  }
427  if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
429  ax(0.5/kappa5, spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
430  } else {
431  ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
432  }
433  }
434 
435  } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {
436 
438  tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
439  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
441  wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0,
442  inv_param.cpu_prec, gauge_param);
443  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
444  dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
445  } else {
446  printfQuda("Unsupported dslash_type\n");
447  exit(-1);
448  }
449 
450  if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
452  ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
453  } else {
454  ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
455 
456  }
457  }
458 
459  }
460 
461  mxpy(spinorIn, spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
462  double nrm2 = norm_2(spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
463  double src2 = norm_2(spinorIn, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
464  double l2r = sqrt(nrm2 / src2);
465 
466  printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
467  inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq);
468 
469  }
470 
471  freeGaugeQuda();
473 
474  // finalize the QUDA library
475  endQuda();
476 
477  // finalize the communications layer
478 #if defined(QMP_COMMS)
479  QMP_finalize_msg_passing();
480 #elif defined(MPI_COMMS)
481  MPI_Finalize();
482 #endif
483 
484  return 0;
485 }