QUDA  v1.1.0
A library for QCD on GPUs
hisq_stencil_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 
6 #include "quda.h"
7 #include "gauge_field.h"
8 #include "host_utils.h"
9 #include <command_line_params.h>
10 #include "misc.h"
11 #include "util_quda.h"
12 #include "malloc_quda.h"
13 #include <unitarization_links.h>
14 #include "dslash_quda.h"
15 #include "ks_improved_force.h"
16 
17 #ifdef MULTI_GPU
18 #include "comm_quda.h"
19 #endif
20 
21 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
22 
23 using namespace quda;
24 
25 // Number of naiks. If eps_naik is 0.0, we only need
26 // to construct one naik.
28 
29 // The file "generic_ks/fermion_links_hisq_load_milc.c"
30 // within MILC is the ultimate reference for what's going on here.
31 
32 // Unitarization coefficients
33 static double unitarize_eps = 1e-6;
34 static bool reunit_allow_svd = true;
35 static bool reunit_svd_only = false;
36 static double svd_rel_error = 1e-4;
37 static double svd_abs_error = 1e-4;
38 static double max_allowed_error = 1e-11;
39 
40 
41 /*--------------------------------------------------------------------*/
42 // Some notation:
43 // U -- original link, SU(3), copied to "field" from "site"
44 // V -- after 1st level of smearing, non-SU(3)
45 // W -- unitarized, SU(3)
46 // X -- after 2nd level of smearing, non-SU(3)
47 /*--------------------------------------------------------------------*/
48 
49 static void hisq_test()
50 {
51 
52  QudaGaugeParam qudaGaugeParam;
53 
55 
57  errorQuda("Precision %d is unsupported in some link fattening routines\n",prec);
58  }
59 
60  cpu_prec = prec;
62  qudaGaugeParam = newQudaGaugeParam();
63 
64  qudaGaugeParam.anisotropy = 1.0;
65 
66  // Fix me: must always be set to 1.0 for reasons not yet discerned.
67  // The tadpole coefficient gets encoded directly into the fat link
68  // construct coefficents.
69  qudaGaugeParam.tadpole_coeff = 1.0;
70 
71  qudaGaugeParam.X[0] = xdim;
72  qudaGaugeParam.X[1] = ydim;
73  qudaGaugeParam.X[2] = zdim;
74  qudaGaugeParam.X[3] = tdim;
75 
76  setDims(qudaGaugeParam.X);
77 
78  qudaGaugeParam.cpu_prec = cpu_prec;
79  qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
80 
82  errorQuda("Unsupported gauge order %d", gauge_order);
83 
84  qudaGaugeParam.gauge_order = gauge_order;
85  qudaGaugeParam.type = QUDA_WILSON_LINKS;
86  qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon;
87  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
89  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
90  qudaGaugeParam.ga_pad = 0;
91 
92  // Needed for unitarization, following "unitarize_link_test.cpp"
93  GaugeFieldParam gParam(0, qudaGaugeParam);
94  gParam.pad = 0;
98 
100  // Set up the coefficients for each part of the HISQ stencil //
102 
103  // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
104  // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
105 
106  double u1 = 1.0/tadpole_factor;
107  double u2 = u1*u1;
108  double u4 = u2*u2;
109  double u6 = u4*u2;
110 
111  // First path: create V, W links
112  double act_path_coeff_1[6] = {
113  ( 1.0/8.0), /* one link */
114  u2*( 0.0), /* Naik */
115  u2*(-1.0/8.0)*0.5, /* simple staple */
116  u4*( 1.0/8.0)*0.25*0.5, /* displace link in two directions */
117  u6*(-1.0/8.0)*0.125*(1.0/6.0), /* displace link in three directions */
118  u4*( 0.0) /* Lepage term */
119  };
120 
121  // Second path: create X, long links
122  double act_path_coeff_2[6] = {
123  (( 1.0/8.0)+(2.0*6.0/16.0)+(1.0/8.0)), /* one link */
124  /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
125  (-1.0/24.0), /* Naik */
126  (-1.0/8.0)*0.5, /* simple staple */
127  ( 1.0/8.0)*0.25*0.5, /* displace link in two directions */
128  (-1.0/8.0)*0.125*(1.0/6.0), /* displace link in three directions */
129  (-2.0/16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */
130  };
131 
132  // Paths for epsilon corrections. Not used if n_naiks = 1.
133  double act_path_coeff_3[6] = {
134  ( 1.0/8.0), /* one link b/c of Naik */
135  (-1.0/24.0), /* Naik */
136  0.0, /* simple staple */
137  0.0, /* displace link in two directions */
138  0.0, /* displace link in three directions */
139  0.0 /* Lepage term */
140  };
141 
142 
144  // Set unitarization coefficients //
146 
147  setUnitarizeLinksConstants(unitarize_eps,
148  max_allowed_error,
149  reunit_allow_svd,
150  reunit_svd_only,
151  svd_rel_error,
152  svd_abs_error);
153 
155  // Input links //
157 
158  void* sitelink[4];
159  for (int i = 0; i < 4; i++) sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
160 
161  void* milc_sitelink;
162  milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
163 
164  // Note: this could be replaced with loading a gauge field
165  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 0); // 0 -> no phases
166  for(int i=0; i<V; ++i){
167  for(int dir=0; dir<4; ++dir){
168  char* src = (char*)sitelink[dir];
169  memcpy((char *)milc_sitelink + (i * 4 + dir) * gauge_site_size * host_gauge_data_type_size,
171  }
172  }
173 
175  // Perform GPU test //
177 
178  // Paths for step 1:
179  void *vlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // V links
180  void *wlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // W links
181 
182  // Paths for step 2:
183  void *fatlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final fat ("X") links
184  void *longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // final long links
185 
186  // Place to accumulate Naiks
187  void* fatlink_eps = nullptr;
188  void* longlink_eps = nullptr;
189  if (n_naiks > 1) {
190  fatlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon fat links
191  longlink_eps = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // epsilon long naiks
192  }
193 
194  // Tuning run...
195  {
196  printfQuda("Tuning...\n");
197  computeKSLinkQuda(vlink , longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam);
198  }
199 
200  struct timeval t0, t1;
201  printfQuda("Running %d iterations of computation\n", niter);
202  gettimeofday(&t0, NULL);
203  for (int n = 0; n < niter; n++) {
204 
205  // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
206 
207  // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
208  computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam);
209 
210  if (n_naiks > 1) {
211  // Create Naiks, 3rd path table set
212  computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam);
213 
214  // Rescale+copy Naiks into Naik field
215  cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
216  cpu_axy(prec, eps_naik, longlink, longlink_eps, V * 4 * gauge_site_size);
217  } else {
218  memset(fatlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
219  memset(longlink, 0, V * 4 * gauge_site_size * host_gauge_data_type_size);
220  }
221 
222  // Create X and long links, 2nd path table set
223  computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam);
224 
225  if (n_naiks > 1) {
226  // Add into Naik field
227  cpu_xpy(prec, fatlink, fatlink_eps, V * 4 * gauge_site_size);
228  cpu_xpy(prec, longlink, longlink_eps, V * 4 * gauge_site_size);
229  }
230  }
231  gettimeofday(&t1, NULL);
232 
233  double secs = TDIFF(t0,t1);
234 
236  // Perform CPU Build //
238 
239  void* long_reflink[4]; // Long link for fermion with zero epsilon
240  void* fat_reflink[4]; // Fat link for fermion with zero epsilon
241  for(int i=0;i < 4;i++) {
244  }
245 
246  void* long_reflink_eps[4]; // Long link for fermion with non-zero epsilon
247  void* fat_reflink_eps[4]; // Fat link for fermion with non-zero epsilon
248  if (n_naiks > 1) {
249  for(int i=0;i < 4;i++) {
250  long_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
251  fat_reflink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
252  }
253  }
254 
255  if (verify_results) {
256 
257  double* act_paths[3] = { act_path_coeff_1, act_path_coeff_2, act_path_coeff_3 };
258 
259  computeHISQLinksCPU(fat_reflink, long_reflink,
260  fat_reflink_eps, long_reflink_eps,
261  sitelink, &qudaGaugeParam, act_paths, eps_naik);
262 
263  }
264 
266  // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
268 
269  void* myfatlink [4];
270  void* mylonglink [4];
271  void* myfatlink_eps [4];
272  void* mylonglink_eps [4];
273  for(int i=0; i < 4; i++) {
274 
277  memset(myfatlink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
278  memset(mylonglink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
279 
280  if (n_naiks > 1) {
281  myfatlink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
282  mylonglink_eps[i] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
283  memset(myfatlink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size);
284  memset(mylonglink_eps[i], 0, V * gauge_site_size * host_gauge_data_type_size);
285  }
286  }
287 
288  for(int i=0; i < V; i++){
289  for(int dir=0; dir< 4; dir++){
290  char *src = ((char *)fatlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
291  char *dst = ((char *)myfatlink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
292  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
293 
294  src = ((char *)longlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
295  dst = ((char *)mylonglink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
296  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
297 
298  if (n_naiks > 1) {
299  src = ((char *)fatlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
300  dst = ((char *)myfatlink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size;
301  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
302 
303  src = ((char *)longlink_eps) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
304  dst = ((char *)mylonglink_eps[dir]) + i * gauge_site_size * host_gauge_data_type_size;
305  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
306  }
307  }
308  }
309 
311  // Perform the verification //
313 
314  if (verify_results) {
315  printfQuda("Checking fat links...\n");
316  int res=1;
317  for(int dir=0; dir<4; dir++){
318  res &= compare_floats(fat_reflink[dir], myfatlink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
319  }
320 
321  strong_check_link(myfatlink , "GPU results: ",
322  fat_reflink, "CPU reference results:",
323  V, qudaGaugeParam.cpu_prec);
324 
325  printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
326 
327 
328 
329  printfQuda("Checking long links...\n");
330  res = 1;
331  for(int dir=0; dir<4; ++dir){
332  res &= compare_floats(long_reflink[dir], mylonglink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
333  }
334 
335  strong_check_link(mylonglink, "GPU results: ",
336  long_reflink, "CPU reference results:",
337  V, qudaGaugeParam.cpu_prec);
338 
339  printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
340 
341  if (n_naiks > 1) {
342 
343  printfQuda("Checking fat eps_naik links...\n");
344  res=1;
345  for(int dir=0; dir<4; dir++){
346  res &= compare_floats(fat_reflink_eps[dir], myfatlink_eps[dir], V * gauge_site_size, 1e-3,
347  qudaGaugeParam.cpu_prec);
348  }
349 
350  strong_check_link(myfatlink_eps , "GPU results: ",
351  fat_reflink_eps, "CPU reference results:",
352  V, qudaGaugeParam.cpu_prec);
353 
354  printfQuda("Fat-link eps_naik test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
355 
356 
357  printfQuda("Checking long eps_naik links...\n");
358  res = 1;
359  for(int dir=0; dir<4; ++dir){
360  res &= compare_floats(long_reflink_eps[dir], mylonglink_eps[dir], V * gauge_site_size, 1e-3,
361  qudaGaugeParam.cpu_prec);
362  }
363 
364  strong_check_link(mylonglink_eps, "GPU results: ",
365  long_reflink_eps, "CPU reference results:",
366  V, qudaGaugeParam.cpu_prec);
367 
368  printfQuda("Long-link eps_naik test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
369  }
370  }
371 
372  // FIXME: does not include unitarization, extra naiks
373  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
374  long long flops = 61632 * (long long)niter; // Constructing V field
375  // Constructing W field?
376  // Constructing separate Naiks
377  flops += 61632 * (long long)niter; // Constructing X field
378  flops += (252*4)*(long long)niter; // long-link contribution
379 
380  double perf = flops*volume/(secs*1024*1024*1024);
381  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs*1000)/niter, perf);
382 
383  for (int i=0; i < 4; i++) {
384  host_free(myfatlink [i]);
385  host_free(mylonglink[i]);
386  if (n_naiks > 1) {
387  host_free(myfatlink_eps [i]);
388  host_free(mylonglink_eps[i]);
389  }
390  }
391 
392 
393  for(int i=0; i < 4; i++){
394  host_free(sitelink[i]);
395  host_free(fat_reflink[i]);
396  host_free(long_reflink[i]);
397  if (n_naiks > 1) {
398  host_free(fat_reflink_eps[i]);
399  host_free(long_reflink_eps[i]);
400  }
401  }
402 
403  // Clean up GPU compute links
404  host_free(vlink);
405  host_free(wlink);
406  host_free(fatlink);
407  host_free(longlink);
408 
409  if (n_naiks > 1) {
410  host_free(fatlink_eps);
411  host_free(longlink_eps);
412  }
413 
414  if(milc_sitelink) host_free(milc_sitelink);
415 #ifdef MULTI_GPU
417 #endif
418  endQuda();
419 }
420 
421 static void display_test_info()
422 {
423  printfQuda("running the following test:\n");
424 
425  printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n");
426  printfQuda("%s %s %d/%d/%d/ %d %s \n",
429  xdim, ydim, zdim, tdim,
430  get_gauge_order_str(gauge_order));
431 
432  printfQuda("Grid partition info: X Y Z T\n");
433  printfQuda(" %d %d %d %d\n",
434  dimPartitioned(0),
435  dimPartitioned(1),
436  dimPartitioned(2),
437  dimPartitioned(3));
438 
439  printfQuda("Number of Naiks: %d\n", n_naiks);
440 }
441 
442 
443 int main(int argc, char **argv)
444 {
445  // for speed
446  xdim=ydim=zdim=tdim=8;
447 
448  //default to 18 reconstruct
451 
452  auto app = make_app();
453  // app->get_formatter()->column_width(40);
454  // add_eigen_option_group(app);
455  // add_deflation_option_group(app);
456  // add_multigrid_option_group(app);
457  try {
458  app->parse(argc, argv);
459  } catch (const CLI::ParseError &e) {
460  return app->exit(e);
461  }
462 
463  if (eps_naik != 0.0) { n_naiks = 2; }
464 
465  initComms(argc, argv, gridsize_from_cmdline);
467  hisq_test();
468  finalizeComms();
469 }
470 
471 
void display_test_info()
std::shared_ptr< QUDAApp > make_app(std::string app_description, std::string app_name)
QudaReconstructType link_recon
int niter
int device_ordinal
int & ydim
bool verify_results
int & zdim
double eps_naik
QudaPrecision prec
int n_naiks
int & tdim
int & xdim
std::array< int, 4 > gridsize_from_cmdline
int V
Definition: host_utils.cpp:37
void * memset(void *s, int c, size_t n)
void setDims(int *)
Definition: host_utils.cpp:315
@ QUDA_STAGGERED_PHASE_MILC
Definition: enum_quda.h:516
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_ANTI_PERIODIC_T
Definition: enum_quda.h:56
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
@ QUDA_GAUGE_FIXED_NO
Definition: enum_quda.h:80
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_QUARTER_PRECISION
Definition: enum_quda.h:62
@ QUDA_HALF_PRECISION
Definition: enum_quda.h:63
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
void exchange_llfat_cleanup(void)
#define gauge_site_size
Definition: face_gauge.cpp:34
QudaGaugeFieldOrder gauge_order
GaugeFieldParam gParam
int main(int argc, char **argv)
#define TDIFF(a, b)
void cpu_axy(QudaPrecision prec, double a, void *x, void *y, int size)
Definition: host_blas.cpp:74
void cpu_xpy(QudaPrecision prec, void *x, void *y, int size)
Definition: host_blas.cpp:87
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: host_utils.cpp:889
size_t host_gauge_data_type_size
Definition: host_utils.cpp:65
int dimPartitioned(int dim)
Definition: host_utils.cpp:376
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
Definition: host_utils.cpp:255
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
void finalizeComms()
Definition: host_utils.cpp:292
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
QudaPrecision & cpu_prec
Definition: host_utils.cpp:57
void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pinned_malloc(size)
Definition: malloc_quda.h:107
#define host_free(ptr)
Definition: malloc_quda.h:115
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:26
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:54
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:68
unsigned long long flops
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Main header file for the QUDA library.
QudaGaugeParam newQudaGaugeParam(void)
void initQuda(int device)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void endQuda(void)
double tadpole_factor
double tadpole_coeff
Definition: quda.h:38
double anisotropy
Definition: quda.h:37
QudaReconstructType reconstruct
Definition: quda.h:49
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
QudaPrecision cuda_prec_sloppy
Definition: quda.h:51
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaGaugeFixed gauge_fix
Definition: quda.h:63
QudaGaugeFieldOrder gauge_order
Definition: quda.h:42
QudaPrecision cuda_prec
Definition: quda.h:48
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
QudaTboundary t_boundary
Definition: quda.h:44
QudaGaugeFieldOrder order
Definition: gauge_field.h:51
QudaLinkType link_type
Definition: gauge_field.h:53
QudaGhostExchange ghostExchange
Definition: lattice_field.h:77
#define printfQuda(...)
Definition: util_quda.h:114
#define errorQuda(...)
Definition: util_quda.h:120