QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hisq_stencil_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 #include "quda.h"
9 #include "gauge_field.h"
10 #include "test_util.h"
11 #include "llfat_reference.h"
12 #include "misc.h"
13 #include "util_quda.h"
14 #include "malloc_quda.h"
15 #include <unitarization_links.h>
16 #include "dslash_quda.h"
17 #include "ks_improved_force.h"
18 
19 #ifdef MULTI_GPU
20 #include "comm_quda.h"
21 #endif
22 
23 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
24 
25 using namespace quda;
26 
27 extern void usage(char** argv);
28 extern bool verify_results;
29 
30 extern int device;
31 extern int xdim, ydim, zdim, tdim;
32 extern int gridsize_from_cmdline[];
33 
35 extern QudaPrecision prec;
36 extern int niter;
37 
38 extern double tadpole_factor;
39 // relativistic correction for naik term
40 extern double eps_naik;
41 // Number of naiks. If eps_naik is 0.0, we only need
42 // to construct one naik.
43 static int n_naiks = 1;
44 
47 
48 static size_t gSize;
49 
50 // The file "generic_ks/fermion_links_hisq_load_milc.c"
51 // within MILC is the ultimate reference for what's going on here.
52 
53 // Unitarization coefficients
54 static double unitarize_eps = 1e-6;
55 static bool reunit_allow_svd = true;
56 static bool reunit_svd_only = false;
57 static double svd_rel_error = 1e-4;
58 static double svd_abs_error = 1e-4;
59 static double max_allowed_error = 1e-11;
60 
61 
62 /*--------------------------------------------------------------------*/
63 // Some notation:
64 // U -- original link, SU(3), copied to "field" from "site"
65 // V -- after 1st level of smearing, non-SU(3)
66 // W -- unitarized, SU(3)
67 // X -- after 2nd level of smearing, non-SU(3)
68 /*--------------------------------------------------------------------*/
69 
70 static void hisq_test()
71 {
72 
74 
76 
78  errorQuda("Precision %d is unsupported in some link fattening routines\n",prec);
79  }
80 
81  cpu_prec = prec;
82  gSize = cpu_prec;
83  qudaGaugeParam = newQudaGaugeParam();
84 
85  qudaGaugeParam.anisotropy = 1.0;
86 
87  // Fix me: must always be set to 1.0 for reasons not yet discerned.
88  // The tadpole coefficient gets encoded directly into the fat link
89  // construct coefficents.
90  qudaGaugeParam.tadpole_coeff = 1.0;
91 
92  qudaGaugeParam.X[0] = xdim;
93  qudaGaugeParam.X[1] = ydim;
94  qudaGaugeParam.X[2] = zdim;
95  qudaGaugeParam.X[3] = tdim;
96 
97  setDims(qudaGaugeParam.X);
98 
99  qudaGaugeParam.cpu_prec = cpu_prec;
100  qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
101 
103  errorQuda("Unsupported gauge order %d", gauge_order);
104 
105  qudaGaugeParam.gauge_order = gauge_order;
106  qudaGaugeParam.type = QUDA_WILSON_LINKS;
107  qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon;
108  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
110  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
111  qudaGaugeParam.ga_pad = 0;
112 
113  // Needed for unitarization, following "unitarize_link_test.cpp"
114  GaugeFieldParam gParam(0, qudaGaugeParam);
115  gParam.pad = 0;
116  gParam.link_type = QUDA_GENERAL_LINKS;
118  gParam.order = gauge_order;
119 
121  // Set up the coefficients for each part of the HISQ stencil //
123 
124  // Reference: "generic_ks/imp_actions/hisq/hisq_action.h",
125  // in QHMC: https://github.com/jcosborn/qhmc/blob/master/lib/qopqdp/hisq.c
126 
127  double u1 = 1.0/tadpole_factor;
128  double u2 = u1*u1;
129  double u4 = u2*u2;
130  double u6 = u4*u2;
131 
132  // First path: create V, W links
133  double act_path_coeff_1[6] = {
134  ( 1.0/8.0), /* one link */
135  u2*( 0.0), /* Naik */
136  u2*(-1.0/8.0)*0.5, /* simple staple */
137  u4*( 1.0/8.0)*0.25*0.5, /* displace link in two directions */
138  u6*(-1.0/8.0)*0.125*(1.0/6.0), /* displace link in three directions */
139  u4*( 0.0) /* Lepage term */
140  };
141 
142  // Second path: create X, long links
143  double act_path_coeff_2[6] = {
144  (( 1.0/8.0)+(2.0*6.0/16.0)+(1.0/8.0)), /* one link */
145  /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
146  (-1.0/24.0), /* Naik */
147  (-1.0/8.0)*0.5, /* simple staple */
148  ( 1.0/8.0)*0.25*0.5, /* displace link in two directions */
149  (-1.0/8.0)*0.125*(1.0/6.0), /* displace link in three directions */
150  (-2.0/16.0) /* Lepage term, correct O(a^2) 2x ASQTAD */
151  };
152 
153  // Paths for epsilon corrections. Not used if n_naiks = 1.
154  double act_path_coeff_3[6] = {
155  ( 1.0/8.0), /* one link b/c of Naik */
156  (-1.0/24.0), /* Naik */
157  0.0, /* simple staple */
158  0.0, /* displace link in two directions */
159  0.0, /* displace link in three directions */
160  0.0 /* Lepage term */
161  };
162 
163 
165  // Set unitarization coefficients //
167 
173  svd_abs_error);
174 
176  // Input links //
178 
179  void* sitelink[4];
180  for(int i=0;i < 4;i++) sitelink[i] = pinned_malloc(V*gaugeSiteSize*gSize);
181 
182  void* milc_sitelink;
183  milc_sitelink = (void*)safe_malloc(4*V*gaugeSiteSize*gSize);
184 
185 
186  // Note: this could be replaced with loading a gauge field
187  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 0); // 0 -> no phases
188  for(int i=0; i<V; ++i){
189  for(int dir=0; dir<4; ++dir){
190  char* src = (char*)sitelink[dir];
191  memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
192  }
193  }
194 
196  // Perform GPU test //
198 
199  // Paths for step 1:
200  void* vlink = pinned_malloc(4*V*gaugeSiteSize*gSize); // V links
201  void* wlink = pinned_malloc(4*V*gaugeSiteSize*gSize); // W links
202 
203  // Paths for step 2:
204  void* fatlink = pinned_malloc(4*V*gaugeSiteSize*gSize); // final fat ("X") links
205  void* longlink = pinned_malloc(4*V*gaugeSiteSize*gSize); // final long links
206 
207  // Place to accumulate Naiks
208  void* fatlink_eps = nullptr;
209  void* longlink_eps = nullptr;
210  if (n_naiks > 1) {
211  fatlink_eps = pinned_malloc(4*V*gaugeSiteSize*gSize); // epsilon fat links
212  longlink_eps = pinned_malloc(4*V*gaugeSiteSize*gSize); // epsilon long naiks
213  }
214 
215  // Tuning run...
216  {
217  printfQuda("Tuning...\n");
218  computeKSLinkQuda(vlink , longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam);
219  }
220 
221  struct timeval t0, t1;
222  printfQuda("Running %d iterations of computation\n", niter);
223  gettimeofday(&t0, NULL);
224  for (int n = 0; n < niter; n++) {
225 
226  // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
227 
228  // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
229  computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam);
230 
231  if (n_naiks > 1) {
232  // Create Naiks, 3rd path table set
233  computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam);
234 
235  // Rescale+copy Naiks into Naik field
236  cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V*4*gaugeSiteSize);
237  cpu_axy(prec, eps_naik, longlink, longlink_eps, V*4*gaugeSiteSize);
238  } else {
239  memset(fatlink, 0, V*4*gaugeSiteSize*gSize);
240  memset(longlink, 0, V*4*gaugeSiteSize*gSize);
241  }
242 
243  // Create X and long links, 2nd path table set
244  computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam);
245 
246  if (n_naiks > 1) {
247  // Add into Naik field
248  cpu_xpy(prec, fatlink, fatlink_eps, V*4*gaugeSiteSize);
249  cpu_xpy(prec, longlink, longlink_eps, V*4*gaugeSiteSize);
250  }
251  }
252  gettimeofday(&t1, NULL);
253 
254  double secs = TDIFF(t0,t1);
255 
257  // Perform CPU Build //
259 
260  void* long_reflink[4]; // Long link for fermion with zero epsilon
261  void* fat_reflink[4]; // Fat link for fermion with zero epsilon
262  for(int i=0;i < 4;i++) {
263  long_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
264  fat_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
265  }
266 
267  void* long_reflink_eps[4]; // Long link for fermion with non-zero epsilon
268  void* fat_reflink_eps[4]; // Fat link for fermion with non-zero epsilon
269  if (n_naiks > 1) {
270  for(int i=0;i < 4;i++) {
271  long_reflink_eps[i] = safe_malloc(V*gaugeSiteSize*gSize);
272  fat_reflink_eps[i] = safe_malloc(V*gaugeSiteSize*gSize);
273  }
274  }
275 
276  if (verify_results) {
277 
278  double* act_paths[3] = { act_path_coeff_1, act_path_coeff_2, act_path_coeff_3 };
279 
280  computeHISQLinksCPU(fat_reflink, long_reflink,
281  fat_reflink_eps, long_reflink_eps,
282  sitelink, &qudaGaugeParam, act_paths, eps_naik);
283 
284  }
285 
287  // Layout change for fatlink, fatlink_eps, longlink, longlink_eps //
289 
290  void* myfatlink [4];
291  void* mylonglink [4];
292  void* myfatlink_eps [4];
293  void* mylonglink_eps [4];
294  for(int i=0; i < 4; i++) {
295 
296  myfatlink [i] = safe_malloc(V*gaugeSiteSize*gSize);
297  mylonglink[i] = safe_malloc(V*gaugeSiteSize*gSize);
298  memset(myfatlink [i], 0, V*gaugeSiteSize*gSize);
299  memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
300 
301  if (n_naiks > 1) {
302  myfatlink_eps [i] = safe_malloc(V*gaugeSiteSize*gSize);
303  mylonglink_eps[i] = safe_malloc(V*gaugeSiteSize*gSize);
304  memset(myfatlink_eps [i], 0, V*gaugeSiteSize*gSize);
305  memset(mylonglink_eps[i], 0, V*gaugeSiteSize*gSize);
306  }
307  }
308 
309  for(int i=0; i < V; i++){
310  for(int dir=0; dir< 4; dir++){
311  char* src = ((char*)fatlink )+ (4*i+dir)*gaugeSiteSize*gSize;
312  char* dst = ((char*)myfatlink [dir]) + i*gaugeSiteSize*gSize;
313  memcpy(dst, src, gaugeSiteSize*gSize);
314 
315  src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
316  dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
317  memcpy(dst, src, gaugeSiteSize*gSize);
318 
319  if (n_naiks > 1) {
320  src = ((char*)fatlink_eps )+ (4*i+dir)*gaugeSiteSize*gSize;
321  dst = ((char*)myfatlink_eps [dir]) + i*gaugeSiteSize*gSize;
322  memcpy(dst, src, gaugeSiteSize*gSize);
323 
324  src = ((char*)longlink_eps)+ (4*i+dir)*gaugeSiteSize*gSize;
325  dst = ((char*)mylonglink_eps[dir]) + i*gaugeSiteSize*gSize;
326  memcpy(dst, src, gaugeSiteSize*gSize);
327  }
328  }
329  }
330 
332  // Perform the verification //
334 
335  if (verify_results) {
336  printfQuda("Checking fat links...\n");
337  int res=1;
338  for(int dir=0; dir<4; dir++){
339  res &= compare_floats(fat_reflink[dir], myfatlink [dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
340  }
341 
342  strong_check_link(myfatlink , "GPU results: ",
343  fat_reflink, "CPU reference results:",
344  V, qudaGaugeParam.cpu_prec);
345 
346  printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
347 
348 
349 
350  printfQuda("Checking long links...\n");
351  res = 1;
352  for(int dir=0; dir<4; ++dir){
353  res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
354  }
355 
356  strong_check_link(mylonglink, "GPU results: ",
357  long_reflink, "CPU reference results:",
358  V, qudaGaugeParam.cpu_prec);
359 
360  printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
361 
362  if (n_naiks > 1) {
363 
364  printfQuda("Checking fat eps_naik links...\n");
365  res=1;
366  for(int dir=0; dir<4; dir++){
367  res &= compare_floats(fat_reflink_eps[dir], myfatlink_eps [dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
368  }
369 
370  strong_check_link(myfatlink_eps , "GPU results: ",
371  fat_reflink_eps, "CPU reference results:",
372  V, qudaGaugeParam.cpu_prec);
373 
374  printfQuda("Fat-link eps_naik test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
375 
376 
377  printfQuda("Checking long eps_naik links...\n");
378  res = 1;
379  for(int dir=0; dir<4; ++dir){
380  res &= compare_floats(long_reflink_eps[dir], mylonglink_eps[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
381  }
382 
383  strong_check_link(mylonglink_eps, "GPU results: ",
384  long_reflink_eps, "CPU reference results:",
385  V, qudaGaugeParam.cpu_prec);
386 
387  printfQuda("Long-link eps_naik test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
388  }
389  }
390 
391  // FIXME: does not include unitarization, extra naiks
392  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
393  long long flops = 61632 * (long long)niter; // Constructing V field
394  // Constructing W field?
395  // Constructing separate Naiks
396  flops += 61632 * (long long)niter; // Constructing X field
397  flops += (252*4)*(long long)niter; // long-link contribution
398 
399  double perf = flops*volume/(secs*1024*1024*1024);
400  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs*1000)/niter, perf);
401 
402  for (int i=0; i < 4; i++) {
403  host_free(myfatlink [i]);
404  host_free(mylonglink[i]);
405  if (n_naiks > 1) {
406  host_free(myfatlink_eps [i]);
407  host_free(mylonglink_eps[i]);
408  }
409  }
410 
411 
412  for(int i=0; i < 4; i++){
413  host_free(sitelink[i]);
414  host_free(fat_reflink[i]);
415  host_free(long_reflink[i]);
416  if (n_naiks > 1) {
417  host_free(fat_reflink_eps[i]);
418  host_free(long_reflink_eps[i]);
419  }
420  }
421 
422  // Clean up GPU compute links
423  host_free(vlink);
424  host_free(wlink);
425  host_free(fatlink);
426  host_free(longlink);
427 
428  if (n_naiks > 1) {
429  host_free(fatlink_eps);
430  host_free(longlink_eps);
431  }
432 
433  if(milc_sitelink) host_free(milc_sitelink);
434 #ifdef MULTI_GPU
436 #endif
437  endQuda();
438 }
439 
440 static void display_test_info()
441 {
442  printfQuda("running the following test:\n");
443 
444  printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n");
445  printfQuda("%s %s %d/%d/%d/ %d %s \n",
448  xdim, ydim, zdim, tdim,
450 
451  printfQuda("Grid partition info: X Y Z T\n");
452  printfQuda(" %d %d %d %d\n",
453  dimPartitioned(0),
454  dimPartitioned(1),
455  dimPartitioned(2),
456  dimPartitioned(3));
457 
458  printfQuda("Number of Naiks: %d\n", n_naiks);
459 
460  return ;
461 
462 }
463 
464 
465 int main(int argc, char **argv)
466 {
467  // for speed
468  xdim=ydim=zdim=tdim=8;
469 
470  //default to 18 reconstruct
473 
474  for (int i = 1; i < argc; i++){
475 
476  if(process_command_line_option(argc, argv, &i) == 0){
477  continue;
478  }
479  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
480  usage(argv);
481  }
482 
483  if (eps_naik != 0.0) { n_naiks = 2; }
484 
485  initComms(argc, argv, gridsize_from_cmdline);
487  hisq_test();
488  finalizeComms();
489 }
490 
491 
static QudaGaugeParam qudaGaugeParam
static bool reunit_allow_svd
static size_t gSize
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
QudaReconstructType link_recon
Definition: test_util.cpp:1605
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53
double anisotropy
Definition: quda.h:38
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
void endQuda(void)
#define pinned_malloc(size)
Definition: malloc_quda.h:67
enum QudaPrecision_s QudaPrecision
int ga_pad
Definition: quda.h:63
QudaGaugeFixed gauge_fix
Definition: quda.h:61
static void hisq_test()
QudaLinkType type
Definition: quda.h:42
#define errorQuda(...)
Definition: util_quda.h:121
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
#define host_free(ptr)
Definition: malloc_quda.h:71
void usage(char **argv)
Definition: test_util.cpp:1783
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
double tadpole_factor
Definition: test_util.cpp:1651
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71
void finalizeComms()
Definition: test_util.cpp:128
void exchange_llfat_cleanup(void)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:741
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:434
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
static QudaGaugeFieldOrder gauge_order
int device
Definition: test_util.cpp:1602
QudaPrecision prec
Definition: test_util.cpp:1608
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1227
void setDims(int *)
Definition: test_util.cpp:151
int niter
Definition: test_util.cpp:1629
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
static double svd_rel_error
double eps_naik
Definition: test_util.cpp:1652
void initQuda(int device)
static void display_test_info()
static bool reunit_svd_only
void cpu_xpy(QudaPrecision prec, void *x, void *y, int size)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
QudaGaugeFieldOrder order
Definition: gauge_field.h:17
int zdim
Definition: test_util.cpp:1617
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
static double svd_abs_error
void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int ydim
Definition: test_util.cpp:1616
int X[4]
Definition: quda.h:36
#define TDIFF(a, b)
#define safe_malloc(size)
Definition: malloc_quda.h:66
int V
Definition: test_util.cpp:27
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
Definition: test_util.cpp:1429
static int n_naiks
void * memset(void *s, int c, size_t n)
int xdim
Definition: test_util.cpp:1615
double tadpole_coeff
Definition: quda.h:39
GaugeFieldParam gParam
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static double unitarize_eps
QudaLinkType link_type
Definition: gauge_field.h:19
void cpu_axy(QudaPrecision prec, double a, void *x, void *y, int size)
#define printfQuda(...)
Definition: util_quda.h:115
QudaTboundary t_boundary
Definition: quda.h:45
int tdim
Definition: test_util.cpp:1618
static double max_allowed_error
unsigned long long flops
Definition: blas_quda.cu:22
int main(int argc, char **argv)
bool verify_results
Definition: test_util.cpp:1643
void * longlink
void * fatlink
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
static QudaPrecision cpu_prec
QudaPrecision cpu_prec
Definition: quda.h:47
#define gaugeSiteSize
Definition: face_gauge.cpp:34
QudaGaugeParam newQudaGaugeParam(void)