QUDA  v1.1.0
A library for QCD on GPUs
llfat_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 
6 #include "quda.h"
7 #include "host_utils.h"
8 #include "llfat_utils.h"
9 #include <command_line_params.h>
10 #include "misc.h"
11 #include "util_quda.h"
12 #include "malloc_quda.h"
13 
14 #ifdef MULTI_GPU
15 #include "comm_quda.h"
16 #endif
17 
18 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
19 
21 
22 static void llfat_test()
23 {
24  QudaGaugeParam qudaGaugeParam;
25 #ifdef MULTI_GPU
26  void* ghost_sitelink[4];
27  void* ghost_sitelink_diag[16];
28 #endif
29 
31 
32  cpu_prec = prec;
34  qudaGaugeParam = newQudaGaugeParam();
35 
36  qudaGaugeParam.anisotropy = 1.0;
37 
38  qudaGaugeParam.X[0] = xdim;
39  qudaGaugeParam.X[1] = ydim;
40  qudaGaugeParam.X[2] = zdim;
41  qudaGaugeParam.X[3] = tdim;
42 
43  setDims(qudaGaugeParam.X);
44 
45  qudaGaugeParam.cpu_prec = cpu_prec;
46  qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
47  qudaGaugeParam.gauge_order = gauge_order;
48  qudaGaugeParam.type = QUDA_WILSON_LINKS;
49  qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon;
50  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
52  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
53  qudaGaugeParam.ga_pad = 0;
54 
56  void *longlink = pinned_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
57 
58  void* sitelink[4];
59  for (int i = 0; i < 4; i++) sitelink[i] = pinned_malloc(V * gauge_site_size * host_gauge_data_type_size);
60 
61  void* sitelink_ex[4];
62  for (int i = 0; i < 4; i++) sitelink_ex[i] = pinned_malloc(V_ex * gauge_site_size * host_gauge_data_type_size);
63 
64  void* milc_sitelink;
65  milc_sitelink = (void *)safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
66 
67  void* milc_sitelink_ex;
68  milc_sitelink_ex = (void *)safe_malloc(4 * V_ex * gauge_site_size * host_gauge_data_type_size);
69 
70  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);
71 
73  for(int i=0; i<V; ++i){
74  for(int dir=0; dir<4; ++dir){
75  char* src = (char*)sitelink[dir];
76  memcpy((char *)milc_sitelink + (i * 4 + dir) * gauge_site_size * host_gauge_data_type_size,
78  }
79  }
80  }
81 
82  int X1=Z[0];
83  int X2=Z[1];
84  int X3=Z[2];
85  int X4=Z[3];
86 
87  for(int i=0; i < V_ex; i++){
88  int sid = i;
89  int oddBit=0;
90  if(i >= Vh_ex){
91  sid = i - Vh_ex;
92  oddBit = 1;
93  }
94 
95  int za = sid/E1h;
96  int x1h = sid - za*E1h;
97  int zb = za/E2;
98  int x2 = za - zb*E2;
99  int x4 = zb/E3;
100  int x3 = zb - x4*E3;
101  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
102  int x1 = 2*x1h + x1odd;
103 
104 
105  if( x1< 2 || x1 >= X1 +2
106  || x2< 2 || x2 >= X2 +2
107  || x3< 2 || x3 >= X3 +2
108  || x4< 2 || x4 >= X4 +2){
109 #ifdef MULTI_GPU
110  continue;
111 #endif
112  }
113 
114  x1 = (x1 - 2 + X1) % X1;
115  x2 = (x2 - 2 + X2) % X2;
116  x3 = (x3 - 2 + X3) % X3;
117  x4 = (x4 - 2 + X4) % X4;
118 
119  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
120  if(oddBit){
121  idx += Vh;
122  }
123  for(int dir= 0; dir < 4; dir++){
124  char* src = (char*)sitelink[dir];
125  char* dst = (char*)sitelink_ex[dir];
126  memcpy(dst + i * gauge_site_size * host_gauge_data_type_size,
128 
129  // milc ordering
130  memcpy((char *)milc_sitelink_ex + (i * 4 + dir) * gauge_site_size * host_gauge_data_type_size,
132  }//dir
133  }//i
134 
135 
136  double act_path_coeff[6];
137  for(int i=0;i < 6;i++){
138  act_path_coeff[i]= 0.1*i;
139  }
140 
141 
142  //only record the last call's performance
143  //the first one is for creating the cpu/cuda data structures
144  struct timeval t0, t1;
145 
146  void* longlink_ptr = longlink;
147  {
148  printfQuda("Tuning...\n");
149  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam);
150  }
151 
152  printfQuda("Running %d iterations of computation\n", niter);
153  gettimeofday(&t0, NULL);
154  for (int i=0; i<niter; i++)
155  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam);
156  gettimeofday(&t1, NULL);
157 
158  double secs = TDIFF(t0,t1);
159 
160  void* fat_reflink[4];
161  void* long_reflink[4];
162  for(int i=0;i < 4;i++){
165  }
166 
167  if (verify_results){
168 
169  //FIXME: we have this complication because references takes coeff as float/double
170  // depending on the precision while the GPU code aways take coeff as double
171  void* coeff;
172  double coeff_dp[6];
173  float coeff_sp[6];
174  for (int i=0; i < 6;i++) coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
175  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void*)coeff_dp : (void*)coeff_sp;
176 
177 #ifdef MULTI_GPU
178  int optflag = 0;
179  //we need x,y,z site links in the back and forward T slice
180  // so it is 3*2*Vs_t
181  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
182  for (int i = 0; i < 4; i++)
183  ghost_sitelink[i] = safe_malloc(8 * Vs[i] * gauge_site_size * host_gauge_data_type_size);
184 
185  /*
186  nu | |
187  |_____|
188  mu
189  */
190 
191  for(int nu=0;nu < 4;nu++){
192  for(int mu=0; mu < 4;mu++){
193  if(nu == mu){
194  ghost_sitelink_diag[nu*4+mu] = NULL;
195  }else{
196  //the other directions
197  int dir1, dir2;
198  for(dir1= 0; dir1 < 4; dir1++){
199  if(dir1 !=nu && dir1 != mu){
200  break;
201  }
202  }
203  for(dir2=0; dir2 < 4; dir2++){
204  if(dir2 != nu && dir2 != mu && dir2 != dir1){
205  break;
206  }
207  }
208  ghost_sitelink_diag[nu * 4 + mu] = safe_malloc(Z[dir1] * Z[dir2] * gauge_site_size * host_gauge_data_type_size);
209  memset(ghost_sitelink_diag[nu * 4 + mu], 0, Z[dir1] * Z[dir2] * gauge_site_size * host_gauge_data_type_size);
210  }
211 
212  }
213  }
214 
215  exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
216  llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
217 
218  {
219  int R[4] = {2,2,2,2};
220  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
221  computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
222  }
223 #else
224  llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
225  computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
226 #endif
227 
228  }//verify_results
229 
230  //format change for fatlink and longlink
231  void* myfatlink[4];
232  void* mylonglink[4];
233  for(int i=0; i < 4; i++){
236  memset(myfatlink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
237  memset(mylonglink[i], 0, V * gauge_site_size * host_gauge_data_type_size);
238  }
239 
240  for(int i=0; i < V; i++){
241  for(int dir=0; dir< 4; dir++){
242  char *src = ((char *)fatlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
243  char *dst = ((char *)myfatlink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
244  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
245 
246  src = ((char *)longlink) + (4 * i + dir) * gauge_site_size * host_gauge_data_type_size;
247  dst = ((char *)mylonglink[dir]) + i * gauge_site_size * host_gauge_data_type_size;
248  memcpy(dst, src, gauge_site_size * host_gauge_data_type_size);
249  }
250  }
251 
252  if (verify_results) {
253  printfQuda("Checking fat links...\n");
254  int res=1;
255  for(int dir=0; dir<4; dir++){
256  res &= compare_floats(fat_reflink[dir], myfatlink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
257  }
258 
259  strong_check_link(myfatlink, "GPU results: ",
260  fat_reflink, "CPU reference results:",
261  V, qudaGaugeParam.cpu_prec);
262 
263  printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
264 
265  printfQuda("Checking long links...\n");
266  res = 1;
267  for(int dir=0; dir<4; ++dir){
268  res &= compare_floats(long_reflink[dir], mylonglink[dir], V * gauge_site_size, 1e-3, qudaGaugeParam.cpu_prec);
269  }
270 
271  strong_check_link(mylonglink, "GPU results: ",
272  long_reflink, "CPU reference results:",
273  V, qudaGaugeParam.cpu_prec);
274 
275  printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
276  }
277 
278  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
279  long long flops= 61632 * (long long)niter;
280  flops += (252*4)*(long long)niter; // long-link contribution
281 
282  double perf = flops*volume/(secs*1024*1024*1024);
283  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs*1000)/niter, perf);
284 
285  for (int i=0; i < 4; i++) {
286  host_free(myfatlink[i]);
287  host_free(mylonglink[i]);
288  }
289 
290 #ifdef MULTI_GPU
291  if (verify_results){
292  for(int i=0; i<4; i++){
293  host_free(ghost_sitelink[i]);
294  for(int j=0;j <4; j++){
295  if (i==j) continue;
296  host_free(ghost_sitelink_diag[i*4+j]);
297  }
298  }
299  }
300 #endif
301 
302  for(int i=0; i < 4; i++){
303  host_free(sitelink[i]);
304  host_free(sitelink_ex[i]);
305  host_free(fat_reflink[i]);
306  host_free(long_reflink[i]);
307  }
308  host_free(fatlink);
309  host_free(longlink);
310  if(milc_sitelink) host_free(milc_sitelink);
311  if(milc_sitelink_ex) host_free(milc_sitelink_ex);
312 #ifdef MULTI_GPU
314 #endif
315  endQuda();
316 }
317 
318 static void display_test_info()
319 {
320  printfQuda("running the following test:\n");
321 
322  printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n");
323  printfQuda("%s %s %d/%d/%d/ %d %s \n",
326  xdim, ydim, zdim, tdim,
327  get_gauge_order_str(gauge_order));
328 
329  printfQuda("Grid partition info: X Y Z T\n");
330  printfQuda(" %d %d %d %d\n",
331  dimPartitioned(0),
332  dimPartitioned(1),
333  dimPartitioned(2),
334  dimPartitioned(3));
335 
336  return ;
337 
338 }
339 
340 
341 int main(int argc, char **argv)
342 {
343 
344  //default to 18 reconstruct, 8^3 x 8
346  xdim=ydim=zdim=tdim=8;
348 
349  // command line options
350  auto app = make_app();
351  // add_eigen_option_group(app);
352  // add_deflation_option_group(app);
353  // add_multigrid_option_group(app);
354  CLI::TransformPairs<QudaGaugeFieldOrder> gauge_order_map {{"milc", QUDA_MILC_GAUGE_ORDER},
355  {"qdp", QUDA_QDP_GAUGE_ORDER}};
356  app->add_option("--gauge-order", gauge_order, "")->transform(CLI::QUDACheckedTransformer(gauge_order_map));
357  try {
358  app->parse(argc, argv);
359  } catch (const CLI::ParseError &e) {
360  return app->exit(e);
361  }
362 
363  initComms(argc, argv, gridsize_from_cmdline);
365  llfat_test();
366  finalizeComms();
367 }
368 
369 
void display_test_info()
std::shared_ptr< QUDAApp > make_app(std::string app_description, std::string app_name)
QudaReconstructType link_recon
int niter
int device_ordinal
int & ydim
bool verify_results
double mu
int & zdim
QudaPrecision prec
int & tdim
int & xdim
std::array< int, 4 > gridsize_from_cmdline
int Vh
Definition: host_utils.cpp:38
int Z[4]
Definition: host_utils.cpp:36
int V
Definition: host_utils.cpp:37
void * memset(void *s, int c, size_t n)
void setDims(int *)
Definition: host_utils.cpp:315
@ QUDA_STAGGERED_PHASE_MILC
Definition: enum_quda.h:516
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_ANTI_PERIODIC_T
Definition: enum_quda.h:56
@ QUDA_GAUGE_FIXED_NO
Definition: enum_quda.h:80
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:44
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_WILSON_LINKS
Definition: enum_quda.h:30
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
Definition: face_gauge.cpp:644
void exchange_llfat_cleanup(void)
#define gauge_site_size
Definition: face_gauge.cpp:34
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
Definition: face_gauge.cpp:512
int Vh_ex
Definition: host_utils.cpp:46
QudaGaugeFieldOrder gauge_order
int Vs_y
Definition: host_utils.cpp:39
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: host_utils.cpp:889
size_t host_gauge_data_type_size
Definition: host_utils.cpp:65
int dimPartitioned(int dim)
Definition: host_utils.cpp:376
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
Definition: host_utils.cpp:255
int E1h
Definition: host_utils.cpp:44
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
void finalizeComms()
Definition: host_utils.cpp:292
int Vs_x
Definition: host_utils.cpp:39
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
int Vs_t
Definition: host_utils.cpp:39
int Vs_z
Definition: host_utils.cpp:39
QudaPrecision & cpu_prec
Definition: host_utils.cpp:57
int E2
Definition: host_utils.cpp:44
int E3
Definition: host_utils.cpp:44
int V_ex
Definition: host_utils.cpp:46
void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
int main(int argc, char **argv)
Definition: llfat_test.cpp:341
#define TDIFF(a, b)
Definition: llfat_test.cpp:18
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pinned_malloc(size)
Definition: malloc_quda.h:107
#define host_free(ptr)
Definition: malloc_quda.h:115
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:26
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:54
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:68
unsigned long long flops
Main header file for the QUDA library.
QudaGaugeParam newQudaGaugeParam(void)
void initQuda(int device)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void endQuda(void)
double anisotropy
Definition: quda.h:37
QudaReconstructType reconstruct
Definition: quda.h:49
int ga_pad
Definition: quda.h:65
QudaLinkType type
Definition: quda.h:41
QudaPrecision cuda_prec_sloppy
Definition: quda.h:51
QudaReconstructType reconstruct_sloppy
Definition: quda.h:52
QudaGaugeFixed gauge_fix
Definition: quda.h:63
QudaGaugeFieldOrder gauge_order
Definition: quda.h:42
QudaPrecision cuda_prec
Definition: quda.h:48
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:73
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
QudaTboundary t_boundary
Definition: quda.h:44
#define printfQuda(...)
Definition: util_quda.h:114