QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
llfat_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 #include "quda.h"
9 #include "test_util.h"
10 #include "llfat_reference.h"
11 #include "misc.h"
12 #include "util_quda.h"
13 
14 #ifdef MULTI_GPU
15 #include "face_quda.h"
16 #include "comm_quda.h"
17 #endif
18 
19 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
20 
21 extern void usage(char** argv);
22 static int verify_results = 0;
23 
24 extern int device;
25 extern int test_type;
26 extern int xdim, ydim, zdim, tdim;
27 extern int gridsize_from_cmdline[];
28 
30 extern QudaPrecision prec;
33 
34 static size_t gSize;
35 
36 static int
37 llfat_test(int test)
38 {
39 
40  QudaGaugeParam qudaGaugeParam;
41 #ifdef MULTI_GPU
42  void* ghost_sitelink[4];
43  void* ghost_sitelink_diag[16];
44 #endif
45 
46 
48 
49  cpu_prec = prec;
50  gSize = cpu_prec;
51  qudaGaugeParam = newQudaGaugeParam();
52 
53  qudaGaugeParam.anisotropy = 1.0;
54 
55  qudaGaugeParam.X[0] = xdim;
56  qudaGaugeParam.X[1] = ydim;
57  qudaGaugeParam.X[2] = zdim;
58  qudaGaugeParam.X[3] = tdim;
59 
60  setDims(qudaGaugeParam.X);
61 
62  qudaGaugeParam.cpu_prec = cpu_prec;
63  qudaGaugeParam.cuda_prec = prec;
64  qudaGaugeParam.gauge_order = gauge_order;
65  qudaGaugeParam.type=QUDA_WILSON_LINKS;
66  qudaGaugeParam.reconstruct = link_recon;
67  /*
68  qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
69  | QUDA_FAT_PRESERVE_GPU_GAUGE
70  | QUDA_FAT_PRESERVE_COMM_MEM;
71  */
72  qudaGaugeParam.preserve_gauge =0;
73  void* fatlink;
74  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
75  errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
76  }
77 
78  void* sitelink[4];
79  for(int i=0;i < 4;i++){
80  if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
81  errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
82  }
83  }
84 
85  void* sitelink_ex[4];
86  for(int i=0;i < 4;i++){
87  if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
88  errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
89  }
90  }
91 
92 
93  void* milc_sitelink;
94  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
95  if(milc_sitelink == NULL){
96  errorQuda("ERROR: allocating milc_sitelink failed\n");
97  }
98 
99  void* milc_sitelink_ex;
100  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
101  if(milc_sitelink_ex == NULL){
102  errorQuda("Error: allocating milc_sitelink failed\n");
103  }
104 
105 
106 
107  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);
108 
110  for(int i=0; i<V; ++i){
111  for(int dir=0; dir<4; ++dir){
112  char* src = (char*)sitelink[dir];
113  memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
114  }
115  }
116  }
117 
118  int X1=Z[0];
119  int X2=Z[1];
120  int X3=Z[2];
121  int X4=Z[3];
122 
123  for(int i=0; i < V_ex; i++){
124  int sid = i;
125  int oddBit=0;
126  if(i >= Vh_ex){
127  sid = i - Vh_ex;
128  oddBit = 1;
129  }
130 
131  int za = sid/E1h;
132  int x1h = sid - za*E1h;
133  int zb = za/E2;
134  int x2 = za - zb*E2;
135  int x4 = zb/E3;
136  int x3 = zb - x4*E3;
137  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
138  int x1 = 2*x1h + x1odd;
139 
140 
141  if( x1< 2 || x1 >= X1 +2
142  || x2< 2 || x2 >= X2 +2
143  || x3< 2 || x3 >= X3 +2
144  || x4< 2 || x4 >= X4 +2){
145 #ifdef MULTI_GPU
146  continue;
147 #endif
148  }
149 
150 
151 
152  x1 = (x1 - 2 + X1) % X1;
153  x2 = (x2 - 2 + X2) % X2;
154  x3 = (x3 - 2 + X3) % X3;
155  x4 = (x4 - 2 + X4) % X4;
156 
157  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
158  if(oddBit){
159  idx += Vh;
160  }
161  for(int dir= 0; dir < 4; dir++){
162  char* src = (char*)sitelink[dir];
163  char* dst = (char*)sitelink_ex[dir];
164  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
165 
166  // milc ordering
167  memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
168  }//dir
169  }//i
170 
171 
172  double act_path_coeff[6];
173  for(int i=0;i < 6;i++){
174  act_path_coeff[i]= 0.1*i;
175  }
176 
177 
178  //only record the last call's performance
179  //the first one is for creating the cpu/cuda data structures
180  struct timeval t0, t1;
181 
182  for(int i=0;i < 2;i++){
183  gettimeofday(&t0, NULL);
185  if(test == 0){
186  computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam,
188  }else{
189  computeFatLinkQuda(fatlink, sitelink_ex, act_path_coeff, &qudaGaugeParam,
191  }
192  }else if(gauge_order == QUDA_MILC_GAUGE_ORDER){
193  if(test == 0){
194  computeFatLinkQuda(fatlink, (void**)milc_sitelink, act_path_coeff, &qudaGaugeParam,
196  }else{
197  computeFatLinkQuda(fatlink, (void**)milc_sitelink_ex, act_path_coeff, &qudaGaugeParam,
199  }
200  }
201  gettimeofday(&t1, NULL);
202  }
203 
204  double secs = TDIFF(t0,t1);
205 
206  void* reflink[4];
207  for(int i=0;i < 4;i++){
208  reflink[i] = malloc(V*gaugeSiteSize*gSize);
209  if(reflink[i] == NULL){
210  errorQuda("ERROR; allocate reflink[%d] failed\n", i);
211  }
212  }
213 
214  if (verify_results){
215 
216  //FIXME: we have this compplication because references takes coeff as float/double
217  // depending on the precision while the GPU code aways take coeff as double
218  void* coeff;
219  double coeff_dp[6];
220  float coeff_sp[6];
221  for(int i=0;i < 6;i++){
222  coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
223  }
225  coeff = coeff_dp;
226  }else{
227  coeff = coeff_sp;
228  }
229 #ifdef MULTI_GPU
230  int optflag = 0;
231  //we need x,y,z site links in the back and forward T slice
232  // so it is 3*2*Vs_t
233  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
234  for(int i=0;i < 4; i++){
235  ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
236  if (ghost_sitelink[i] == NULL){
237  printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
238  exit(1);
239  }
240  }
241 
242  /*
243  nu | |
244  |_____|
245  mu
246  */
247 
248  for(int nu=0;nu < 4;nu++){
249  for(int mu=0; mu < 4;mu++){
250  if(nu == mu){
251  ghost_sitelink_diag[nu*4+mu] = NULL;
252  }else{
253  //the other directions
254  int dir1, dir2;
255  for(dir1= 0; dir1 < 4; dir1++){
256  if(dir1 !=nu && dir1 != mu){
257  break;
258  }
259  }
260  for(dir2=0; dir2 < 4; dir2++){
261  if(dir2 != nu && dir2 != mu && dir2 != dir1){
262  break;
263  }
264  }
265  ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
266  if(ghost_sitelink_diag[nu*4+mu] == NULL){
267  errorQuda("malloc failed for ghost_sitelink_diag\n");
268  }
269 
270  memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
271  }
272 
273  }
274  }
275 
276  exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
277  llfat_reference_mg(reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
278 #else
279  llfat_reference(reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
280 #endif
281 
282  }//verify_results
283 
284  //format change for fatlink
285  void* myfatlink[4];
286  for(int i=0;i < 4;i++){
287  myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
288  if(myfatlink[i] == NULL){
289  printf("Error: malloc failed for myfatlink[%d]\n", i);
290  exit(1);
291  }
292  memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
293  }
294 
295  for(int i=0;i < V; i++){
296  for(int dir=0; dir< 4; dir++){
297  char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
298  char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
299  memcpy(dst, src, gaugeSiteSize*gSize);
300  }
301  }
302 
303  int res=1;
304  for(int i=0;i < 4;i++){
305  res &= compare_floats(reflink[i], myfatlink[i], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
306  }
307  int accuracy_level;
308 
309  accuracy_level = strong_check_link(myfatlink, "GPU results: ",
310  reflink, "CPU reference results:",
311  V, qudaGaugeParam.cpu_prec);
312 
313  printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
314  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
315  int flops= 61632;
316  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
317  printfQuda("fatlink computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);
318 
319 
320  for(int i=0;i < 4;i++){
321  free(myfatlink[i]);
322  }
323 
324  if (res == 0){//failed
325  printfQuda("\n");
326  printfQuda("Warning: your test failed. \n");
327  printfQuda(" Did you use --verify?\n");
328  printfQuda(" Did you check the GPU health by running cuda memtest?\n");
329  }
330 
331 #ifdef MULTI_GPU
332  if (verify_results){
333  int i;
334  for(i=0;i < 4;i++){
335  free(ghost_sitelink[i]);
336  }
337  for(i=0;i <4; i++){
338  for(int j=0;j <4; j++){
339  if (i==j){
340  continue;
341  }
342  free(ghost_sitelink_diag[i*4+j]);
343  }
344  }
345  }
346 #endif
347 
348  for(int i=0;i < 4; i++){
349  cudaFreeHost(sitelink[i]);
350  cudaFreeHost(sitelink_ex[i]);
351  free(reflink[i]);
352  }
353  cudaFreeHost(fatlink);
354  if(milc_sitelink) free(milc_sitelink);
355  if(milc_sitelink_ex) free(milc_sitelink_ex);
356 #ifdef MULTI_GPU
358 #endif
359  endQuda();
360 
361  return accuracy_level;
362 
363 }
364 
365 static void
366 display_test_info(int test)
367 {
368  printfQuda("running the following test:\n");
369 
370  printfQuda("link_precision link_reconstruct space_dimension T_dimension Test Ordering\n");
371  printfQuda("%s %s %d/%d/%d/ %d %d %s \n",
374  xdim, ydim, zdim, tdim, test,
376 
377 #ifdef MULTI_GPU
378  printfQuda("Grid partition info: X Y Z T\n");
379  printfQuda(" %d %d %d %d\n",
380  dimPartitioned(0),
381  dimPartitioned(1),
382  dimPartitioned(2),
383  dimPartitioned(3));
384 #endif
385 
386  return ;
387 
388 }
389 
390 void
391 usage_extra(char** argv )
392 {
393  printfQuda("Extra options:\n");
394  printfQuda(" --test <0/1> # Test method\n");
395  printfQuda(" 0: standard method\n");
396  printfQuda(" 1: extended volume method\n");
397  printfQuda(" --verify # Verify the GPU results using CPU results\n");
398  printfQuda(" --gauge-order <qdp/milc> # ordering of the input gauge-field\n");
399  return ;
400 }
401 
402 int
403 main(int argc, char **argv)
404 {
405 
406  int test = 0;
407 
408  //default to 18 reconstruct, 8^3 x 8
410  xdim=ydim=zdim=tdim=8;
412 
413  int i;
414  for (i =1;i < argc; i++){
415 
416  if(process_command_line_option(argc, argv, &i) == 0){
417  continue;
418  }
419 
420  if( strcmp(argv[i], "--gauge-order") == 0){
421  if(i+1 >= argc){
422  usage(argv);
423  }
424 
425  if(strcmp(argv[i+1], "milc") == 0){
427  }else if(strcmp(argv[i+1], "qdp") == 0){
429  }else{
430  fprintf(stderr, "Error: unsupported gauge-field order\n");
431  exit(1);
432  }
433  i++;
434  continue;
435  }
436 
437 
438  if( strcmp(argv[i], "--verify") == 0){
439  verify_results=1;
440  continue;
441  }
442 
443  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
444  usage(argv);
445  }
446 
447  test = test_type;
448 
449 #ifdef MULTI_GPU
450  if(gauge_order == QUDA_MILC_GAUGE_ORDER && test == 0){
451  errorQuda("ERROR: milc format for multi-gpu with test0 is not supported yet!\n");
452  }
453 #endif
454 
455  initComms(argc, argv, gridsize_from_cmdline);
456 
457  display_test_info(test);
458 
459  int accuracy_level = llfat_test(test);
460 
461  printfQuda("accuracy_level=%d\n", accuracy_level);
462 
463  finalizeComms();
464 
465  int ret;
466  if(accuracy_level >=3 ){
467  ret = 0;
468  }else{
469  ret = 1; //we delclare the test failed
470  }
471 
472  return ret;
473 }
474 
475