QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_test.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cstring>
4 
5 #include <quda.h>
6 #include "test_util.h"
7 #include "gauge_field.h"
8 #include "fat_force_quda.h"
9 #include "misc.h"
10 #include "hisq_force_reference.h"
11 #include "ks_improved_force.h"
12 #include "hw_quda.h"
13 #include <fat_force_quda.h>
14 #include <face_quda.h>
15 #include <dslash_quda.h>
16 #include <sys/time.h>
17 
// Elapsed time in seconds between two struct timeval values: a is the start, b is the end.
18 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
19 
21 using namespace quda;
22 
// usage() and device are defined outside this file.
23 extern void usage(char** argv);
24 extern int device;
27 
30 
34 
// Gauge parameters for the regular lattice and for the extended lattice
// (hisq_force_init makes the extended one 4 sites larger in every dimension).
35 static QudaGaugeParam qudaGaugeParam;
36 static QudaGaugeParam qudaGaugeParam_ex;
37 static void* hw; // host buffer holding the array of half_wilson_vector
38 
40 
45 
// Test options defined outside this file; xdim..tdim become the lattice extents.
46 extern bool verify_results;
47 int ODD_BIT = 1;
48 extern int xdim, ydim, zdim, tdim;
49 extern int gridsize_from_cmdline[];
50 
51 extern bool tune;
52 extern QudaPrecision prec;
58 
// One host link buffer per direction (4 directions), for the regular and the extended lattice.
60 void* siteLink_2d[4];
61 void* siteLink_ex_2d[4];
62 
// Parameter block used to create extended-lattice gauge fields; only present in multi-GPU builds.
71 #ifdef MULTI_GPU
72 GaugeFieldParam gParam_ex;
73 #endif
74 
76 
77 static void setPrecision(QudaPrecision precision)
78 {
79  link_prec = precision;
80  hw_prec = precision;
81  cpu_hw_prec = precision;
82  mom_prec = precision;
83  return;
84 }
85 
86 
87  void
88 total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double* io, double* flops)
89 {
90  //total IO counting for the middle/side/all link kernels
91  //Explanation about these numbers can be founed in the corresnponding kernel functions in
92  //the hisq kernel core file
93  int linksize = prec*recon;
94  int cmsize = prec*18;
95 
96  int matrix_mul_flops = 198;
97  int matrix_add_flops = 18;
98 
99  int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
100  int middle_link_data_io[6][2] = {
101  {3,6},
102  {3,4},
103  {3,7},
104  {3,5},
105  {3,5},
106  {3,2}
107  };
108  int middle_link_data_flops[6][2] = {
109  {3,1},
110  {2,0},
111  {4,1},
112  {3,0},
113  {4,1},
114  {2,0}
115  };
116 
117 
118  int num_calls_side_link[2]= {192, 48};
119  int side_link_data_io[2][2] = {
120  {1, 6},
121  {0, 3}
122  };
123  int side_link_data_flops[2][2] = {
124  {2, 2},
125  {0, 1}
126  };
127 
128 
129 
130  int num_calls_all_link[2] ={192, 192};
131  int all_link_data_io[2][2] = {
132  {3, 8},
133  {3, 6}
134  };
135  int all_link_data_flops[2][2] = {
136  {6, 3},
137  {4, 2}
138  };
139 
140 
141  double total_io = 0;
142  for(int i = 0;i < 6; i++){
143  total_io += num_calls_middle_link[i]
144  *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
145  }
146 
147  for(int i = 0;i < 2; i++){
148  total_io += num_calls_side_link[i]
149  *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
150  }
151  for(int i = 0;i < 2; i++){
152  total_io += num_calls_all_link[i]
153  *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
154  }
155  total_io *= V;
156 
157 
158  double total_flops = 0;
159  for(int i = 0;i < 6; i++){
160  total_flops += num_calls_middle_link[i]
161  *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
162  }
163 
164  for(int i = 0;i < 2; i++){
165  total_flops += num_calls_side_link[i]
166  *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
167  }
168  for(int i = 0;i < 2; i++){
169  total_flops += num_calls_all_link[i]
170  *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
171  }
172  total_flops *= V;
173 
174  *io=total_io;
175  *flops = total_flops;
176 
177  printfQuda("flop/byte =%.1f\n", total_flops/total_io);
178  return ;
179 }
180 
181 // allocate memory
182 // set the layout, etc.
// Prepare all inputs for the HISQ force test: initialise QUDA, set the lattice
// extents, allocate and fill the host link buffers (regular and extended
// lattice), and create the cpu/cuda gauge-field objects the test uses.
// NOTE(review): this listing omits several original lines (the embedded line
// numbers jump), so some assignments, object creations and conditions that
// belong to this function are not visible here.
183  static void
184 hisq_force_init()
185 {
186  initQuda(device);
187 
  // Lattice extents come from the command-line globals.
188  qudaGaugeParam.X[0] = xdim;
189  qudaGaugeParam.X[1] = ydim;
190  qudaGaugeParam.X[2] = zdim;
191  qudaGaugeParam.X[3] = tdim;
192 
  // presumably fills the volume globals (V, Vh, Z[], ...) used below -- confirm in test_util
193  setDims(qudaGaugeParam.X);
194 
195 
196  qudaGaugeParam.cpu_prec = link_prec;
197  qudaGaugeParam.cuda_prec = link_prec;
198  qudaGaugeParam.reconstruct = link_recon;
199 
200  // qudaGaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
201  qudaGaugeParam.gauge_order = gauge_order;
202  qudaGaugeParam.anisotropy = 1.0;
203 
204 
  // The extended lattice is a copy of the regular parameters with every
  // dimension enlarged by 4 (the copy loop below treats this as a border of 2
  // sites on each side).
205  memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
206  qudaGaugeParam_ex.X[0] = qudaGaugeParam.X[0] + 4;
207  qudaGaugeParam_ex.X[1] = qudaGaugeParam.X[1] + 4;
208  qudaGaugeParam_ex.X[2] = qudaGaugeParam.X[2] + 4;
209  qudaGaugeParam_ex.X[3] = qudaGaugeParam.X[3] + 4;
210 
211 
212 
213  gParam = GaugeFieldParam(0, qudaGaugeParam);
217 
218 #ifdef MULTI_GPU
219  gParam_ex = GaugeFieldParam(0, qudaGaugeParam_ex);
220  gParam_ex.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
221  gParam_ex.create = QUDA_NULL_FIELD_CREATE;
222  gParam_ex.link_type = QUDA_GENERAL_LINKS;
223  gParam_ex.order = gauge_order;
224  cpuGauge_ex = new cpuGaugeField(gParam_ex);
225 #endif
226 
  // The precision value is used directly as the byte size of one real number.
227  int gSize = qudaGaugeParam.cpu_prec;
228  // this is a hack to get the gauge field to appear as a void** rather than void*
229  for(int i=0;i < 4;i++){
230 #ifdef GPU_DIRECT
  // Host buffers come from cudaMallocHost in this configuration; only the
  // cudaErrorMemoryAllocation result is treated as a failure here.
231  if(cudaMallocHost(&siteLink_2d[i], V*gaugeSiteSize* qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
232  errorQuda("ERROR: cudaMallocHost failed for sitelink_2d\n");
233  }
234  if(cudaMallocHost((void**)&siteLink_ex_2d[i], V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
235  errorQuda("ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
236  }
237 #else
238  siteLink_2d[i] = malloc(V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
239  siteLink_ex_2d[i] = malloc(V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
240 #endif
241  if(siteLink_2d[i] == NULL || siteLink_ex_2d[i] == NULL){
242  errorQuda("malloc failed for siteLink_2d/siteLink_ex_2d\n");
243  }
244  memset(siteLink_2d[i], 0, V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
245  memset(siteLink_ex_2d[i], 0, V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
246  }
247  //siteLink_1d is only used in fermion reference computation
  // NOTE(review): the result of this malloc is not checked for NULL.
248  siteLink_1d = malloc(4*V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
249 
250 
251  // fills the gauge field with random numbers
252  createSiteLinkCPU(siteLink_2d, qudaGaugeParam.cpu_prec, 1);
253 
254  int X1 = Z[0];
255  int X2 = Z[1];
256  int X3 = Z[2];
257  int X4 = Z[3];
  // Copy the regular links into the interior of the extended buffers.
  // Indices below Vh_ex are handled with oddBit = 0, the rest with oddBit = 1.
258  for(int i=0; i < V_ex; i++){
259  int sid = i;
260  int oddBit=0;
261  if(i >= Vh_ex){
262  sid = i - Vh_ex;
263  oddBit = 1;
264  }
265 
  // Decode the extended-lattice index into coordinates (x1, x2, x3, x4);
  // E1h, E2 and E3 are the divisors used for the extended lattice.
266  int za = sid/E1h;
267  int x1h = sid - za*E1h;
268  int zb = za/E2;
269  int x2 = za - zb*E2;
270  int x4 = zb/E3;
271  int x3 = zb - x4*E3;
272  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
273  int x1 = 2*x1h + x1odd;
274 
275 
  // Sites in the 2-wide border are skipped, so they keep the zeros written by memset above.
276  if( x1< 2 || x1 >= X1 +2
277  || x2< 2 || x2 >= X2 +2
278  || x3< 2 || x3 >= X3 +2
279  || x4< 2 || x4 >= X4 +2){
280  continue;
281  }
282 
283 
284 
  // Shift to regular-lattice coordinates.
285  x1 = (x1 - 2 + X1) % X1;
286  x2 = (x2 - 2 + X2) % X2;
287  x3 = (x3 - 2 + X3) % X3;
288  x4 = (x4 - 2 + X4) % X4;
289 
  // Regular-lattice index: half the lexicographic index, offset by Vh when oddBit is set.
290  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
291  if(oddBit){
292  idx += Vh;
293  }
294  for(int dir= 0; dir < 4; dir++){
295  char* src = (char*)siteLink_2d[dir];
296  char* dst = (char*)siteLink_ex_2d[dir];
297  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
298  }//dir
299 
300  }//i
301 
302 
303 
304 
  // Pack the per-direction buffers into siteLink_1d with the four directions
  // of each site stored next to each other (slot 4*i+dir).
305  for(int dir = 0; dir < 4; dir++){
306  for(int i = 0;i < V; i++){
307  char* src = (char*)siteLink_2d[dir];
308  char* dst = (char*)siteLink_1d;
309  memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize \
310  *link_prec);
311  }
312  }
313 
  // NOTE(review): the condition opening this branch is missing from the listing.
  // This branch fills cpuGauge as one buffer using the same 4*i+dir packing;
  // the else branch copies one buffer per direction.
315  for(int dir = 0; dir < 4; dir++){
316  for(int i = 0;i < V; i++){
317  char* src = (char*)siteLink_2d[dir];
318  char* dst = (char*)cpuGauge->Gauge_p();
319  memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);
320  }
321  }
322  }else{
323  for(int dir=0;dir < 4; dir++){
324  char* src = (char*)siteLink_2d[dir];
325  char* dst = ((char**)cpuGauge->Gauge_p())[dir];
326  memcpy(dst, src, V*gaugeSiteSize*link_prec);
327  }
328  }
329 #ifdef MULTI_GPU
330  //for multi-gpu we have to use qdp format now
  // NOTE(review): the condition guarding this error is missing from the listing.
332  errorQuda("multi_gpu milc is not supported\n");
333  }
334  for(int dir=0;dir < 4; dir++){
335  char* src = (char*)siteLink_ex_2d[dir];
336  char* dst = ((char**)cpuGauge_ex->Gauge_p())[dir];
337  memcpy(dst, src, V_ex*gaugeSiteSize*link_prec);
338  }
339 
340 #endif
341 
342 
343 
  // Device gauge field: built on the extended lattice for multi-GPU, otherwise
  // from gParam (the single-GPU creation lines are not visible in this listing).
344 #ifdef MULTI_GPU
345  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
346  gParam_ex.precision = prec;
347  gParam_ex.reconstruct = link_recon;
348  //gParam_ex.pad = E1*E2*E3/2;
349  gParam_ex.pad = 0;
350  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
351  cudaGauge_ex = new cudaGaugeField(gParam_ex);
352  qudaGaugeParam.site_ga_pad = gParam_ex.pad;
353  //record gauge pad size
354 
355 #else
357  gParam.precision = qudaGaugeParam.cuda_prec;
359  gParam.pad = X1*X2*X3/2;
362  //record gauge pad size
363  qudaGaugeParam.site_ga_pad = gParam.pad;
364 
365 #endif
366 
  // Force fields: host and device versions, created zero-filled in the multi-GPU case.
367 #ifdef MULTI_GPU
368  gParam_ex.pad = 0;
369  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
370  gParam_ex.create = QUDA_ZERO_FIELD_CREATE;
371  gParam_ex.order = gauge_order;
372  cpuForce_ex = new cpuGaugeField(gParam_ex);
373  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
374  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
375  cudaForce_ex = new cudaGaugeField(gParam_ex);
376 #else
377  gParam.pad = 0;
381  cpuForce = new cpuGaugeField(gParam);
386 #endif
387 
388  // create the momentum matrix
389  gParam.pad = 0;
394  cpuMom = new cpuGaugeField(gParam);
395  refMom = new cpuGaugeField(gParam);
396 
397  //createMomCPU(cpuMom->Gauge_p(), mom_prec);
398 
  // Host buffer for the half-Wilson vectors: 4 * volume * hwSiteSize numbers at cpu precision.
399  hw = malloc(4*cpuGauge->Volume()*hwSiteSize*qudaGaugeParam.cpu_prec);
400  if (hw == NULL){
401  fprintf(stderr, "ERROR: malloc failed for hw\n");
402  exit(1);
403  }
404 
405  createHwCPU(hw, hw_prec);
406 
407 
  // NOTE(review): the creation of cpuOprod / cpuLongLinkOprod is not visible in this listing.
411  gParam.pad = 0;
415  computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order);
416 
417 #ifdef MULTI_GPU
418  gParam_ex.link_type = QUDA_GENERAL_LINKS;
419  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
420  gParam_ex.order = gauge_order;
421  cpuOprod_ex = new cpuGaugeField(gParam_ex);
422  cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
423 
  // Same interior copy as for the links above, applied to both outer-product fields.
424  for(int i=0; i < V_ex; i++){
425  int sid = i;
426  int oddBit=0;
427  if(i >= Vh_ex){
428  sid = i - Vh_ex;
429  oddBit = 1;
430  }
431 
432  int za = sid/E1h;
433  int x1h = sid - za*E1h;
434  int zb = za/E2;
435  int x2 = za - zb*E2;
436  int x4 = zb/E3;
437  int x3 = zb - x4*E3;
438  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
439  int x1 = 2*x1h + x1odd;
440 
441 
442  if( x1< 2 || x1 >= X1 +2
443  || x2< 2 || x2 >= X2 +2
444  || x3< 2 || x3 >= X3 +2
445  || x4< 2 || x4 >= X4 +2){
446  continue;
447  }
448 
449 
450 
451  x1 = (x1 - 2 + X1) % X1;
452  x2 = (x2 - 2 + X2) % X2;
453  x3 = (x3 - 2 + X3) % X3;
454  x4 = (x4 - 2 + X4) % X4;
455 
456  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
457  if(oddBit){
458  idx += Vh;
459  }
460  for(int dir= 0; dir < 4; dir++){
461  char* src = ((char**)cpuOprod->Gauge_p())[dir];
462  char* dst = ((char**)cpuOprod_ex->Gauge_p())[dir];
463  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
464 
465  src = ((char**)cpuLongLinkOprod->Gauge_p())[dir];
466  dst = ((char**)cpuLongLinkOprod_ex->Gauge_p())[dir];
467  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
468 
469  }//dir
470  }//i
471 
472 
  // The order is switched only while creating the device field, then restored.
473  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
474  cudaOprod_ex = new cudaGaugeField(gParam_ex);
475  gParam_ex.order = gauge_order;
476 #else
477 
481 
482 #endif
483 
484  return;
485 }
486 
487 
488  static void
489 hisq_force_end()
490 {
491  for(int i = 0;i < 4; i++){
492 #ifdef GPU_DIRECT
493  cudaFreeHost(siteLink_2d[i]);
494  cudaFreeHost(siteLink_ex_2d[i]);
495 #else
496  free(siteLink_2d[i]);
497  free(siteLink_ex_2d[i]);
498 #endif
499  }
500  free(siteLink_1d);
501 
502  delete cudaMom;
503  delete cudaGauge;
504 #ifdef MULTI_GPU
505  delete cudaForce_ex;
506  delete cudaGauge_ex;
507  //delete cudaOprod_ex; // already deleted
508  delete cudaLongLinkOprod_ex;
509 #else
510  delete cudaForce;
511  delete cudaOprod;
512  delete cudaLongLinkOprod;
513 #endif
514 
515  delete cpuGauge;
516  delete cpuMom;
517  delete refMom;
518  delete cpuOprod;
519  delete cpuLongLinkOprod;
520 
521 #ifdef MULTI_GPU
522  delete cpuGauge_ex;
523  delete cpuForce_ex;
524  delete cpuOprod_ex;
525  delete cpuLongLinkOprod_ex;
526 #else
527  delete cpuForce;
528 #endif
529 
530  free(hw);
531 
532  endQuda();
533 
534  return;
535 }
536 
// Run the HISQ force test once: set up the inputs, optionally compute the CPU
// reference, run the GPU staples and long-link force, compare the momenta and
// print timing/performance figures.
// Returns the accuracy level from strong_check_mom, or 3 when verification is
// disabled (main treats a value >= 3 as success).
// NOTE(review): this listing omits many original lines (the embedded line
// numbers jump); the device field loads and the completion step between t2 and
// t3 are among the parts that are not visible.
537  static int
538 hisq_force_test(void)
539 {
540  tune = false;
543 
544  hisq_force_init();
545 
546  //float weight = 1.0;
547  float act_path_coeff[6];
548 
549  act_path_coeff[0] = 0.625000;
550  act_path_coeff[1] = -0.058479;
551  act_path_coeff[2] = -0.087719;
552  act_path_coeff[3] = 0.030778;
553  act_path_coeff[4] = -0.007200;
554  act_path_coeff[5] = -0.123113;
555 
556 
557  //double d_weight = 1.0;
  // Double-precision copy of the coefficients; this is what the force routines receive.
558  double d_act_path_coeff[6];
559  for(int i=0; i<6; ++i){
560  d_act_path_coeff[i] = act_path_coeff[i];
561  }
562 
563 
564 
565 
566 #ifdef MULTI_GPU
567  int R[4] = {2, 2, 2, 2};
570 #else
572 #endif
573 
574 
575 
576 
577 #ifdef MULTI_GPU
580 #else
582 #endif
583 
584 
585 
586 
587 #ifdef MULTI_GPU
589 #endif
590 
591 
592 
  // CPU reference, timed between ht0 and ht1; skipped when verify_results is false.
593  struct timeval ht0, ht1;
594  gettimeofday(&ht0, NULL);
595  if (verify_results){
596 #ifdef MULTI_GPU
597  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
598  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
599  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce_ex, *cpuGauge_ex, refMom);
600 #else
601  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod, *cpuGauge, cpuForce);
602  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod, *cpuGauge, cpuForce);
603  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce, *cpuGauge, refMom);
604 #endif
605 
606  }
607 
608 
609 
610  gettimeofday(&ht1, NULL);
611 
  // GPU timing points: t0..t1 staples, t1..t2 long link, t2..t3 the remaining work.
612  struct timeval t0, t1, t2, t3;
613 
614  gettimeofday(&t0, NULL);
615 
616 #ifdef MULTI_GPU
617  fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod_ex, *cudaGauge_ex, cudaForce_ex);
  // Wait for the device before reading the clock so the interval covers the GPU work.
618  cudaDeviceSynchronize();
619  gettimeofday(&t1, NULL);
620 
621  delete cudaOprod_ex; //doing this to lower the peak memory usage
622  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
623  cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
625  fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod_ex, *cudaGauge_ex, cudaForce_ex);
626  cudaDeviceSynchronize();
627 
628  gettimeofday(&t2, NULL);
629 
630 #else
631  fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod, *cudaGauge, cudaForce);
632  cudaDeviceSynchronize();
633  gettimeofday(&t1, NULL);
634 
635  checkCudaError();
637  fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod, *cudaGauge, cudaForce);
638  cudaDeviceSynchronize();
639  gettimeofday(&t2, NULL);
640 #endif
641 
645  gParam.pad = 0; //X1*X2*X3/2;
647  cudaMom = new cudaGaugeField(gParam); // Are the elements initialised to zero? - No!
648 
649  //record the mom pad
650  qudaGaugeParam.mom_ga_pad = gParam.pad;
651 
652 #ifdef MULTI_GPU
654 #else
656 #endif
657 
658 
659 
660  cudaDeviceSynchronize();
661 
662  gettimeofday(&t3, NULL);
663 
664  checkCudaError();
665 
667 
  // Stays at 3 when verification is skipped.
668  int accuracy_level = 3;
669  if(verify_results){
670  int res;
  // compare_floats (tolerance 1e-5) only decides the PASSED/FAILED message;
  // the function's return value comes from strong_check_mom.
671  res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, qudaGaugeParam.cpu_prec);
672 
673  accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), qudaGaugeParam.cpu_prec);
674  printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
675  }
676  double total_io;
677  double total_flops;
678  total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
679 
  // Rates for the staples phase only (t0..t1); the 1e-9 factor gives GFLOPS and GB/s.
680  float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
681  float perf = total_io / (TDIFF(t0, t1)) *1e-9;
682  printfQuda("Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
683  printfQuda("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t1,t2)*1000, TDIFF(t2,t3)*1000);
684  printfQuda("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
685 
686  hisq_force_end();
687 
688  return accuracy_level;
689 }
690 
691 
// Print a short description of the test configuration (a header row followed
// by one row of values).
// NOTE(review): the line carrying the function name is missing from this
// listing (presumably display_test_info -- confirm), as are the printfQuda
// arguments before and after the lattice dimensions.
692  static void
694 {
695  printfQuda("running the following fermion force computation test:\n");
696 
697  printfQuda("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
698  printfQuda("%s %s %d/%d/%d %d %s\n",
701  xdim, ydim, zdim, tdim,
703  return ;
704 
705 }
706 
// Program entry point: parse the command line, run the HISQ force test and
// turn its accuracy level into the process exit status.
// Returns EXIT_SUCCESS when the accuracy level is at least 3, otherwise -1.
// NOTE(review): this listing omits several original lines (the embedded line
// numbers jump), including the assignments made for "milc"/"qdp" and the
// condition guarding the multi-GPU error below.
707  int
708 main(int argc, char **argv)
709 {
710  int i;
711  for (i =1;i < argc; i++){
712 
  // Common test options are tried first; a return value of 0 means the option was consumed.
713  if(process_command_line_option(argc, argv, &i) == 0){
714  continue;
715  }
716 
  // Test-specific option: --gauge-order takes "milc" or "qdp".
717  if( strcmp(argv[i], "--gauge-order") == 0){
718  if(i+1 >= argc){
  // NOTE(review): argv[i+1] is read below, so this relies on usage() not returning -- confirm.
719  usage(argv);
720  }
721 
722  if(strcmp(argv[i+1], "milc") == 0){
724  }else if(strcmp(argv[i+1], "qdp") == 0){
726  }else{
727  fprintf(stderr, "Error: unsupported gauge-field order\n");
728  exit(1);
729  }
  // Step past the option's value.
730  i++;
731  continue;
732  }
733 
734  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
735  usage(argv);
736  }
737 
738 #ifdef MULTI_GPU
740  errorQuda("Multi-gpu for milc order is not supported\n");
741  }
742 
743  initComms(argc, argv, gridsize_from_cmdline);
744 #endif
745 
747 
749 
750  int accuracy_level = hisq_force_test();
751 
752 
753  finalizeComms();
754 
755  if(accuracy_level >=3 ){
756  return EXIT_SUCCESS;
757  }else{
758  return -1;
759  }
760 
761 }
762 
763 
int device
Definition: test_util.cpp:1546
QudaPrecision hw_prec
int main(int argc, char **argv)
double anisotropy
Definition: quda.h:31
__constant__ int Vh
cpuGaugeField * cpuOprod
void endQuda(void)
cpuGaugeField * cpuForce_ex
__constant__ int X2
enum QudaPrecision_s QudaPrecision
int V
Definition: test_util.cpp:29
__constant__ int Vh_ex
void display_test_info()
Definition: blas_test.cu:56
cudaGaugeField * cudaOprod_ex
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
cudaGaugeField * cudaForce_ex
int xdim
Definition: test_util.cpp:1553
void saveCPUField(cpuGaugeField &, const QudaFieldLocation &) const
#define errorQuda(...)
Definition: util_quda.h:73
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void createHwCPU(void *hw, QudaPrecision precision)
Definition: test_util.cpp:1429
void setDims(int *)
Definition: test_util.cpp:88
__constant__ int X1
cudaGaugeField * cudaMom
int V_ex
Definition: test_util.cpp:38
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1635
int gridsize_from_cmdline[]
Definition: test_util.cpp:1559
void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
QudaPrecision precision
Definition: lattice_field.h:41
#define gaugeSiteSize
void finalizeComms()
Definition: test_util.cpp:65
cpuGaugeField * cpuMom
void setPrecision(QudaPrecision precision)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:697
void * siteLink_ex_2d[4]
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
int ydim
Definition: test_util.cpp:1554
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:395
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:658
int tdim
Definition: test_util.cpp:1556
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1166
QudaPrecision cpu_hw_prec
void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
void * siteLink_2d[4]
void setTuning(QudaTune tune)
Definition: util_quda.cpp:33
cpuGaugeField * refMom
void initQuda(int device)
int Volume() const
int site_ga_pad
Definition: quda.h:55
#define TDIFF(a, b)
QudaPrecision link_prec
cudaGaugeField * cudaLongLinkOprod
QudaReconstructType link_recon
Definition: test_util.cpp:1549
#define momSiteSize
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:724
void hisqCompleteForceCuda(const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
bool verify_results
Definition: test_util.cpp:1568
cpuGaugeField * cpuOprod_ex
QudaGaugeFieldOrder order
Definition: gauge_field.h:15
void loadCPUField(const cpuGaugeField &, const QudaFieldLocation &)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
QudaReconstructType reconstruct
Definition: quda.h:43
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
Definition: test_util.cpp:1502
int zdim
Definition: test_util.cpp:1555
short x1h
Definition: llfat_core.h:815
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
#define hwSiteSize
Definition: hw_quda.cpp:8
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
cpuGaugeField * cpuLongLinkOprod_ex
cudaGaugeField * cudaGauge_ex
QudaPrecision prec
Definition: test_util.cpp:1551
void * memset(void *s, int c, size_t n)
int Z[4]
Definition: test_util.cpp:28
GaugeFieldParam gParam
bool tune
Definition: test_util.cpp:1562
cpuGaugeField * cpuLongLinkOprod
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
__constant__ int X3
QudaLinkType link_type
Definition: gauge_field.h:17
int mom_ga_pad
Definition: quda.h:59
#define printfQuda(...)
Definition: util_quda.h:67
QudaPrecision mom_prec
cpuGaugeField * cpuGauge_ex
short x1odd
Definition: llfat_core.h:821
cudaGaugeField * cudaGauge
QudaGaugeFieldOrder gauge_order
QudaReconstructType reconstruct
Definition: gauge_field.h:14
QudaFieldCreate create
Definition: gauge_field.h:26
void * siteLink_1d
cpuGaugeField * cpuGauge
#define checkCudaError()
Definition: util_quda.h:110
QudaGaugeFieldOrder order
Definition: gauge_field.h:131
cudaGaugeField * cudaForce
cudaGaugeField * cudaOprod
cudaGaugeField * cudaLongLinkOprod_ex
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:48
__constant__ int E1h
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
int oddBit
cpuGaugeField * cpuForce
__constant__ int E3
QudaPrecision cpu_prec
Definition: quda.h:40
__constant__ int E2
__constant__ int X4
void usage(char **argv)
Definition: test_util.cpp:1584
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)