QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_test.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cstring>
4 
5 #include <quda.h>
6 #include "test_util.h"
7 #include "gauge_field.h"
8 #include "fat_force_quda.h"
9 #include "misc.h"
10 #include "hisq_force_reference.h"
11 #include "hisq_force_quda.h"
12 #include "hw_quda.h"
13 #include <fat_force_quda.h>
14 #include <face_quda.h>
15 #include <dslash_quda.h>
16 #include <sys/time.h>
17 
18 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
19 
21 using namespace quda;
22 
23 extern void usage(char** argv);
24 extern int device;
27 
30 
34 
35 static QudaGaugeParam qudaGaugeParam;
36 static QudaGaugeParam qudaGaugeParam_ex;
37 static void* hw; // the array of half_wilson_vector
38 
40 
45 
47 int ODD_BIT = 1;
48 extern int xdim, ydim, zdim, tdim;
49 extern int gridsize_from_cmdline[];
50 
51 extern bool tune;
52 extern QudaPrecision prec;
58 
60 void* siteLink_2d[4];
61 void* siteLink_ex_2d[4];
62 
71 #ifdef MULTI_GPU
72 GaugeFieldParam gParam_ex;
73 #endif
74 
76 
77 static void setPrecision(QudaPrecision precision)
78 {
79  link_prec = precision;
80  hw_prec = precision;
81  cpu_hw_prec = precision;
82  mom_prec = precision;
83  return;
84 }
85 
86 
87 void
88 total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double* io, double* flops)
89 {
90  //total IO counting for the middle/side/all link kernels
91  //Explanation about these numbers can be founed in the corresnponding kernel functions in
92  //the hisq kernel core file
93  int linksize = prec*recon;
94  int cmsize = prec*18;
95 
96  int matrix_mul_flops = 198;
97  int matrix_add_flops = 18;
98 
99  int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
100  int middle_link_data_io[6][2] = {
101  {3,6},
102  {3,4},
103  {3,7},
104  {3,5},
105  {3,5},
106  {3,2}
107  };
108  int middle_link_data_flops[6][2] = {
109  {3,1},
110  {2,0},
111  {4,1},
112  {3,0},
113  {4,1},
114  {2,0}
115  };
116 
117 
118  int num_calls_side_link[2]= {192, 48};
119  int side_link_data_io[2][2] = {
120  {1, 6},
121  {0, 3}
122  };
123  int side_link_data_flops[2][2] = {
124  {2, 2},
125  {0, 1}
126  };
127 
128 
129 
130  int num_calls_all_link[2] ={192, 192};
131  int all_link_data_io[2][2] = {
132  {3, 8},
133  {3, 6}
134  };
135  int all_link_data_flops[2][2] = {
136  {6, 3},
137  {4, 2}
138  };
139 
140 
141  double total_io = 0;
142  for(int i = 0;i < 6; i++){
143  total_io += num_calls_middle_link[i]
144  *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
145  }
146 
147  for(int i = 0;i < 2; i++){
148  total_io += num_calls_side_link[i]
149  *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
150  }
151  for(int i = 0;i < 2; i++){
152  total_io += num_calls_all_link[i]
153  *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
154  }
155  total_io *= V;
156 
157 
158  double total_flops = 0;
159  for(int i = 0;i < 6; i++){
160  total_flops += num_calls_middle_link[i]
161  *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
162  }
163 
164  for(int i = 0;i < 2; i++){
165  total_flops += num_calls_side_link[i]
166  *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
167  }
168  for(int i = 0;i < 2; i++){
169  total_flops += num_calls_all_link[i]
170  *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
171  }
172  total_flops *= V;
173 
174  *io=total_io;
175  *flops = total_flops;
176 
177  printfQuda("flop/byte =%.1f\n", total_flops/total_io);
178  return ;
179 }
180 
181 // allocate memory
182 // set the layout, etc.
183 static void
184 hisq_force_init()
185 {
186  initQuda(device);
187 
188  qudaGaugeParam.X[0] = xdim;
189  qudaGaugeParam.X[1] = ydim;
190  qudaGaugeParam.X[2] = zdim;
191  qudaGaugeParam.X[3] = tdim;
192 
193  setDims(qudaGaugeParam.X);
194 
195 
196  qudaGaugeParam.cpu_prec = link_prec;
197  qudaGaugeParam.cuda_prec = link_prec;
198  qudaGaugeParam.reconstruct = link_recon;
199 
200  // qudaGaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
201  qudaGaugeParam.gauge_order = gauge_order;
202  qudaGaugeParam.anisotropy = 1.0;
203 
204 
205  memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
206  qudaGaugeParam_ex.X[0] = qudaGaugeParam.X[0] + 4;
207  qudaGaugeParam_ex.X[1] = qudaGaugeParam.X[1] + 4;
208  qudaGaugeParam_ex.X[2] = qudaGaugeParam.X[2] + 4;
209  qudaGaugeParam_ex.X[3] = qudaGaugeParam.X[3] + 4;
210 
211 
212 
213  gParam = GaugeFieldParam(0, qudaGaugeParam);
217 
218 #ifdef MULTI_GPU
219  gParam_ex = GaugeFieldParam(0, qudaGaugeParam_ex);
220  gParam_ex.create = QUDA_NULL_FIELD_CREATE;
221  gParam_ex.link_type = QUDA_GENERAL_LINKS;
222  cpuGauge_ex = new cpuGaugeField(gParam_ex);
223 #endif
224 
225  int gSize = qudaGaugeParam.cpu_prec;
226  // this is a hack to get the gauge field to appear as a void** rather than void*
227  for(int i=0;i < 4;i++){
228 #ifdef GPU_DIRECT
229  if(cudaMallocHost(&siteLink_2d[i], V*gaugeSiteSize* qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
230  errorQuda("ERROR: cudaMallocHost failed for sitelink_2d\n");
231  }
232  if(cudaMallocHost((void**)&siteLink_ex_2d[i], V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
233  errorQuda("ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
234  }
235 #else
236  siteLink_2d[i] = malloc(V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
237  siteLink_ex_2d[i] = malloc(V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
238 #endif
239  if(siteLink_2d[i] == NULL || siteLink_ex_2d[i] == NULL){
240  errorQuda("malloc failed for siteLink_2d/siteLink_ex_2d\n");
241  }
242  memset(siteLink_2d[i], 0, V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
243  memset(siteLink_ex_2d[i], 0, V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
244  }
245  //siteLink_1d is only used in fermion reference computation
246  siteLink_1d = malloc(4*V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
247 
248 
249  // fills the gauge field with random numbers
250  createSiteLinkCPU(siteLink_2d, qudaGaugeParam.cpu_prec, 1);
251 
252  int X1 = Z[0];
253  int X2 = Z[1];
254  int X3 = Z[2];
255  int X4 = Z[3];
256  for(int i=0; i < V_ex; i++){
257  int sid = i;
258  int oddBit=0;
259  if(i >= Vh_ex){
260  sid = i - Vh_ex;
261  oddBit = 1;
262  }
263 
264  int za = sid/E1h;
265  int x1h = sid - za*E1h;
266  int zb = za/E2;
267  int x2 = za - zb*E2;
268  int x4 = zb/E3;
269  int x3 = zb - x4*E3;
270  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
271  int x1 = 2*x1h + x1odd;
272 
273 
274  if( x1< 2 || x1 >= X1 +2
275  || x2< 2 || x2 >= X2 +2
276  || x3< 2 || x3 >= X3 +2
277  || x4< 2 || x4 >= X4 +2){
278  continue;
279  }
280 
281 
282 
283  x1 = (x1 - 2 + X1) % X1;
284  x2 = (x2 - 2 + X2) % X2;
285  x3 = (x3 - 2 + X3) % X3;
286  x4 = (x4 - 2 + X4) % X4;
287 
288  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
289  if(oddBit){
290  idx += Vh;
291  }
292  for(int dir= 0; dir < 4; dir++){
293  char* src = (char*)siteLink_2d[dir];
294  char* dst = (char*)siteLink_ex_2d[dir];
295  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
296  }//dir
297 
298  }//i
299 
300 
301 
302 
303  for(int dir = 0; dir < 4; dir++){
304  for(int i = 0;i < V; i++){
305  char* src = (char*)siteLink_2d[dir];
306  char* dst = (char*)siteLink_1d;
307  memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize \
308  *link_prec);
309  }
310  }
311 
313  for(int dir = 0; dir < 4; dir++){
314  for(int i = 0;i < V; i++){
315  char* src = (char*)siteLink_2d[dir];
316  char* dst = (char*)cpuGauge->Gauge_p();
317  memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);
318  }
319  }
320  }else{
321  for(int dir=0;dir < 4; dir++){
322  char* src = (char*)siteLink_2d[dir];
323  char* dst = ((char**)cpuGauge->Gauge_p())[dir];
324  memcpy(dst, src, V*gaugeSiteSize*link_prec);
325  }
326  }
327 #ifdef MULTI_GPU
328  //for multi-gpu we have to use qdp format now
330  errorQuda("multi_gpu milc is not supported\n");
331  }
332  for(int dir=0;dir < 4; dir++){
333  char* src = (char*)siteLink_ex_2d[dir];
334  char* dst = ((char**)cpuGauge_ex->Gauge_p())[dir];
335  memcpy(dst, src, V_ex*gaugeSiteSize*link_prec);
336  }
337 
338 #endif
339 
340 
341 
342 #ifdef MULTI_GPU
343  gParam_ex.precision = prec;
344  gParam_ex.reconstruct = link_recon;
345  //gParam_ex.pad = E1*E2*E3/2;
346  gParam_ex.pad = 0;
347  cudaGauge_ex = new cudaGaugeField(gParam_ex);
348  qudaGaugeParam.site_ga_pad = gParam_ex.pad;
349  //record gauge pad size
350 
351 #else
352  gParam.precision = qudaGaugeParam.cuda_prec;
354  gParam.pad = X1*X2*X3/2;
356  //record gauge pad size
357  qudaGaugeParam.site_ga_pad = gParam.pad;
358 
359 #endif
360 
361 #ifdef MULTI_GPU
362  gParam_ex.pad = 0;
363  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
364  gParam_ex.create = QUDA_ZERO_FIELD_CREATE;
365  cpuForce_ex = new cpuGaugeField(gParam_ex);
366 
367  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
368  cudaForce_ex = new cudaGaugeField(gParam_ex);
369 #else
370  gParam.pad = 0;
373  cpuForce = new cpuGaugeField(gParam);
374 
377 #endif
378 
379  // create the momentum matrix
380  gParam.pad = 0;
385  cpuMom = new cpuGaugeField(gParam);
386  refMom = new cpuGaugeField(gParam);
387 
388  //createMomCPU(cpuMom->Gauge_p(), mom_prec);
389 
390  hw = malloc(4*cpuGauge->Volume()*hwSiteSize*qudaGaugeParam.cpu_prec);
391  if (hw == NULL){
392  fprintf(stderr, "ERROR: malloc failed for hw\n");
393  exit(1);
394  }
395 
396  createHwCPU(hw, hw_prec);
397 
398 
402  gParam.pad = 0;
406  computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order);
407 
408 #ifdef MULTI_GPU
409  gParam_ex.link_type = QUDA_GENERAL_LINKS;
410  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
411  gParam_ex.order = gauge_order;
412  cpuOprod_ex = new cpuGaugeField(gParam_ex);
413  //computeLinkOrderedOuterProduct(hw, cpuOprod_ex->Gauge_p(), hw_prec, 1, gauge_order);
414 
415  cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
416  //computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod_ex->Gauge_p(), hw_prec, 3, gauge_order);
417 
418  for(int i=0; i < V_ex; i++){
419  int sid = i;
420  int oddBit=0;
421  if(i >= Vh_ex){
422  sid = i - Vh_ex;
423  oddBit = 1;
424  }
425 
426  int za = sid/E1h;
427  int x1h = sid - za*E1h;
428  int zb = za/E2;
429  int x2 = za - zb*E2;
430  int x4 = zb/E3;
431  int x3 = zb - x4*E3;
432  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
433  int x1 = 2*x1h + x1odd;
434 
435 
436  if( x1< 2 || x1 >= X1 +2
437  || x2< 2 || x2 >= X2 +2
438  || x3< 2 || x3 >= X3 +2
439  || x4< 2 || x4 >= X4 +2){
440  continue;
441  }
442 
443 
444 
445  x1 = (x1 - 2 + X1) % X1;
446  x2 = (x2 - 2 + X2) % X2;
447  x3 = (x3 - 2 + X3) % X3;
448  x4 = (x4 - 2 + X4) % X4;
449 
450  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
451  if(oddBit){
452  idx += Vh;
453  }
454  for(int dir= 0; dir < 4; dir++){
455  char* src = ((char**)cpuOprod->Gauge_p())[dir];
456  char* dst = ((char**)cpuOprod_ex->Gauge_p())[dir];
457  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
458 
459  src = ((char**)cpuLongLinkOprod->Gauge_p())[dir];
460  dst = ((char**)cpuLongLinkOprod_ex->Gauge_p())[dir];
461  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
462 
463  }//dir
464  }//i
465 
466 
467 
468  cudaOprod_ex = new cudaGaugeField(gParam_ex);
469 #else
470 
473 
474 #endif
475 
476  return;
477 }
478 
479 
480 static void
481 hisq_force_end()
482 {
483  for(int i = 0;i < 4; i++){
484 #ifdef GPU_DIRECT
485  cudaFreeHost(siteLink_2d[i]);
486  cudaFreeHost(siteLink_ex_2d[i]);
487 #else
488  free(siteLink_2d[i]);
489  free(siteLink_ex_2d[i]);
490 #endif
491  }
492  free(siteLink_1d);
493 
494  delete cudaMom;
495  delete cudaGauge;
496 #ifdef MULTI_GPU
497  delete cudaForce_ex;
498  delete cudaGauge_ex;
499  //delete cudaOprod_ex; // already deleted
500  delete cudaLongLinkOprod_ex;
501 #else
502  delete cudaForce;
503  delete cudaOprod;
504  delete cudaLongLinkOprod;
505 #endif
506 
507  delete cpuGauge;
508  delete cpuMom;
509  delete refMom;
510  delete cpuOprod;
511  delete cpuLongLinkOprod;
512 
513 #ifdef MULTI_GPU
514  delete cpuGauge_ex;
515  delete cpuForce_ex;
516  delete cpuOprod_ex;
517  delete cpuLongLinkOprod_ex;
518 #else
519  delete cpuForce;
520 #endif
521 
522  free(hw);
523 
524  endQuda();
525 
526  return;
527 }
528 
529 static int
530 hisq_force_test(void)
531 {
533 
534  hisq_force_init();
535 
537  fermion_force::hisqForceInitCuda(&qudaGaugeParam);
538 
539 
540 
541  //float weight = 1.0;
542  float act_path_coeff[6];
543 
544  act_path_coeff[0] = 0.625000;
545  act_path_coeff[1] = -0.058479;
546  act_path_coeff[2] = -0.087719;
547  act_path_coeff[3] = 0.030778;
548  act_path_coeff[4] = -0.007200;
549  act_path_coeff[5] = -0.123113;
550 
551 
552  //double d_weight = 1.0;
553  double d_act_path_coeff[6];
554  for(int i=0; i<6; ++i){
555  d_act_path_coeff[i] = act_path_coeff[i];
556  }
557 
558 
559 
560 
561 #ifdef MULTI_GPU
562  int optflag = 0;
563  int R[4] = {2, 2, 2, 2};
564  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuGauge_ex->Gauge_p(), cpuGauge->Order(), qudaGaugeParam.cpu_prec, optflag);
566 #else
567  loadLinkToGPU(cudaGauge, cpuGauge, &qudaGaugeParam);
568 #endif
569 
570 
571 
572 
573 #ifdef MULTI_GPU
574  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuOprod_ex->Gauge_p(), cpuOprod_ex->Order(), qudaGaugeParam.cpu_prec, optflag);
576 #else
577  loadLinkToGPU(cudaOprod, cpuOprod, &qudaGaugeParam);
578 #endif
579 
580 
581 
582 
583 #ifdef MULTI_GPU
584  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuLongLinkOprod_ex->Gauge_p(), cpuLongLinkOprod_ex->Order(), qudaGaugeParam.cpu_prec, optflag);
585 #endif
586 
587 
588  struct timeval ht0, ht1;
589  gettimeofday(&ht0, NULL);
590  if (verify_results){
591  /*
592  if(cpu_hw_prec == QUDA_SINGLE_PRECISION){
593  const float eps = 0.5;
594  fermion_force_reference(eps, weight, 0, act_path_coeff, hw, siteLink_1d, refMom->Gauge_p());
595  }else if(cpu_hw_prec == QUDA_DOUBLE_PRECISION){
596  const double eps = 0.5;
597  fermion_force_reference(eps, d_weight, 0, d_act_path_coeff, hw, siteLink_1d, refMom->Gauge_p());
598  }
599  */
600 
601 #ifdef MULTI_GPU
602  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
603  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
604  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce_ex, *cpuGauge_ex, refMom);
605 #else
606  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod, *cpuGauge, cpuForce);
607  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod, *cpuGauge, cpuForce);
608  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce, *cpuGauge, refMom);
609 #endif
610 
611  }
612 
613 
614 
615  gettimeofday(&ht1, NULL);
616 
617  struct timeval t0, t1, t2, t3;
618 
619  gettimeofday(&t0, NULL);
620 
621 #ifdef MULTI_GPU
622  fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod_ex, *cudaGauge_ex, cudaForce_ex);
623  cudaDeviceSynchronize();
624  gettimeofday(&t1, NULL);
625 
626  delete cudaOprod_ex; //doing this to lower the peak memory usage
627  cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
629  fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod_ex, *cudaGauge_ex, cudaForce_ex);
630  cudaDeviceSynchronize();
631 
632  gettimeofday(&t2, NULL);
633 
634 #else
635  fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod, *cudaGauge, cudaForce);
636  cudaDeviceSynchronize();
637  gettimeofday(&t1, NULL);
638 
639  checkCudaError();
640  loadLinkToGPU(cudaLongLinkOprod, cpuLongLinkOprod, &qudaGaugeParam);
641 
642  fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod, *cudaGauge, cudaForce);
643  cudaDeviceSynchronize();
644  gettimeofday(&t2, NULL);
645 
646 #endif
647 
651  gParam.pad = 0; //X1*X2*X3/2;
652  cudaMom = new cudaGaugeField(gParam); // Are the elements initialised to zero? - No!
653 
654  //record the mom pad
655  qudaGaugeParam.mom_ga_pad = gParam.pad;
656 
657 #ifdef MULTI_GPU
659 #else
661 #endif
662 
663 
664 
665  cudaDeviceSynchronize();
666 
667  gettimeofday(&t3, NULL);
668 
669  checkCudaError();
670 
672 
673  int res;
674  res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, qudaGaugeParam.cpu_prec);
675 
676  int accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), qudaGaugeParam.cpu_prec);
677  printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
678 
679  double total_io;
680  double total_flops;
681  total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
682 
683  float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
684  float perf = total_io / (TDIFF(t0, t1)) *1e-9;
685  printfQuda("Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
686  printfQuda("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t1,t2)*1000, TDIFF(t2,t3)*1000);
687  printfQuda("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
688 
689  hisq_force_end();
690 
691  return accuracy_level;
692 }
693 
694 
695 static void
697 {
698  printfQuda("running the following fermion force computation test:\n");
699 
700  printfQuda("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
701  printfQuda("%s %s %d/%d/%d %d %s\n",
704  xdim, ydim, zdim, tdim,
706  return ;
707 
708 }
709 
710 void
711 usage_extra(char** argv )
712 {
713  printfQuda("Extra options: \n");
714  printfQuda(" --no_verify # Do not verify the GPU results using CPU results\n");
715  return ;
716 }
717 int
718 main(int argc, char **argv)
719 {
720  int i;
721  for (i =1;i < argc; i++){
722 
723  if(process_command_line_option(argc, argv, &i) == 0){
724  continue;
725  }
726 
727  if( strcmp(argv[i], "--gauge-order") == 0){
728  if(i+1 >= argc){
729  usage(argv);
730  }
731 
732  if(strcmp(argv[i+1], "milc") == 0){
734  }else if(strcmp(argv[i+1], "qdp") == 0){
736  }else{
737  fprintf(stderr, "Error: unsupported gauge-field order\n");
738  exit(1);
739  }
740  i++;
741  continue;
742  }
743 
744  if( strcmp(argv[i], "--no_verify") == 0){
745  verify_results=0;
746  continue;
747  }
748  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
749  usage(argv);
750  }
751 
752 #ifdef MULTI_GPU
754  errorQuda("Multi-gpu for milc order is not supported\n");
755  }
756 
757  initComms(argc, argv, gridsize_from_cmdline);
758 #endif
759 
761 
763 
764  int accuracy_level = hisq_force_test();
765 
766  finalizeComms();
767 
768  if(accuracy_level >=3 ){
769  return EXIT_SUCCESS;
770  }else{
771  return -1;
772  }
773 
774 }
775 
776