QUDA  v1.1.0
A library for QCD on GPUs
hisq_paths_force_test.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cstring>
4 
5 #include <quda.h>
6 #include "host_utils.h"
7 #include <command_line_params.h>
8 #include "gauge_field.h"
9 #include "misc.h"
10 #include "hisq_force_reference.h"
11 #include "ks_improved_force.h"
12 #include "momentum.h"
13 #include <dslash_quda.h>
14 #include <sys/time.h>
15 
16 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
17 
18 using namespace quda;
19 
22 
25 
29 
30 static QudaGaugeParam qudaGaugeParam;
31 static QudaGaugeParam qudaGaugeParam_ex;
32 static void* hw; // the array of half_wilson_vector
33 
35 
40 
41 int ODD_BIT = 1;
42 
47 
56 
59 
60 static void setPrecision(QudaPrecision precision)
61 {
62  link_prec = precision;
63  hw_prec = precision;
64  cpu_hw_prec = precision;
65  mom_prec = precision;
66  return;
67 }
68 
70 {
71  //total IO counting for the middle/side/all link kernels
72  //Explanation about these numbers can be founed in the corresnponding kernel functions in
73  //the hisq kernel core file
74  int linksize = prec*recon;
75  int cmsize = prec*18;
76 
77  int matrix_mul_flops = 198;
78  int matrix_add_flops = 18;
79 
80  int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
81  int middle_link_data_io[6][2] = {
82  {3,6},
83  {3,4},
84  {3,7},
85  {3,5},
86  {3,5},
87  {3,2}
88  };
89  int middle_link_data_flops[6][2] = {
90  {3,1},
91  {2,0},
92  {4,1},
93  {3,0},
94  {4,1},
95  {2,0}
96  };
97 
98  int num_calls_side_link[2]= {192, 48};
99  int side_link_data_io[2][2] = {
100  {1, 6},
101  {0, 3}
102  };
103  int side_link_data_flops[2][2] = {
104  {2, 2},
105  {0, 1}
106  };
107 
108  int num_calls_all_link[2] ={192, 192};
109  int all_link_data_io[2][2] = {
110  {3, 8},
111  {3, 6}
112  };
113  int all_link_data_flops[2][2] = {
114  {6, 3},
115  {4, 2}
116  };
117 
118  double total_io = 0;
119  for(int i = 0;i < 6; i++){
120  total_io += num_calls_middle_link[i]
121  *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
122  }
123 
124  for(int i = 0;i < 2; i++){
125  total_io += num_calls_side_link[i]
126  *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
127  }
128  for(int i = 0;i < 2; i++){
129  total_io += num_calls_all_link[i]
130  *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
131  }
132  total_io *= V;
133 
134 
135  double total_flops = 0;
136  for(int i = 0;i < 6; i++){
137  total_flops += num_calls_middle_link[i]
138  *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
139  }
140 
141  for(int i = 0;i < 2; i++){
142  total_flops += num_calls_side_link[i]
143  *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
144  }
145  for(int i = 0;i < 2; i++){
146  total_flops += num_calls_all_link[i]
147  *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
148  }
149  total_flops *= V;
150 
151  *io=total_io;
152  *flops = total_flops;
153 
154  printfQuda("flop/byte =%.1f\n", total_flops/total_io);
155  return ;
156 }
157 
158 #ifdef MULTI_GPU
159 static int R[4] = {2, 2, 2, 2};
160 #else
161 static int R[4] = {0, 0, 0, 0};
162 #endif
163 
164 // allocate memory
165 // set the layout, etc.
166 static void hisq_force_init()
167 {
169 
170  qudaGaugeParam.X[0] = xdim;
171  qudaGaugeParam.X[1] = ydim;
172  qudaGaugeParam.X[2] = zdim;
173  qudaGaugeParam.X[3] = tdim;
174 
175  setDims(qudaGaugeParam.X);
176 
177  qudaGaugeParam.cpu_prec = link_prec;
178  qudaGaugeParam.cuda_prec = link_prec;
179  qudaGaugeParam.reconstruct = link_recon;
180  qudaGaugeParam.anisotropy = 1.0;
181 
182  memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
183 
184  gParam = GaugeFieldParam(0, qudaGaugeParam);
187 
190 
191  gParam_ex = GaugeFieldParam(0, qudaGaugeParam_ex);
196  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
198 
200  createSiteLinkCPU((void**)cpuGauge->Gauge_p(), qudaGaugeParam.cpu_prec, 1);
201  } else {
202  errorQuda("Unsupported gauge order %d", gauge_order);
203  }
204 
206 
210  gParam_ex.pad = 0;
212  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
214  qudaGaugeParam.site_ga_pad = gParam_ex.pad;
215  //record gauge pad size
216 
217  gParam_ex.pad = 0;
221  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
223 
224 
227  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
229 
230  // create the momentum matrix
231  gParam.pad = 0;
237  cpuMom = new cpuGaugeField(gParam);
238  refMom = new cpuGaugeField(gParam);
239 
240  //createMomCPU(cpuMom->Gauge_p(), mom_prec);
241 
242  hw = malloc(4 * cpuGauge->Volume() * hw_site_size * qudaGaugeParam.cpu_prec);
243  if (hw == NULL){
244  fprintf(stderr, "ERROR: malloc failed for hw\n");
245  exit(1);
246  }
247 
248  createHwCPU(hw, hw_prec);
249 
253  gParam.pad = 0;
258 
262  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
265 
267 
269 
271  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
273 
274 }
275 
276 static void hisq_force_end()
277 {
278  delete cudaMom;
279  delete cudaGauge;
280  delete cudaForce_ex;
281  delete cudaGauge_ex;
282  //delete cudaOprod_ex; // already deleted
283  delete cudaLongLinkOprod_ex;
284 
285  delete cpuGauge;
286  delete cpuMom;
287  delete refMom;
288  delete cpuOprod;
289  delete cpuLongLinkOprod;
290 
291  delete cpuGauge_ex;
292  delete cpuForce_ex;
293  delete cpuOprod_ex;
294  delete cpuLongLinkOprod_ex;
295 
296  free(hw);
297 
298  endQuda();
299 }
300 
301 static int hisq_force_test(void)
302 {
304 
305  hisq_force_init();
306 
307  //float weight = 1.0;
308  float act_path_coeff[6];
309 
310  act_path_coeff[0] = 0.625000;
311  act_path_coeff[1] = -0.058479;
312  act_path_coeff[2] = -0.087719;
313  act_path_coeff[3] = 0.030778;
314  act_path_coeff[4] = -0.007200;
315  act_path_coeff[5] = -0.123113;
316 
317  //double d_weight = 1.0;
318  double d_act_path_coeff[6];
319  for(int i=0; i<6; ++i){
320  d_act_path_coeff[i] = act_path_coeff[i];
321  }
322 
326 
330 
332 
333  struct timeval ht0, ht1;
334  gettimeofday(&ht0, NULL);
335  if (verify_results){
336  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
337  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
338  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce_ex, *cpuGauge_ex, refMom);
339  }
340  gettimeofday(&ht1, NULL);
341 
342  struct timeval t0, t1, t2, t3;
343 
344  gettimeofday(&t0, NULL);
345 
348  gettimeofday(&t1, NULL);
349 
350  delete cudaOprod_ex; //doing this to lower the peak memory usage
352  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
358 
359  gettimeofday(&t2, NULL);
360 
361  gParam.create = QUDA_ZERO_FIELD_CREATE; // initialize to zero
364  gParam.pad = 0; //X1*X2*X3/2;
367 
368  //record the mom pad
369  qudaGaugeParam.mom_ga_pad = gParam.pad;
370 
372  updateMomentum(*cudaMom, 1.0, *cudaForce_ex, __func__);
373 
375  gettimeofday(&t3, NULL);
376 
378 
379  int accuracy_level = 3;
380  if(verify_results){
381  int res;
382  res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume() * mom_site_size, 1e-5,
383  qudaGaugeParam.cpu_prec);
384 
385  accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), qudaGaugeParam.cpu_prec);
386  printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
387  }
388  double total_io;
389  double total_flops;
390  total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
391 
392  float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
393  float perf = total_io / (TDIFF(t0, t1)) *1e-9;
394  printfQuda("Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
395  printfQuda("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t1,t2)*1000, TDIFF(t2,t3)*1000);
396  printfQuda("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
397 
398  hisq_force_end();
399 
400  return accuracy_level;
401 }
402 
403 
404 static void display_test_info()
405 {
406  printfQuda("running the following fermion force computation test:\n");
407 
408  printfQuda("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
409  printfQuda("%s %s %d/%d/%d %d %s\n",
412  xdim, ydim, zdim, tdim,
414  return ;
415 
416 }
417 
418 int main(int argc, char **argv)
419 {
420  auto app = make_app();
421  // app->get_formatter()->column_width(40);
422  // add_eigen_option_group(app);
423  // add_deflation_option_group(app);
424  // add_multigrid_option_group(app);
425 
426  CLI::TransformPairs<QudaGaugeFieldOrder> gauge_order_map {{"milc", QUDA_MILC_GAUGE_ORDER},
427  {"qdp", QUDA_QDP_GAUGE_ORDER}};
428  app->add_option("--gauge-order", gauge_order, "")->transform(CLI::QUDACheckedTransformer(gauge_order_map));
429 
430  try {
431  app->parse(argc, argv);
432  } catch (const CLI::ParseError &e) {
433  return app->exit(e);
434  }
435 
437  errorQuda("Multi-gpu for milc order is not supported\n");
438  }
439 
440  initComms(argc, argv, gridsize_from_cmdline);
441 
443 
445 
446  int accuracy_level = hisq_force_test();
447 
448  finalizeComms();
449 
450  if(accuracy_level >=3 ){
451  return EXIT_SUCCESS;
452  }else{
453  return -1;
454  }
455 }
void display_test_info()
size_t Volume() const
const int * R() const
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
int comm_dim_partitioned(int dim)
std::shared_ptr< QUDAApp > make_app(std::string app_description, std::string app_name)
QudaReconstructType link_recon
int device_ordinal
int & ydim
bool verify_results
int & zdim
QudaPrecision prec
int & tdim
int & xdim
std::array< int, 4 > gridsize_from_cmdline
int V
Definition: host_utils.cpp:37
void setDims(int *)
Definition: host_utils.cpp:315
enum QudaPrecision_s QudaPrecision
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_VERBOSE
Definition: enum_quda.h:267
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
@ QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:70
@ QUDA_RECONSTRUCT_10
Definition: enum_quda.h:75
@ QUDA_GHOST_EXCHANGE_EXTENDED
Definition: enum_quda.h:510
@ QUDA_GHOST_EXCHANGE_NO
Definition: enum_quda.h:508
enum QudaReconstructType_s QudaReconstructType
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:40
@ QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:44
@ QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:47
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
@ QUDA_ASQTAD_MOM_LINKS
Definition: enum_quda.h:33
@ QUDA_GENERAL_LINKS
Definition: enum_quda.h:25
void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
cudaGaugeField * cudaLongLinkOprod
cpuGaugeField * cpuForce
QudaGaugeFieldOrder gauge_order
cpuGaugeField * cpuLongLinkOprod_ex
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)
QudaPrecision link_prec
QudaPrecision hw_prec
int main(int argc, char **argv)
cpuGaugeField * cpuOprod
QudaPrecision mom_prec
cudaGaugeField * cudaMom
cpuGaugeField * cpuMom
cudaGaugeField * cudaLongLinkOprod_ex
cpuGaugeField * cpuForce_ex
cudaGaugeField * cudaOprod
GaugeFieldParam gParam
cpuGaugeField * refMom
cpuGaugeField * cpuOprod_ex
cpuGaugeField * cpuGauge
#define TDIFF(a, b)
cudaGaugeField * cudaOprod_ex
GaugeFieldParam gParam_ex
cudaGaugeField * cudaGauge_ex
cpuGaugeField * cpuLongLinkOprod
QudaPrecision cpu_hw_prec
cudaGaugeField * cudaForce
cudaGaugeField * cudaGauge
cpuGaugeField * cpuGauge_ex
cudaGaugeField * cudaForce_ex
void setPrecision(QudaPrecision precision)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: host_utils.cpp:889
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
Definition: host_utils.cpp:255
void finalizeComms()
Definition: host_utils.cpp:292
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
void createHwCPU(void *hw, QudaPrecision precision)
#define mom_site_size
Definition: host_utils.h:12
#define hw_site_size
Definition: host_utils.h:13
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:26
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:54
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:68
unsigned long long flops
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
Main header file for the QUDA library.
void initQuda(int device)
void endQuda(void)
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
double anisotropy
Definition: quda.h:37
QudaReconstructType reconstruct
Definition: quda.h:49
int mom_ga_pad
Definition: quda.h:71
QudaPrecision cuda_prec
Definition: quda.h:48
int X[4]
Definition: quda.h:35
QudaPrecision cpu_prec
Definition: quda.h:46
int site_ga_pad
Definition: quda.h:67
QudaReconstructType reconstruct
Definition: gauge_field.h:50
QudaGaugeFieldOrder order
Definition: gauge_field.h:51
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
Definition: gauge_field.h:173
QudaLinkType link_type
Definition: gauge_field.h:53
QudaFieldCreate create
Definition: gauge_field.h:60
int r[QUDA_MAX_DIM]
Definition: lattice_field.h:80
QudaGhostExchange ghostExchange
Definition: lattice_field.h:77
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
#define printfQuda(...)
Definition: util_quda.h:114
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
#define errorQuda(...)
Definition: util_quda.h:120