QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hisq_paths_force_test.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cstring>
4 
5 #include <quda.h>
6 #include "test_util.h"
7 #include "gauge_field.h"
8 #include "misc.h"
9 #include "hisq_force_reference.h"
10 #include "ks_improved_force.h"
11 #include "momentum.h"
12 #include <dslash_quda.h>
13 #include <sys/time.h>
14 
15 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
16 
17 using namespace quda;
18 
19 extern void usage(char** argv);
20 extern int device;
23 
26 
30 
33 static void* hw; // the array of half_wilson_vector
34 
36 
41 
42 extern bool verify_results;
43 int ODD_BIT = 1;
44 extern int xdim, ydim, zdim, tdim;
45 extern int gridsize_from_cmdline[];
46 
47 extern QudaPrecision prec;
53 
62 
65 
66 static void setPrecision(QudaPrecision precision)
67 {
68  link_prec = precision;
69  hw_prec = precision;
70  cpu_hw_prec = precision;
71  mom_prec = precision;
72  return;
73 }
74 
76 {
77  //total IO counting for the middle/side/all link kernels
78  //Explanation about these numbers can be founed in the corresnponding kernel functions in
79  //the hisq kernel core file
80  int linksize = prec*recon;
81  int cmsize = prec*18;
82 
83  int matrix_mul_flops = 198;
84  int matrix_add_flops = 18;
85 
86  int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
87  int middle_link_data_io[6][2] = {
88  {3,6},
89  {3,4},
90  {3,7},
91  {3,5},
92  {3,5},
93  {3,2}
94  };
95  int middle_link_data_flops[6][2] = {
96  {3,1},
97  {2,0},
98  {4,1},
99  {3,0},
100  {4,1},
101  {2,0}
102  };
103 
104  int num_calls_side_link[2]= {192, 48};
105  int side_link_data_io[2][2] = {
106  {1, 6},
107  {0, 3}
108  };
109  int side_link_data_flops[2][2] = {
110  {2, 2},
111  {0, 1}
112  };
113 
114  int num_calls_all_link[2] ={192, 192};
115  int all_link_data_io[2][2] = {
116  {3, 8},
117  {3, 6}
118  };
119  int all_link_data_flops[2][2] = {
120  {6, 3},
121  {4, 2}
122  };
123 
124  double total_io = 0;
125  for(int i = 0;i < 6; i++){
126  total_io += num_calls_middle_link[i]
127  *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
128  }
129 
130  for(int i = 0;i < 2; i++){
131  total_io += num_calls_side_link[i]
132  *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
133  }
134  for(int i = 0;i < 2; i++){
135  total_io += num_calls_all_link[i]
136  *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
137  }
138  total_io *= V;
139 
140 
141  double total_flops = 0;
142  for(int i = 0;i < 6; i++){
143  total_flops += num_calls_middle_link[i]
144  *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
145  }
146 
147  for(int i = 0;i < 2; i++){
148  total_flops += num_calls_side_link[i]
149  *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
150  }
151  for(int i = 0;i < 2; i++){
152  total_flops += num_calls_all_link[i]
153  *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
154  }
155  total_flops *= V;
156 
157  *io=total_io;
158  *flops = total_flops;
159 
160  printfQuda("flop/byte =%.1f\n", total_flops/total_io);
161  return ;
162 }
163 
164 #ifdef MULTI_GPU
165 static int R[4] = {2, 2, 2, 2};
166 #else
167 static int R[4] = {0, 0, 0, 0};
168 #endif
169 
170 // allocate memory
171 // set the layout, etc.
172 static void hisq_force_init()
173 {
174  initQuda(device);
175 
176  qudaGaugeParam.X[0] = xdim;
177  qudaGaugeParam.X[1] = ydim;
178  qudaGaugeParam.X[2] = zdim;
179  qudaGaugeParam.X[3] = tdim;
180 
181  setDims(qudaGaugeParam.X);
182 
183  qudaGaugeParam.cpu_prec = link_prec;
184  qudaGaugeParam.cuda_prec = link_prec;
185  qudaGaugeParam.reconstruct = link_recon;
186  qudaGaugeParam.anisotropy = 1.0;
187 
188  memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
189 
190  gParam = GaugeFieldParam(0, qudaGaugeParam);
192  gParam.link_type = QUDA_GENERAL_LINKS;
193 
194  gParam.order = gauge_order;
195  cpuGauge = new cpuGaugeField(gParam);
196 
197  gParam_ex = GaugeFieldParam(0, qudaGaugeParam_ex);
199  gParam_ex.create = QUDA_NULL_FIELD_CREATE;
200  gParam_ex.link_type = QUDA_GENERAL_LINKS;
201  gParam_ex.order = gauge_order;
202  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
203  cpuGauge_ex = new cpuGaugeField(gParam_ex);
204 
206  createSiteLinkCPU((void**)cpuGauge->Gauge_p(), qudaGaugeParam.cpu_prec, 1);
207  } else {
208  errorQuda("Unsupported gauge order %d", gauge_order);
209  }
210 
211  copyExtendedGauge(*cpuGauge_ex, *cpuGauge, QUDA_CPU_FIELD_LOCATION);
212 
213  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
214  gParam_ex.setPrecision(prec);
215  gParam_ex.reconstruct = link_recon;
216  gParam_ex.pad = 0;
217  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
218  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
219  cudaGauge_ex = new cudaGaugeField(gParam_ex);
220  qudaGaugeParam.site_ga_pad = gParam_ex.pad;
221  //record gauge pad size
222 
223  gParam_ex.pad = 0;
224  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
225  gParam_ex.create = QUDA_ZERO_FIELD_CREATE;
226  gParam_ex.order = gauge_order;
227  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
228  cpuForce_ex = new cpuGaugeField(gParam_ex);
229 
230 
231  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
232  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
233  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
234  cudaForce_ex = new cudaGaugeField(gParam_ex);
235 
236  // create the momentum matrix
237  gParam.pad = 0;
241  gParam.order = QUDA_MILC_GAUGE_ORDER;
243  cpuMom = new cpuGaugeField(gParam);
244  refMom = new cpuGaugeField(gParam);
245 
246  //createMomCPU(cpuMom->Gauge_p(), mom_prec);
247 
248  hw = malloc(4*cpuGauge->Volume()*hwSiteSize*qudaGaugeParam.cpu_prec);
249  if (hw == NULL){
250  fprintf(stderr, "ERROR: malloc failed for hw\n");
251  exit(1);
252  }
253 
255 
256  gParam.link_type = QUDA_GENERAL_LINKS;
258  gParam.order = gauge_order;
259  gParam.pad = 0;
260  cpuOprod = new cpuGaugeField(gParam);
262  cpuLongLinkOprod = new cpuGaugeField(gParam);
263  computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order);
264 
265  gParam_ex.link_type = QUDA_GENERAL_LINKS;
266  gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
267  gParam_ex.order = gauge_order;
268  for (int d=0; d<4; d++) { gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region for CPU
269  cpuOprod_ex = new cpuGaugeField(gParam_ex);
270  cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
271 
272  copyExtendedGauge(*cpuOprod_ex, *cpuOprod, QUDA_CPU_FIELD_LOCATION);
273 
274  copyExtendedGauge(*cpuLongLinkOprod_ex, *cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
275 
276  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
277  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
278  cudaOprod_ex = new cudaGaugeField(gParam_ex);
279 
280 }
281 
282 static void hisq_force_end()
283 {
284  delete cudaMom;
285  delete cudaGauge;
286  delete cudaForce_ex;
287  delete cudaGauge_ex;
288  //delete cudaOprod_ex; // already deleted
289  delete cudaLongLinkOprod_ex;
290 
291  delete cpuGauge;
292  delete cpuMom;
293  delete refMom;
294  delete cpuOprod;
295  delete cpuLongLinkOprod;
296 
297  delete cpuGauge_ex;
298  delete cpuForce_ex;
299  delete cpuOprod_ex;
300  delete cpuLongLinkOprod_ex;
301 
302  free(hw);
303 
304  endQuda();
305 }
306 
307 static int hisq_force_test(void)
308 {
310 
311  hisq_force_init();
312 
313  //float weight = 1.0;
314  float act_path_coeff[6];
315 
316  act_path_coeff[0] = 0.625000;
317  act_path_coeff[1] = -0.058479;
318  act_path_coeff[2] = -0.087719;
319  act_path_coeff[3] = 0.030778;
320  act_path_coeff[4] = -0.007200;
321  act_path_coeff[5] = -0.123113;
322 
323  //double d_weight = 1.0;
324  double d_act_path_coeff[6];
325  for(int i=0; i<6; ++i){
326  d_act_path_coeff[i] = act_path_coeff[i];
327  }
328 
329  cpuGauge_ex->exchangeExtendedGhost(R,true);
330  cudaGauge_ex->loadCPUField(*cpuGauge);
331  cudaGauge_ex->exchangeExtendedGhost(cudaGauge_ex->R());
332 
333  cpuOprod_ex->exchangeExtendedGhost(R,true);
334  cudaOprod_ex->loadCPUField(*cpuOprod);
335  cudaOprod_ex->exchangeExtendedGhost(cudaOprod_ex->R());
336 
337  cpuLongLinkOprod_ex->exchangeExtendedGhost(R,true);
338 
339  struct timeval ht0, ht1;
340  gettimeofday(&ht0, NULL);
341  if (verify_results){
342  hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
343  hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
344  hisqCompleteForceCPU(qudaGaugeParam, *cpuForce_ex, *cpuGauge_ex, refMom);
345  }
346  gettimeofday(&ht1, NULL);
347 
348  struct timeval t0, t1, t2, t3;
349 
350  gettimeofday(&t0, NULL);
351 
352  fermion_force::hisqStaplesForce(*cudaForce_ex, *cudaOprod_ex, *cudaGauge_ex, d_act_path_coeff);
353  cudaDeviceSynchronize();
354  gettimeofday(&t1, NULL);
355 
356  delete cudaOprod_ex; //doing this to lower the peak memory usage
357  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
358  for (int d=0; d<4; d++) { gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2*gParam_ex.r[d]; } // set halo region
359  cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
360  cudaLongLinkOprod_ex->loadCPUField(*cpuLongLinkOprod);
361  cudaLongLinkOprod_ex->exchangeExtendedGhost(cudaLongLinkOprod_ex->R());
362  fermion_force::hisqLongLinkForce(*cudaForce_ex, *cudaLongLinkOprod_ex, *cudaGauge_ex, d_act_path_coeff[1]);
363  cudaDeviceSynchronize();
364 
365  gettimeofday(&t2, NULL);
366 
367  gParam.create = QUDA_ZERO_FIELD_CREATE; // initialize to zero
370  gParam.pad = 0; //X1*X2*X3/2;
372  cudaMom = new cudaGaugeField(gParam);
373 
374  //record the mom pad
375  qudaGaugeParam.mom_ga_pad = gParam.pad;
376 
377  fermion_force::hisqCompleteForce(*cudaForce_ex, *cudaGauge_ex);
378  updateMomentum(*cudaMom, 1.0, *cudaForce_ex, __func__);
379 
380  cudaDeviceSynchronize();
381  gettimeofday(&t3, NULL);
382 
383  checkCudaError();
384 
385  cudaMom->saveCPUField(*cpuMom);
386 
387  int accuracy_level = 3;
388  if(verify_results){
389  int res;
390  res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, qudaGaugeParam.cpu_prec);
391 
392  accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), qudaGaugeParam.cpu_prec);
393  printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
394  }
395  double total_io;
396  double total_flops;
397  total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
398 
399  float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
400  float perf = total_io / (TDIFF(t0, t1)) *1e-9;
401  printfQuda("Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
402  printfQuda("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t1,t2)*1000, TDIFF(t2,t3)*1000);
403  printfQuda("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
404 
405  hisq_force_end();
406 
407  return accuracy_level;
408 }
409 
410 
411 static void display_test_info()
412 {
413  printfQuda("running the following fermion force computation test:\n");
414 
415  printfQuda("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
416  printfQuda("%s %s %d/%d/%d %d %s\n",
419  xdim, ydim, zdim, tdim,
421  return ;
422 
423 }
424 
425 int main(int argc, char **argv)
426 {
427  int i;
428  for (i =1;i < argc; i++){
429 
430  if(process_command_line_option(argc, argv, &i) == 0){
431  continue;
432  }
433 
434  if( strcmp(argv[i], "--gauge-order") == 0){
435  if(i+1 >= argc){
436  usage(argv);
437  }
438 
439  if(strcmp(argv[i+1], "milc") == 0){
441  }else if(strcmp(argv[i+1], "qdp") == 0){
443  }else{
444  fprintf(stderr, "Error: unsupported gauge-field order\n");
445  exit(1);
446  }
447  i++;
448  continue;
449  }
450 
451  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
452  usage(argv);
453  }
454 
456  errorQuda("Multi-gpu for milc order is not supported\n");
457  }
458 
459  initComms(argc, argv, gridsize_from_cmdline);
460 
462 
464 
465  int accuracy_level = hisq_force_test();
466 
467  finalizeComms();
468 
469  if(accuracy_level >=3 ){
470  return EXIT_SUCCESS;
471  }else{
472  return -1;
473  }
474 
475 }
476 
477 
int device
Definition: test_util.cpp:1602
QudaPrecision hw_prec
int main(int argc, char **argv)
double anisotropy
Definition: quda.h:38
cpuGaugeField * cpuOprod
QudaGhostExchange ghostExchange
Definition: lattice_field.h:76
void endQuda(void)
cpuGaugeField * cpuForce_ex
enum QudaPrecision_s QudaPrecision
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
cudaGaugeField * cudaOprod_ex
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
cudaGaugeField * cudaForce_ex
int xdim
Definition: test_util.cpp:1615
#define errorQuda(...)
Definition: util_quda.h:121
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void createHwCPU(void *hw, QudaPrecision precision)
Definition: test_util.cpp:1487
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
cudaGaugeField * cudaMom
static int R[4]
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
Definition: momentum.cu:328
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
static void setPrecision(QudaPrecision precision)
void finalizeComms()
Definition: test_util.cpp:128
cpuGaugeField * cpuMom
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:741
int ydim
Definition: test_util.cpp:1616
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:434
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
int tdim
Definition: test_util.cpp:1618
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1227
QudaPrecision cpu_hw_prec
void setDims(int *)
Definition: test_util.cpp:151
static void hisq_force_init()
static void display_test_info()
void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:67
const int * R() const
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
cpuGaugeField * refMom
void initQuda(int device)
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
int site_ga_pad
Definition: quda.h:65
#define TDIFF(a, b)
QudaPrecision link_prec
static void * hw
cudaGaugeField * cudaLongLinkOprod
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
QudaReconstructType link_recon
Definition: test_util.cpp:1605
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
bool verify_results
Definition: test_util.cpp:1643
cpuGaugeField * cpuOprod_ex
static QudaGaugeParam qudaGaugeParam
QudaGaugeFieldOrder order
Definition: gauge_field.h:17
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
int Volume() const
static void computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix *dest, int gauge_order)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int X[4]
Definition: quda.h:36
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
Definition: test_util.cpp:1559
int zdim
Definition: test_util.cpp:1617
#define hwSiteSize
Definition: test_util.h:11
int V
Definition: test_util.cpp:27
static QudaGaugeParam qudaGaugeParam_ex
GaugeFieldParam gParam_ex
cpuGaugeField * cpuLongLinkOprod_ex
cudaGaugeField * cudaGauge_ex
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:131
QudaPrecision prec
Definition: test_util.cpp:1608
GaugeFieldParam gParam
cpuGaugeField * cpuLongLinkOprod
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
QudaLinkType link_type
Definition: gauge_field.h:19
int mom_ga_pad
Definition: quda.h:69
static int hisq_force_test(void)
#define printfQuda(...)
Definition: util_quda.h:115
static void hisq_force_end()
QudaPrecision mom_prec
unsigned long long flops
Definition: blas_quda.cu:22
cpuGaugeField * cpuGauge_ex
cudaGaugeField * cudaGauge
QudaGaugeFieldOrder gauge_order
QudaReconstructType reconstruct
Definition: gauge_field.h:16
QudaFieldCreate create
Definition: gauge_field.h:26
cpuGaugeField * cpuGauge
#define checkCudaError()
Definition: util_quda.h:161
cudaGaugeField * cudaForce
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
cudaGaugeField * cudaOprod
int r[QUDA_MAX_DIM]
Definition: lattice_field.h:79
cudaGaugeField * cudaLongLinkOprod_ex
void setVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:25
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
cpuGaugeField * cpuForce
QudaPrecision cpu_prec
Definition: quda.h:47
int comm_dim_partitioned(int dim)
void usage(char **argv)
Definition: test_util.cpp:1783
#define momSiteSize
Definition: test_util.h:10
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)