QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
llfat_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 #include "quda.h"
9 #include "test_util.h"
10 #include "llfat_reference.h"
11 #include "misc.h"
12 #include "util_quda.h"
13 #include "malloc_quda.h"
14 
15 #ifdef MULTI_GPU
16 #include "comm_quda.h"
17 #endif
18 
19 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
20 
21 extern void usage(char** argv);
22 extern bool verify_results;
23 
24 extern int device;
25 extern int xdim, ydim, zdim, tdim;
26 extern int gridsize_from_cmdline[];
27 
29 extern QudaPrecision prec;
30 extern int niter;
31 
33 //static QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
35 
36 static size_t gSize;
37 
38 static void llfat_test()
39 {
40 
42 #ifdef MULTI_GPU
43  void* ghost_sitelink[4];
44  void* ghost_sitelink_diag[16];
45 #endif
46 
47 
49 
50  cpu_prec = prec;
51  gSize = cpu_prec;
52  qudaGaugeParam = newQudaGaugeParam();
53 
54  qudaGaugeParam.anisotropy = 1.0;
55 
56  qudaGaugeParam.X[0] = xdim;
57  qudaGaugeParam.X[1] = ydim;
58  qudaGaugeParam.X[2] = zdim;
59  qudaGaugeParam.X[3] = tdim;
60 
61  setDims(qudaGaugeParam.X);
62 
63  qudaGaugeParam.cpu_prec = cpu_prec;
64  qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
65  qudaGaugeParam.gauge_order = gauge_order;
66  qudaGaugeParam.type = QUDA_WILSON_LINKS;
67  qudaGaugeParam.reconstruct = qudaGaugeParam.reconstruct_sloppy = link_recon;
68  qudaGaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
70  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
71  qudaGaugeParam.ga_pad = 0;
72 
74  void* longlink = pinned_malloc(4*V*gaugeSiteSize*gSize);
75 
76  void* sitelink[4];
77  for(int i=0;i < 4;i++) sitelink[i] = pinned_malloc(V*gaugeSiteSize*gSize);
78 
79  void* sitelink_ex[4];
80  for(int i=0;i < 4;i++) sitelink_ex[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);
81 
82  void* milc_sitelink;
83  milc_sitelink = (void*)safe_malloc(4*V*gaugeSiteSize*gSize);
84 
85  void* milc_sitelink_ex;
86  milc_sitelink_ex = (void*)safe_malloc(4*V_ex*gaugeSiteSize*gSize);
87 
88  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);
89 
91  for(int i=0; i<V; ++i){
92  for(int dir=0; dir<4; ++dir){
93  char* src = (char*)sitelink[dir];
94  memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
95  }
96  }
97  }
98 
99  int X1=Z[0];
100  int X2=Z[1];
101  int X3=Z[2];
102  int X4=Z[3];
103 
104  for(int i=0; i < V_ex; i++){
105  int sid = i;
106  int oddBit=0;
107  if(i >= Vh_ex){
108  sid = i - Vh_ex;
109  oddBit = 1;
110  }
111 
112  int za = sid/E1h;
113  int x1h = sid - za*E1h;
114  int zb = za/E2;
115  int x2 = za - zb*E2;
116  int x4 = zb/E3;
117  int x3 = zb - x4*E3;
118  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
119  int x1 = 2*x1h + x1odd;
120 
121 
122  if( x1< 2 || x1 >= X1 +2
123  || x2< 2 || x2 >= X2 +2
124  || x3< 2 || x3 >= X3 +2
125  || x4< 2 || x4 >= X4 +2){
126 #ifdef MULTI_GPU
127  continue;
128 #endif
129  }
130 
131 
132 
133  x1 = (x1 - 2 + X1) % X1;
134  x2 = (x2 - 2 + X2) % X2;
135  x3 = (x3 - 2 + X3) % X3;
136  x4 = (x4 - 2 + X4) % X4;
137 
138  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
139  if(oddBit){
140  idx += Vh;
141  }
142  for(int dir= 0; dir < 4; dir++){
143  char* src = (char*)sitelink[dir];
144  char* dst = (char*)sitelink_ex[dir];
145  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
146 
147  // milc ordering
148  memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
149  }//dir
150  }//i
151 
152 
153  double act_path_coeff[6];
154  for(int i=0;i < 6;i++){
155  act_path_coeff[i]= 0.1*i;
156  }
157 
158 
159  //only record the last call's performance
160  //the first one is for creating the cpu/cuda data structures
161  struct timeval t0, t1;
162 
163  void* longlink_ptr = longlink;
164  {
165  printfQuda("Tuning...\n");
166  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam);
167  }
168 
169  printfQuda("Running %d iterations of computation\n", niter);
170  gettimeofday(&t0, NULL);
171  for (int i=0; i<niter; i++)
172  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam);
173  gettimeofday(&t1, NULL);
174 
175  double secs = TDIFF(t0,t1);
176 
177  void* fat_reflink[4];
178  void* long_reflink[4];
179  for(int i=0;i < 4;i++){
180  fat_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
181  long_reflink[i] = safe_malloc(V*gaugeSiteSize*gSize);
182  }
183 
184  if (verify_results){
185 
186  //FIXME: we have this complication because references takes coeff as float/double
187  // depending on the precision while the GPU code aways take coeff as double
188  void* coeff;
189  double coeff_dp[6];
190  float coeff_sp[6];
191  for (int i=0; i < 6;i++) coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
192  coeff = (prec == QUDA_DOUBLE_PRECISION) ? (void*)coeff_dp : (void*)coeff_sp;
193 
194 #ifdef MULTI_GPU
195  int optflag = 0;
196  //we need x,y,z site links in the back and forward T slice
197  // so it is 3*2*Vs_t
198  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
199  for (int i=0; i < 4; i++) ghost_sitelink[i] = safe_malloc(8*Vs[i]*gaugeSiteSize*gSize);
200 
201  /*
202  nu | |
203  |_____|
204  mu
205  */
206 
207  for(int nu=0;nu < 4;nu++){
208  for(int mu=0; mu < 4;mu++){
209  if(nu == mu){
210  ghost_sitelink_diag[nu*4+mu] = NULL;
211  }else{
212  //the other directions
213  int dir1, dir2;
214  for(dir1= 0; dir1 < 4; dir1++){
215  if(dir1 !=nu && dir1 != mu){
216  break;
217  }
218  }
219  for(dir2=0; dir2 < 4; dir2++){
220  if(dir2 != nu && dir2 != mu && dir2 != dir1){
221  break;
222  }
223  }
224  ghost_sitelink_diag[nu*4+mu] = safe_malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
225  memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
226  }
227 
228  }
229  }
230 
231  exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
232  llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
233 
234  {
235  int R[4] = {2,2,2,2};
236  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
237  computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
238  }
239 #else
240  llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
241  computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
242 #endif
243 
244  }//verify_results
245 
246  //format change for fatlink and longlink
247  void* myfatlink[4];
248  void* mylonglink[4];
249  for(int i=0; i < 4; i++){
250  myfatlink[i] = safe_malloc(V*gaugeSiteSize*gSize);
251  mylonglink[i] = safe_malloc(V*gaugeSiteSize*gSize);
252  memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
253  memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
254  }
255 
256  for(int i=0; i < V; i++){
257  for(int dir=0; dir< 4; dir++){
258  char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
259  char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
260  memcpy(dst, src, gaugeSiteSize*gSize);
261 
262  src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
263  dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
264  memcpy(dst, src, gaugeSiteSize*gSize);
265  }
266  }
267 
268  if (verify_results) {
269  printfQuda("Checking fat links...\n");
270  int res=1;
271  for(int dir=0; dir<4; dir++){
272  res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
273  }
274 
275  strong_check_link(myfatlink, "GPU results: ",
276  fat_reflink, "CPU reference results:",
277  V, qudaGaugeParam.cpu_prec);
278 
279  printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
280 
281  printfQuda("Checking long links...\n");
282  res = 1;
283  for(int dir=0; dir<4; ++dir){
284  res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
285  }
286 
287  strong_check_link(mylonglink, "GPU results: ",
288  long_reflink, "CPU reference results:",
289  V, qudaGaugeParam.cpu_prec);
290 
291  printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
292  }
293 
294  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
295  long long flops= 61632 * (long long)niter;
296  flops += (252*4)*(long long)niter; // long-link contribution
297 
298  double perf = flops*volume/(secs*1024*1024*1024);
299  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", (secs*1000)/niter, perf);
300 
301  for (int i=0; i < 4; i++) {
302  host_free(myfatlink[i]);
303  host_free(mylonglink[i]);
304  }
305 
306 #ifdef MULTI_GPU
307  if (verify_results){
308  for(int i=0; i<4; i++){
309  host_free(ghost_sitelink[i]);
310  for(int j=0;j <4; j++){
311  if (i==j) continue;
312  host_free(ghost_sitelink_diag[i*4+j]);
313  }
314  }
315  }
316 #endif
317 
318  for(int i=0; i < 4; i++){
319  host_free(sitelink[i]);
320  host_free(sitelink_ex[i]);
321  host_free(fat_reflink[i]);
322  host_free(long_reflink[i]);
323  }
324  host_free(fatlink);
325  host_free(longlink);
326  if(milc_sitelink) host_free(milc_sitelink);
327  if(milc_sitelink_ex) host_free(milc_sitelink_ex);
328 #ifdef MULTI_GPU
330 #endif
331  endQuda();
332 }
333 
334 static void display_test_info()
335 {
336  printfQuda("running the following test:\n");
337 
338  printfQuda("link_precision link_reconstruct space_dimension T_dimension Ordering\n");
339  printfQuda("%s %s %d/%d/%d/ %d %s \n",
342  xdim, ydim, zdim, tdim,
344 
345  printfQuda("Grid partition info: X Y Z T\n");
346  printfQuda(" %d %d %d %d\n",
347  dimPartitioned(0),
348  dimPartitioned(1),
349  dimPartitioned(2),
350  dimPartitioned(3));
351 
352  return ;
353 
354 }
355 
356 void usage_extra(char** argv )
357 {
358  printfQuda("Extra options:\n");
359  printfQuda(" --gauge-order <qdp/milc> # ordering of the input gauge-field\n");
360  return ;
361 }
362 
363 int main(int argc, char **argv)
364 {
365 
366  //default to 18 reconstruct, 8^3 x 8
368  xdim=ydim=zdim=tdim=8;
370 
371  for (int i = 1; i < argc; i++){
372 
373  if(process_command_line_option(argc, argv, &i) == 0){
374  continue;
375  }
376 
377  if( strcmp(argv[i], "--gauge-order") == 0){
378  if(i+1 >= argc){
379  usage(argv);
380  }
381 
382  if(strcmp(argv[i+1], "milc") == 0){
384  }else if(strcmp(argv[i+1], "qdp") == 0){
386  }else{
387  fprintf(stderr, "Error: unsupported gauge-field order\n");
388  exit(1);
389  }
390  i++;
391  continue;
392  }
393 
394  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
395  usage(argv);
396  }
397 
398  initComms(argc, argv, gridsize_from_cmdline);
400  llfat_test();
401  finalizeComms();
402 }
403 
404 
static QudaGaugeParam qudaGaugeParam
int dimPartitioned(int dim)
Definition: test_util.cpp:1776
QudaReconstructType reconstruct_sloppy
Definition: quda.h:53
double anisotropy
Definition: quda.h:38
int xdim
Definition: test_util.cpp:1615
int Z[4]
Definition: test_util.cpp:26
void endQuda(void)
double mu
Definition: test_util.cpp:1648
#define pinned_malloc(size)
Definition: malloc_quda.h:67
enum QudaPrecision_s QudaPrecision
bool verify_results
Definition: test_util.cpp:1643
static QudaGaugeFieldOrder gauge_order
Definition: llfat_test.cpp:34
int ga_pad
Definition: quda.h:63
QudaGaugeFixed gauge_fix
Definition: quda.h:61
int Vs_z
Definition: test_util.cpp:29
QudaPrecision prec
Definition: test_util.cpp:1608
QudaLinkType type
Definition: quda.h:42
#define TDIFF(a, b)
Definition: llfat_test.cpp:19
static int X2
Definition: face_gauge.cpp:42
#define host_free(ptr)
Definition: malloc_quda.h:71
int gridsize_from_cmdline[]
Definition: test_util.cpp:49
int V_ex
Definition: test_util.cpp:36
int Vs_y
Definition: test_util.cpp:29
int niter
Definition: test_util.cpp:1629
int zdim
Definition: test_util.cpp:1617
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:2019
QudaStaggeredPhase staggered_phase_type
Definition: quda.h:71
static int R[4]
void finalizeComms()
Definition: test_util.cpp:128
void exchange_llfat_cleanup(void)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:741
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
Definition: face_gauge.cpp:512
QudaGaugeFieldOrder gauge_order
Definition: quda.h:43
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:434
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:701
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1227
void setDims(int *)
Definition: test_util.cpp:151
static size_t gSize
Definition: llfat_test.cpp:36
int Vs_x
Definition: test_util.cpp:29
void initQuda(int device)
static QudaPrecision cpu_prec
Definition: llfat_test.cpp:32
static int Vs[4]
Definition: face_gauge.cpp:44
int Vh_ex
Definition: test_util.cpp:36
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:768
QudaPrecision cuda_prec_sloppy
Definition: quda.h:52
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
QudaReconstructType reconstruct
Definition: quda.h:50
QudaPrecision cuda_prec
Definition: quda.h:49
int X[4]
Definition: quda.h:36
static int X3
Definition: face_gauge.cpp:42
#define safe_malloc(size)
Definition: malloc_quda.h:66
int V
Definition: test_util.cpp:27
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
Definition: test_util.cpp:1429
static int X1
Definition: face_gauge.cpp:42
void * memset(void *s, int c, size_t n)
void usage_extra(char **argv)
Definition: llfat_test.cpp:356
QudaReconstructType link_recon
Definition: test_util.cpp:1605
void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
int device
Definition: test_util.cpp:1602
static void display_test_info()
Definition: llfat_test.cpp:334
int tdim
Definition: test_util.cpp:1618
int main(int argc, char **argv)
Definition: llfat_test.cpp:363
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
int E2
Definition: test_util.cpp:34
#define printfQuda(...)
Definition: util_quda.h:115
QudaTboundary t_boundary
Definition: quda.h:45
void usage(char **argv)
Definition: test_util.cpp:1783
unsigned long long flops
Definition: blas_quda.cu:22
void * longlink
int Vs_t
Definition: test_util.cpp:29
void * fatlink
void initComms(int argc, char **argv, int *const commDims)
Definition: test_util.cpp:88
int E3
Definition: test_util.cpp:34
static void llfat_test()
Definition: llfat_test.cpp:38
QudaPrecision cpu_prec
Definition: quda.h:47
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
Definition: face_gauge.cpp:644
int E1h
Definition: test_util.cpp:34
int ydim
Definition: test_util.cpp:1616
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
static int X4
Definition: face_gauge.cpp:42
#define gaugeSiteSize
Definition: face_gauge.cpp:34
QudaGaugeParam newQudaGaugeParam(void)
int Vh
Definition: test_util.cpp:28