QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
llfat_test.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 #include "quda.h"
9 #include "test_util.h"
10 #include "llfat_reference.h"
11 #include "misc.h"
12 #include "util_quda.h"
13 
14 #ifdef MULTI_GPU
15 #include "face_quda.h"
16 #include "comm_quda.h"
17 #endif
18 
19 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
20 
21 extern void usage(char** argv);
22 extern bool verify_results;
23 
24 extern int device;
25 extern int xdim, ydim, zdim, tdim;
26 extern int gridsize_from_cmdline[];
27 
29 extern QudaPrecision prec;
31 //static QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
33 
34 static size_t gSize;
35 
36  static void
37 llfat_test(int test)
38 {
39 
40  QudaGaugeParam qudaGaugeParam;
41 #ifdef MULTI_GPU
42  void* ghost_sitelink[4];
43  void* ghost_sitelink_diag[16];
44 #endif
45 
46 
48 
49  cpu_prec = prec;
50  gSize = cpu_prec;
51  qudaGaugeParam = newQudaGaugeParam();
52 
53  qudaGaugeParam.anisotropy = 1.0;
54 
55  qudaGaugeParam.X[0] = xdim;
56  qudaGaugeParam.X[1] = ydim;
57  qudaGaugeParam.X[2] = zdim;
58  qudaGaugeParam.X[3] = tdim;
59 
60  setDims(qudaGaugeParam.X);
61 
62  qudaGaugeParam.cpu_prec = cpu_prec;
63  qudaGaugeParam.cuda_prec = prec;
64  qudaGaugeParam.gauge_order = gauge_order;
65  qudaGaugeParam.type=QUDA_WILSON_LINKS;
66  qudaGaugeParam.reconstruct = link_recon;
67  /*
68  qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
69  | QUDA_FAT_PRESERVE_GPU_GAUGE
70  | QUDA_FAT_PRESERVE_COMM_MEM;
71  */
72  qudaGaugeParam.preserve_gauge =0;
73  void* fatlink;
74  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
75  errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
76  }
77 
78  void* longlink;
79  if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
80  errorQuda("ERROR: cudaMallocHost failed for longlink\n");
81  } // page-locked memory
82 
83  void* sitelink[4];
84  for(int i=0;i < 4;i++){
85  if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
86  errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
87  }
88  }
89 
90  void* sitelink_ex[4];
91  for(int i=0;i < 4;i++){
92  if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
93  errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
94  }
95  }
96 
97 
98  void* milc_sitelink;
99  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
100  if(milc_sitelink == NULL){
101  errorQuda("ERROR: allocating milc_sitelink failed\n");
102  }
103 
104  void* milc_sitelink_ex;
105  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
106  if(milc_sitelink_ex == NULL){
107  errorQuda("Error: allocating milc_sitelink failed\n");
108  }
109 
110 
111 
112  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);
113 
114  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
115  for(int i=0; i<V; ++i){
116  for(int dir=0; dir<4; ++dir){
117  char* src = (char*)sitelink[dir];
118  memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
119  }
120  }
121  }
122 
123  int X1=Z[0];
124  int X2=Z[1];
125  int X3=Z[2];
126  int X4=Z[3];
127 
128  for(int i=0; i < V_ex; i++){
129  int sid = i;
130  int oddBit=0;
131  if(i >= Vh_ex){
132  sid = i - Vh_ex;
133  oddBit = 1;
134  }
135 
136  int za = sid/E1h;
137  int x1h = sid - za*E1h;
138  int zb = za/E2;
139  int x2 = za - zb*E2;
140  int x4 = zb/E3;
141  int x3 = zb - x4*E3;
142  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
143  int x1 = 2*x1h + x1odd;
144 
145 
146  if( x1< 2 || x1 >= X1 +2
147  || x2< 2 || x2 >= X2 +2
148  || x3< 2 || x3 >= X3 +2
149  || x4< 2 || x4 >= X4 +2){
150 #ifdef MULTI_GPU
151  continue;
152 #endif
153  }
154 
155 
156 
157  x1 = (x1 - 2 + X1) % X1;
158  x2 = (x2 - 2 + X2) % X2;
159  x3 = (x3 - 2 + X3) % X3;
160  x4 = (x4 - 2 + X4) % X4;
161 
162  int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
163  if(oddBit){
164  idx += Vh;
165  }
166  for(int dir= 0; dir < 4; dir++){
167  char* src = (char*)sitelink[dir];
168  char* dst = (char*)sitelink_ex[dir];
169  memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
170 
171  // milc ordering
172  memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
173  }//dir
174  }//i
175 
176 
177  double act_path_coeff[6];
178  for(int i=0;i < 6;i++){
179  act_path_coeff[i]= 0.1*i;
180  }
181 
182 
183  //only record the last call's performance
184  //the first one is for creating the cpu/cuda data structures
185  struct timeval t0, t1;
186 
187 
188  void** sitelink_ptr;
190  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
191  sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink;
192  }else{
193  sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink;
194  }
195  void* longlink_ptr = longlink;
196 #ifdef MULTI_GPU
197  if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation
198 #endif
199 
200  gettimeofday(&t0, NULL);
201  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method);
202  gettimeofday(&t1, NULL);
203 
204 
205  double secs = TDIFF(t0,t1);
206 
207  void* fat_reflink[4];
208  void* long_reflink[4];
209  for(int i=0;i < 4;i++){
210  fat_reflink[i] = malloc(V*gaugeSiteSize*gSize);
211  if(fat_reflink[i] == NULL){
212  errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i);
213  }
214  long_reflink[i] = malloc(V*gaugeSiteSize*gSize);
215  if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i);
216  }
217 
218  if (verify_results){
219 
220  //FIXME: we have this compplication because references takes coeff as float/double
221  // depending on the precision while the GPU code aways take coeff as double
222  void* coeff;
223  double coeff_dp[6];
224  float coeff_sp[6];
225  for(int i=0;i < 6;i++){
226  coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
227  }
229  coeff = coeff_dp;
230  }else{
231  coeff = coeff_sp;
232  }
233 #ifdef MULTI_GPU
234  int optflag = 0;
235  //we need x,y,z site links in the back and forward T slice
236  // so it is 3*2*Vs_t
237  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
238  for(int i=0;i < 4; i++){
239  ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
240  if (ghost_sitelink[i] == NULL){
241  printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
242  exit(1);
243  }
244  }
245 
246  /*
247  nu | |
248  |_____|
249  mu
250  */
251 
252  for(int nu=0;nu < 4;nu++){
253  for(int mu=0; mu < 4;mu++){
254  if(nu == mu){
255  ghost_sitelink_diag[nu*4+mu] = NULL;
256  }else{
257  //the other directions
258  int dir1, dir2;
259  for(dir1= 0; dir1 < 4; dir1++){
260  if(dir1 !=nu && dir1 != mu){
261  break;
262  }
263  }
264  for(dir2=0; dir2 < 4; dir2++){
265  if(dir2 != nu && dir2 != mu && dir2 != dir1){
266  break;
267  }
268  }
269  ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
270  if(ghost_sitelink_diag[nu*4+mu] == NULL){
271  errorQuda("malloc failed for ghost_sitelink_diag\n");
272  }
273 
274  memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
275  }
276 
277  }
278  }
279 
280  exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
281  llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
282 
283 
284  {
285  int R[4] = {2,2,2,2};
286  exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
287  computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
288  }
289 #else
290  llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
291  computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
292 #endif
293 
294  }//verify_results
295 
296  //format change for fatlink and longlink
297  void* myfatlink[4];
298  void* mylonglink[4];
299  for(int i=0;i < 4;i++){
300  myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
301  if(myfatlink[i] == NULL){
302  printf("Error: malloc failed for myfatlink[%d]\n", i);
303  exit(1);
304  }
305  mylonglink[i] = malloc(V*gaugeSiteSize*gSize);
306  if(mylonglink[i] == NULL){
307  printf("Error: malloc failed for mylonglink[%d]\n", i);
308  exit(1);
309  }
310  memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
311  memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
312  }
313 
314  for(int i=0;i < V; i++){
315  for(int dir=0; dir< 4; dir++){
316  char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
317  char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
318  memcpy(dst, src, gaugeSiteSize*gSize);
319 
320  src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
321  dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
322  memcpy(dst, src, gaugeSiteSize*gSize);
323  }
324  }
325 
326  if (verify_results) {
327  printfQuda("Checking fat links...\n");
328  int res=1;
329  for(int dir=0; dir<4; dir++){
330  res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
331  }
332 
333  strong_check_link(myfatlink, "GPU results: ",
334  fat_reflink, "CPU reference results:",
335  V, qudaGaugeParam.cpu_prec);
336 
337  printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
338 #ifdef MULTI_GPU
339  if(test){
340 #endif
341  printfQuda("Checking long links...\n");
342  res = 1;
343  for(int dir=0; dir<4; ++dir){
344  res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
345  }
346 
347  strong_check_link(mylonglink, "GPU results: ",
348  long_reflink, "CPU reference results:",
349  V, qudaGaugeParam.cpu_prec);
350 
351  printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
352 
353 #ifdef MULTI_GPU
354  }else{ // !test
355  printfQuda("Extended volume is required for multi-GPU long-link construction\n");
356  }
357 #endif
358  }
359 
360  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
361  int flops= 61632;
362 #ifdef MULTI_GPU
363  if(test) flops += (252*4); // long-link contribution
364 #else
365  flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale)
366 #endif
367 
368  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
369  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);
370 
371 
372  for(int i=0;i < 4;i++){
373  free(myfatlink[i]);
374  }
375 
376 #ifdef MULTI_GPU
377  if (verify_results){
378  int i;
379  for(i=0;i < 4;i++){
380  free(ghost_sitelink[i]);
381  }
382  for(i=0;i <4; i++){
383  for(int j=0;j <4; j++){
384  if (i==j){
385  continue;
386  }
387  free(ghost_sitelink_diag[i*4+j]);
388  }
389  }
390  }
391 #endif
392 
393  for(int i=0;i < 4; i++){
394  cudaFreeHost(sitelink[i]);
395  cudaFreeHost(sitelink_ex[i]);
396  free(fat_reflink[i]);
397  }
398  cudaFreeHost(fatlink);
399  cudaFreeHost(longlink);
400  if(milc_sitelink) free(milc_sitelink);
401  if(milc_sitelink_ex) free(milc_sitelink_ex);
402 #ifdef MULTI_GPU
404 #endif
405  endQuda();
406 }
407 
408  static void
409 display_test_info(int test)
410 {
411  printfQuda("running the following test:\n");
412 
413  printfQuda("link_precision link_reconstruct space_dimension T_dimension Test Ordering\n");
414  printfQuda("%s %s %d/%d/%d/ %d %d %s \n",
417  xdim, ydim, zdim, tdim, test,
418  get_gauge_order_str(gauge_order));
419 
420 #ifdef MULTI_GPU
421  printfQuda("Grid partition info: X Y Z T\n");
422  printfQuda(" %d %d %d %d\n",
423  dimPartitioned(0),
424  dimPartitioned(1),
425  dimPartitioned(2),
426  dimPartitioned(3));
427 #endif
428 
429  return ;
430 
431 }
432 
433  void
434 usage_extra(char** argv )
435 {
436  printfQuda("Extra options:\n");
437  printfQuda(" --test <0/1> # Test method\n");
438  printfQuda(" 0: standard method\n");
439  printfQuda(" 1: extended volume method\n");
440  printfQuda(" --gauge-order <qdp/milc> # ordering of the input gauge-field\n");
441  return ;
442 }
443 
444 int
445 main(int argc, char **argv)
446 {
447 
448 #ifdef MULTI_GPU
449  int test = 1;
450 #else
451  int test = 0;
452 #endif
453  //default to 18 reconstruct, 8^3 x 8
455  xdim=ydim=zdim=tdim=8;
457 
458  int i;
459  for (i =1;i < argc; i++){
460 
461  if(process_command_line_option(argc, argv, &i) == 0){
462  continue;
463  }
464 
465  if( strcmp(argv[i], "--gauge-order") == 0){
466  if(i+1 >= argc){
467  usage(argv);
468  }
469 
470  if(strcmp(argv[i+1], "milc") == 0){
472  }else if(strcmp(argv[i+1], "qdp") == 0){
474  }else{
475  fprintf(stderr, "Error: unsupported gauge-field order\n");
476  exit(1);
477  }
478  i++;
479  continue;
480  }
481 
482  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
483  usage(argv);
484  }
485 
486 
487 #ifdef MULTI_GPU
488  if(gauge_order == QUDA_MILC_GAUGE_ORDER && test == 0){
489  errorQuda("ERROR: milc format for multi-gpu with test0 is not supported yet!\n");
490  }
491 #endif
492 
493  initComms(argc, argv, gridsize_from_cmdline);
494  display_test_info(test);
495  llfat_test(test);
496  finalizeComms();
497 }
498 
499 
int dimPartitioned(int dim)
Definition: test_util.cpp:1577
double anisotropy
Definition: quda.h:31
__constant__ int Vh
int xdim
Definition: test_util.cpp:1553
void exchange_llfat_cleanup(void)
void endQuda(void)
__constant__ int X2
enum QudaPrecision_s QudaPrecision
int V
Definition: test_util.cpp:29
bool verify_results
Definition: test_util.cpp:1568
double test(int kernel)
Definition: blas_test.cu:354
__constant__ int Vh_ex
int Vs_z
Definition: test_util.cpp:31
QudaPrecision prec
Definition: test_util.cpp:1551
void display_test_info()
Definition: blas_test.cu:56
QudaLinkType type
Definition: quda.h:35
#define TDIFF(a, b)
Definition: llfat_test.cpp:19
#define errorQuda(...)
Definition: util_quda.h:73
void setDims(int *)
Definition: test_util.cpp:88
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)
__constant__ int X1
int gridsize_from_cmdline[]
Definition: test_util.cpp:1559
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
int V_ex
Definition: test_util.cpp:38
int Vs_y
Definition: test_util.cpp:31
int zdim
Definition: test_util.cpp:1555
int process_command_line_option(int argc, char **argv, int *idx)
Definition: test_util.cpp:1635
#define gaugeSiteSize
void finalizeComms()
Definition: test_util.cpp:65
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
Definition: misc.cpp:697
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
Definition: test_util.cpp:395
const char * get_prec_str(QudaPrecision prec)
Definition: misc.cpp:658
QudaPrecision cpu_prec
Definition: dslash_test.cpp:34
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
Definition: test_util.cpp:1166
void * longlink[4]
__constant__ int Vs
int Vs_x
Definition: test_util.cpp:31
void initQuda(int device)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
const char * get_recon_str(QudaReconstructType recon)
Definition: misc.cpp:724
int preserve_gauge
Definition: quda.h:62
__constant__ double coeff
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
QudaReconstructType reconstruct
Definition: quda.h:43
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec)
Definition: test_util.cpp:1365
short x1h
Definition: llfat_core.h:815
void usage_extra(char **argv)
Definition: llfat_test.cpp:434
QudaReconstructType link_recon
Definition: test_util.cpp:1549
void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
int device
Definition: test_util.cpp:1546
int tdim
Definition: test_util.cpp:1556
void * memset(void *s, int c, size_t n)
int main(int argc, char **argv)
Definition: llfat_test.cpp:445
int Z[4]
Definition: test_util.cpp:28
QudaGaugeFieldOrder gauge_order
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
void llfat_reference_mg(void **fatlink, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision prec, void *act_path_coeff)
__constant__ int X3
#define printfQuda(...)
Definition: util_quda.h:67
void usage(char **argv)
Definition: test_util.cpp:1584
short x1odd
Definition: llfat_core.h:821
int Vs_t
Definition: test_util.cpp:31
enum QudaComputeFatMethod_s QudaComputeFatMethod
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
void initComms(int argc, char **argv, const int *commDims)
Definition: test_util.cpp:48
__constant__ int E1h
int oddBit
__constant__ int E3
QudaPrecision cpu_prec
Definition: quda.h:40
__constant__ int E2
__constant__ int X4
int ydim
Definition: test_util.cpp:1554
void llfat_reference(void **fatlink, void **sitelink, QudaPrecision prec, void *act_path_coeff)
QudaGaugeParam newQudaGaugeParam(void)
void * fatlink[4]