QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
staggered_dslash_test.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include <quda.h>
7 #include <quda_internal.h>
8 #include <dirac_quda.h>
9 #include <dslash_quda.h>
10 #include <invert_quda.h>
11 #include <util_quda.h>
12 #include <blas_quda.h>
13 
14 #include <misc.h>
15 #include <test_util.h>
16 #include <dslash_util.h>
18 #include <gauge_field.h>
19 
20 #include <face_quda.h>
21 
22 #include <assert.h>
23 
24 using namespace quda;
25 
26 #define MAX(a,b) ((a)>(b)?(a):(b))
27 #define staggeredSpinorSiteSize 6
28 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
29 
30 extern void usage(char** argv );
31 
32 extern int test_type;
33 
34 extern bool tune;
35 
38 
41 
44 
46 
47 void *hostGauge[4];
48 void *fatlink[4], *longlink[4];
49 
50 #ifdef MULTI_GPU
51 const void **ghost_fatlink, **ghost_longlink;
52 #endif
53 
54 const int loops = 100;
55 
57 extern QudaDagType dagger;
58 int transfer = 0; // include transfer time in the benchmark?
59 extern int xdim;
60 extern int ydim;
61 extern int zdim;
62 extern int tdim;
63 extern int gridsize_from_cmdline[];
65 extern QudaPrecision prec;
66 
67 extern int device;
68 
69 int X[4];
70 
72 
73 void init()
74 {
75 
77 
80 
81  gaugeParam.X[0] = X[0] = xdim;
82  gaugeParam.X[1] = X[1] = ydim;
83  gaugeParam.X[2] = X[2] = zdim;
84  gaugeParam.X[3] = X[3] = tdim;
85 
88 
94 
95  gaugeParam.anisotropy = 1.0;
100  gaugeParam.gaugeGiB = 0;
101 
109 
112 
113  int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
114  tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
115  tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
116 
117 
118  gaugeParam.ga_pad = tmpint;
119  inv_param.sp_pad = tmpint;
120 
122  csParam.nColor=3;
123  csParam.nSpin=1;
124  csParam.nDim=4;
125  for(int d = 0; d < 4; d++) {
126  csParam.x[d] = gaugeParam.X[d];
127  }
128  csParam.precision = inv_param.cpu_prec;
129  csParam.pad = 0;
130  if (test_type < 2) {
133  csParam.x[0] /= 2;
134  } else {
137  }
138 
141  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
142  csParam.create = QUDA_ZERO_FIELD_CREATE;
143 
144  spinor = new cpuColorSpinorField(csParam);
145  spinorOut = new cpuColorSpinorField(csParam);
146  spinorRef = new cpuColorSpinorField(csParam);
147 
149  csParam.x[0] = gaugeParam.X[0];
150 
151  printfQuda("Randomizing fields ...\n");
152 
154 
155  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
156 
157  for (int dir = 0; dir < 4; dir++) {
158  fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
159  longlink[dir] = malloc(V*gaugeSiteSize*gSize);
160  }
161  if (fatlink == NULL || longlink == NULL){
162  errorQuda("ERROR: malloc failed for fatlink/longlink");
163  }
165 
166 #ifdef MULTI_GPU
169  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
170  cpuFat = new cpuGaugeField(cpuFatParam);
172  ghost_fatlink = cpuFat->Ghost();
173 
175  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
176  cpuLong = new cpuGaugeField(cpuLongParam);
178  ghost_longlink = cpuLong->Ghost();
179 
180  int x_face_size = X[1]*X[2]*X[3]/2;
181  int y_face_size = X[0]*X[2]*X[3]/2;
182  int z_face_size = X[0]*X[1]*X[3]/2;
183  int t_face_size = X[0]*X[1]*X[2]/2;
184  int pad_size =MAX(x_face_size, y_face_size);
185  pad_size = MAX(pad_size, z_face_size);
186  pad_size = MAX(pad_size, t_face_size);
187  gaugeParam.ga_pad = pad_size;
188 #endif
189 
192 
193  printfQuda("Fat links sending...");
195  printfQuda("Fat links sent\n");
196 
198 
199 #ifdef MULTI_GPU
200  gaugeParam.ga_pad = 3*pad_size;
201 #endif
202 
204  printfQuda("Long links sending...");
206  printfQuda("Long links sent...\n");
207 
208  printfQuda("Sending fields to GPU...");
209 
210  if (!transfer) {
211 
212  //csParam.verbose = QUDA_DEBUG_VERBOSE;
213 
215  csParam.pad = inv_param.sp_pad;
216  csParam.precision = inv_param.cuda_prec;
217  if (test_type < 2){
219  csParam.x[0] /=2;
220  }
221 
222  printfQuda("Creating cudaSpinor\n");
223  cudaSpinor = new cudaColorSpinorField(csParam);
224 
225  printfQuda("Creating cudaSpinorOut\n");
226  cudaSpinorOut = new cudaColorSpinorField(csParam);
227 
228  printfQuda("Sending spinor field to GPU\n");
229  *cudaSpinor = *spinor;
230 
231  cudaDeviceSynchronize();
232  checkCudaError();
233 
234  double spinor_norm2 = norm2(*spinor);
235  double cuda_spinor_norm2= norm2(*cudaSpinor);
236  printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2);
237 
238  if(test_type == 2){
239  csParam.x[0] /=2;
240  }
242  tmp = new cudaColorSpinorField(csParam);
243 
244  bool pc = (test_type != 2);
245  DiracParam diracParam;
246  setDiracParam(diracParam, &inv_param, pc);
247 
248  diracParam.verbose = QUDA_VERBOSE;
249  diracParam.tmp1=tmp;
250 
251  dirac = Dirac::create(diracParam);
252 
253  } else {
254  errorQuda("Error not suppported");
255  }
256 
257  return;
258 }
259 
260 void end(void)
261 {
262  for (int dir = 0; dir < 4; dir++) {
263  free(fatlink[dir]);
264  free(longlink[dir]);
265  }
266 
267  if (!transfer){
268  delete dirac;
269  delete cudaSpinor;
270  delete cudaSpinorOut;
271  delete tmp;
272  }
273 
274  delete spinor;
275  delete spinorOut;
276  delete spinorRef;
277 
278  if (cpuFat) delete cpuFat;
279  if (cpuLong) delete cpuLong;
280 
281  endQuda();
282 }
283 
284 double dslashCUDA(int niter) {
285 
286  cudaEvent_t start, end;
287  cudaEventCreate(&start);
288  cudaEventRecord(start, 0);
289  cudaEventSynchronize(start);
290 
291  for (int i = 0; i < niter; i++) {
292  switch (test_type) {
293  case 0:
295  if (transfer){
296  //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
297  } else {
299  }
300  break;
301  case 1:
303  if (transfer){
304  //MatPCQuda(spinorOdd, spinorEven, &inv_param);
305  } else {
307  }
308  break;
309  case 2:
310  if (transfer){
311  //MatQuda(spinorGPU, spinor, &inv_param);
312  } else {
314  }
315  }
316  }
317 
318  cudaEventCreate(&end);
319  cudaEventRecord(end, 0);
320  cudaEventSynchronize(end);
321  float runTime;
322  cudaEventElapsedTime(&runTime, start, end);
323  cudaEventDestroy(start);
324  cudaEventDestroy(end);
325 
326  double secs = runTime / 1000; //stopwatchReadSeconds();
327 
328  // check for errors
329  cudaError_t stat = cudaGetLastError();
330  if (stat != cudaSuccess)
331  errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));
332 
333  return secs;
334 }
335 
337 {
338 #ifndef MULTI_GPU
339  int cpu_parity = 0;
340 #endif
341 
342  // compare to dslash reference implementation
343  printfQuda("Calculating reference implementation...");
344  fflush(stdout);
345  switch (test_type) {
346  case 0:
347 #ifdef MULTI_GPU
348 
349  staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
351 #else
352  cpu_parity = 0; //EVEN
353  staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
355 
356 #endif
357 
358 
359  break;
360  case 1:
361 #ifdef MULTI_GPU
362  staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
364 
365 #else
366  cpu_parity=1; //ODD
367  staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
369 #endif
370  break;
371  case 2:
372  //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger,
373  //inv_param.cpu_prec, gaugeParam.cpu_prec);
374  break;
375  default:
376  errorQuda("Test type not defined");
377  }
378 
379  printfQuda("done.\n");
380 
381 }
382 
383 static int dslashTest()
384 {
385  int accuracy_level = 0;
386 
387  init();
388 
389  int attempts = 1;
390 
391  for (int i=0; i<attempts; i++) {
392 
393  if (tune) { // warm-up run
394  printfQuda("Tuning...\n");
396  dslashCUDA(1);
397  }
398  printfQuda("Executing %d kernel loops...", loops);
399  double secs = dslashCUDA(loops);
400 
401 #ifdef DSLASH_PROFILING
402  printDslashProfile();
403 #endif
404 
405  if (!transfer) *spinorOut = *cudaSpinorOut;
406 
407  printfQuda("\n%fms per loop\n", 1000*secs);
409 
410  unsigned long long flops = dirac->Flops();
411  int link_floats = 8*gaugeParam.reconstruct+8*18;
412  int spinor_floats = 8*6*2 + 6;
413  int link_float_size = prec;
414  int spinor_float_size = 0;
415 
416  link_floats = test_type ? (2*link_floats) : link_floats;
417  spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;
418 
419  int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
420  if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4;
421 
422  printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
423  printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));
424 
425  if (!transfer) {
426  double spinor_ref_norm2 = norm2(*spinorRef);
427  double cuda_spinor_out_norm2 = norm2(*cudaSpinorOut);
428  double spinor_out_norm2 = norm2(*spinorOut);
429  printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2,
430  spinor_out_norm2);
431  } else {
432  double spinor_ref_norm2 = norm2(*spinorRef);
433  double spinor_out_norm2 = norm2(*spinorOut);
434  printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2);
435  }
436 
437  accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
438  }
439  end();
440 
441  return accuracy_level;
442 }
443 
444 
446 {
447  printfQuda("running the following test:\n");
448 
449  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
450  printfQuda("%s %s %d %d %d/%d/%d %d \n",
453  printfQuda("Grid partition info: X Y Z T\n");
454  printfQuda(" %d %d %d %d\n",
455  dimPartitioned(0),
456  dimPartitioned(1),
457  dimPartitioned(2),
458  dimPartitioned(3));
459 
460  return ;
461 
462 }
463 
464 
465 void
466 usage_extra(char** argv )
467 {
468  printfQuda("Extra options:\n");
469  printfQuda(" --test <0/1> # Test method\n");
470  printfQuda(" 0: Even destination spinor\n");
471  printfQuda(" 1: Odd destination spinor\n");
472  return ;
473 }
474 
475 int main(int argc, char **argv)
476 {
477 
478  int i;
479  for (i =1;i < argc; i++){
480 
481  if(process_command_line_option(argc, argv, &i) == 0){
482  continue;
483  }
484 
485  fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
486  usage(argv);
487  }
488 
489  initComms(argc, argv, gridsize_from_cmdline);
490 
492 
493  int ret =1;
494  int accuracy_level = dslashTest();
495 
496  printfQuda("accuracy_level =%d\n", accuracy_level);
497 
498  if (accuracy_level >= 1) ret = 0; //probably no error, -1 means no matching
499 
500  finalizeComms();
501 
502  return ret;
503 }
504