QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
interface_quda.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <string.h>
6 #include <sys/time.h>
7 
8 #include <quda.h>
9 #include <quda_internal.h>
10 #include <comm_quda.h>
11 #include <tune_quda.h>
12 #include <blas_quda.h>
13 #include <gauge_field.h>
14 #include <dirac_quda.h>
15 #include <dslash_quda.h>
16 #include <invert_quda.h>
17 #include <color_spinor_field.h>
18 #include <clover_field.h>
19 #include <llfat_quda.h>
20 #include <fat_force_quda.h>
21 #include <hisq_links_quda.h>
22 
23 #ifdef NUMA_AFFINITY
24 #include <numa_affinity.h>
25 #endif
26 
27 #include <cuda.h>
28 
29 #ifdef MULTI_GPU
30 extern void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
31  QudaPrecision gPrecision, int optflag);
32 #endif // MULTI_GPU
33 
34 #ifdef GPU_GAUGE_FORCE
35 #include <gauge_force_quda.h>
36 #endif
37 
38 #define MAX(a,b) ((a)>(b)? (a):(b))
39 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
40 
41 #define spinorSiteSize 24 // real numbers per spinor
42 
43 #define MAX_GPU_NUM_PER_NODE 16
44 
45 // define newQudaGaugeParam() and newQudaInvertParam()
46 #define INIT_PARAM
47 #include "check_params.h"
48 #undef INIT_PARAM
49 
50 // define (static) checkGaugeParam() and checkInvertParam()
51 #define CHECK_PARAM
52 #include "check_params.h"
53 #undef CHECK_PARAM
54 
55 // define printQudaGaugeParam() and printQudaInvertParam()
56 #define PRINT_PARAM
57 #include "check_params.h"
58 #undef PRINT_PARAM
59 
60 #include "face_quda.h"
61 
63 
64 using namespace quda;
65 
69 
70 // It's important that these alias the above so that constants are set correctly in Dirac::Dirac()
74 
78 
82 
83 
84 cudaDeviceProp deviceProp;
85 cudaStream_t *streams;
86 
87 static bool initialized = false;
88 
90 static TimeProfile profileInit("initQuda");
91 
93 static TimeProfile profileGauge("loadGaugeQuda");
94 
96 static TimeProfile profileClover("loadCloverQuda");
97 
99 static TimeProfile profileInvert("invertQuda");
100 
102 static TimeProfile profileMulti("invertMultiShiftQuda");
103 
105 static TimeProfile profileMultiMixed("invertMultiShiftMixedQuda");
106 
108 static TimeProfile profileEnd("endQuda");
109 
110 
111 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
112 {
113  setVerbosity(verbosity);
114  setOutputPrefix(prefix);
115  setOutputFile(outfile);
116 }
117 
118 
119 typedef struct {
120  int ndim;
121  int dims[QUDA_MAX_DIM];
122 } LexMapData;
123 
127 static int lex_rank_from_coords(const int *coords, void *fdata)
128 {
129  LexMapData *md = static_cast<LexMapData *>(fdata);
130 
131  int rank = coords[0];
132  for (int i = 1; i < md->ndim; i++) {
133  rank = md->dims[i] * rank + coords[i];
134  }
135  return rank;
136 }
137 
138 #ifdef QMP_COMMS
139 
142 static int qmp_rank_from_coords(const int *coords, void *fdata)
143 {
144  return QMP_get_node_number_from(coords);
145 }
146 #endif
147 
148 
149 static bool comms_initialized = false;
150 
151 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
152 {
153  if (nDim != 4) {
154  errorQuda("Number of communication grid dimensions must be 4");
155  }
156 
157  if (!func) {
158 
159 #if QMP_COMMS
160  if (QMP_logical_topology_is_declared()) {
161  if (QMP_get_logical_number_of_dimensions() != 4) {
162  errorQuda("QMP logical topology must have 4 dimensions");
163  }
164  for (int i=0; i<nDim; i++) {
165  int qdim = QMP_get_logical_dimensions()[i];
166  if(qdim != dims[i]) {
167  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
168  }
169  }
170  fdata = NULL;
171  func = qmp_rank_from_coords;
172  } else {
173  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
174 #endif
175 
176  LexMapData map_data;
177  map_data.ndim = nDim;
178  for (int i=0; i<nDim; i++) {
179  map_data.dims[i] = dims[i];
180  }
181  fdata = (void *) &map_data;
182  func = lex_rank_from_coords;
183 
184 #if QMP_COMMS
185  }
186 #endif
187 
188  }
189  comm_init(nDim, dims, func, fdata);
190  comms_initialized = true;
191 }
192 
193 
194 void initQuda(int dev)
195 {
196  profileInit[QUDA_PROFILE_TOTAL].Start();
197 
198  //static bool initialized = false;
199  if (initialized) return;
200  initialized = true;
201 
202 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)
203  //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
204  // not needed for CUDA >= 4.1
205  char* cni_str = getenv("CUDA_NIC_INTEROP");
206  if(cni_str == NULL){
207  errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
208  }
209  int cni_int = atoi(cni_str);
210  if (cni_int != 1){
211  errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");
212  }
213 #endif
214 
215  int deviceCount;
216  cudaGetDeviceCount(&deviceCount);
217  if (deviceCount == 0) {
218  errorQuda("No CUDA devices found");
219  }
220 
221  for(int i=0; i<deviceCount; i++) {
222  cudaGetDeviceProperties(&deviceProp, i);
223  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
224  if (getVerbosity() >= QUDA_SUMMARIZE) {
225  printfQuda("Found device %d: %s\n", i, deviceProp.name);
226  }
227  }
228 
229  if (!comms_initialized) {
230 #if defined(QMP_COMMS)
231  if (QMP_logical_topology_is_declared()) {
232  int ndim = QMP_get_logical_number_of_dimensions();
233  const int *dims = QMP_get_logical_dimensions();
234  initCommsGridQuda(ndim, dims, NULL, NULL);
235  } else {
236  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
237  " and QMP logical topology has not been declared");
238  }
239 #elif defined(MPI_COMMS)
240  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
241 #else // single-GPU
242  const int dims[4] = {1, 1, 1, 1};
243  initCommsGridQuda(4, dims, NULL, NULL);
244 #endif
245  }
246 
247 #ifdef MULTI_GPU
248  if (dev < 0) dev = comm_gpuid();
249 #else
250  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
251 #endif
252 
253  cudaGetDeviceProperties(&deviceProp, dev);
254  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
255  if (deviceProp.major < 1) {
256  errorQuda("Device %d does not support CUDA", dev);
257  }
258 
259  if (getVerbosity() >= QUDA_SUMMARIZE) {
260  printfQuda("Using device %d: %s\n", dev, deviceProp.name);
261  }
262  cudaSetDevice(dev);
263  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
264 
265 #ifdef NUMA_AFFINITY
267  setNumaAffinity(dev);
268  }
269 #endif
270  // if the device supports host-mapped memory, then enable this
271  if(deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);
272  checkCudaError();
273 
274  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
275  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
276  cudaGetDeviceProperties(&deviceProp, dev);
277 
278  streams = new cudaStream_t[Nstream];
279  for (int i=0; i<Nstream; i++) {
280  cudaStreamCreate(&streams[i]);
281  }
282  checkCudaError();
284 
285  initBlas();
286 
288 
289  profileInit[QUDA_PROFILE_TOTAL].Stop();
290 }
291 
292 
293 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
294 {
295  profileGauge[QUDA_PROFILE_TOTAL].Start();
296 
297  if (!initialized) errorQuda("QUDA not initialized");
299 
300  checkGaugeParam(param);
301 
302  // Set the specific cpu parameters and create the cpu gauge field
303  GaugeFieldParam gauge_param(h_gauge, *param);
304 
305  cpuGaugeField cpu(gauge_param);
306 
307  profileGauge[QUDA_PROFILE_INIT].Start();
308  // switch the parameters for creating the mirror precise cuda gauge field
309  gauge_param.create = QUDA_NULL_FIELD_CREATE;
310  gauge_param.precision = param->cuda_prec;
311  gauge_param.reconstruct = param->reconstruct;
312  gauge_param.pad = param->ga_pad;
313  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
314  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
316  cudaGaugeField *precise = new cudaGaugeField(gauge_param);
317  profileGauge[QUDA_PROFILE_INIT].Stop();
318 
319  profileGauge[QUDA_PROFILE_H2D].Start();
320  precise->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION);
321 
322  param->gaugeGiB += precise->GBytes();
323 
324  // switch the parameters for creating the mirror sloppy cuda gauge field
325  gauge_param.precision = param->cuda_prec_sloppy;
326  gauge_param.reconstruct = param->reconstruct_sloppy;
327  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
328  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
330  cudaGaugeField *sloppy = NULL;
331  if (param->cuda_prec != param->cuda_prec_sloppy) {
332  sloppy = new cudaGaugeField(gauge_param);
334  param->gaugeGiB += sloppy->GBytes();
335  } else {
336  sloppy = precise;
337  }
338 
339  // switch the parameters for creating the mirror preconditioner cuda gauge field
340  gauge_param.precision = param->cuda_prec_precondition;
341  gauge_param.reconstruct = param->reconstruct_precondition;
342  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
343  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
345  cudaGaugeField *precondition = NULL;
346  if (param->cuda_prec_sloppy != param->cuda_prec_precondition) {
347  precondition = new cudaGaugeField(gauge_param);
348  precondition->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION);
349  param->gaugeGiB += precondition->GBytes();
350  } else {
351  precondition = sloppy;
352  }
353  profileGauge[QUDA_PROFILE_H2D].Stop();
354 
355  switch (param->type) {
356  case QUDA_WILSON_LINKS:
357  //if (gaugePrecise) errorQuda("Precise gauge field already allocated");
358  gaugePrecise = precise;
359  //if (gaugeSloppy) errorQuda("Sloppy gauge field already allocated");
360  gaugeSloppy = sloppy;
361  //if (gaugePrecondition) errorQuda("Precondition gauge field already allocated");
362  gaugePrecondition = precondition;
363  break;
365  if (gaugeFatPrecise) errorQuda("Precise gauge fat field already allocated");
366  gaugeFatPrecise = precise;
367  if (gaugeFatSloppy) errorQuda("Sloppy gauge fat field already allocated");
368  gaugeFatSloppy = sloppy;
369  if (gaugeFatPrecondition) errorQuda("Precondition gauge fat field already allocated");
370  gaugeFatPrecondition = precondition;
371  break;
373  if (gaugeLongPrecise) errorQuda("Precise gauge long field already allocated");
374  gaugeLongPrecise = precise;
375  if (gaugeLongSloppy) errorQuda("Sloppy gauge long field already allocated");
376  gaugeLongSloppy = sloppy;
377  if (gaugeLongPrecondition) errorQuda("Precondition gauge long field already allocated");
378  gaugeLongPrecondition = precondition;
379  break;
380  default:
381  errorQuda("Invalid gauge type");
382  }
383 
384  profileGauge[QUDA_PROFILE_TOTAL].Stop();
385 }
386 
387 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
388 {
389  profileGauge[QUDA_PROFILE_TOTAL].Start();
390 
391  if (!initialized) errorQuda("QUDA not initialized");
392  checkGaugeParam(param);
393 
394  // Set the specific cpu parameters and create the cpu gauge field
395  GaugeFieldParam gauge_param(h_gauge, *param);
396  cpuGaugeField cpuGauge(gauge_param);
397  cudaGaugeField *cudaGauge = NULL;
398  switch (param->type) {
399  case QUDA_WILSON_LINKS:
400  cudaGauge = gaugePrecise;
401  break;
403  cudaGauge = gaugeFatPrecise;
404  break;
406  cudaGauge = gaugeLongPrecise;
407  break;
408  default:
409  errorQuda("Invalid gauge type");
410  }
411 
412  profileGauge[QUDA_PROFILE_D2H].Start();
413  cudaGauge->saveCPUField(cpuGauge, QUDA_CPU_FIELD_LOCATION);
414  profileGauge[QUDA_PROFILE_D2H].Stop();
415 
416  profileGauge[QUDA_PROFILE_TOTAL].Stop();
417 }
418 
419 
420 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
421 {
422  profileClover[QUDA_PROFILE_TOTAL].Start();
423 
424  pushVerbosity(inv_param->verbosity);
426 
427  if (!initialized) errorQuda("QUDA not initialized");
428 
429  if (!h_clover && !h_clovinv) {
430  errorQuda("loadCloverQuda() called with neither clover term nor inverse");
431  }
432  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
433  errorQuda("Half precision not supported on CPU");
434  }
435  if (gaugePrecise == NULL) {
436  errorQuda("Gauge field must be loaded before clover");
437  }
438  if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) {
439  errorQuda("Wrong dslash_type in loadCloverQuda()");
440  }
441 
442  // determines whether operator is preconditioned when calling invertQuda()
443  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
444  inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
445 
446  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
447  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
449 
450  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
452 
453  // We issue a warning only when it seems likely that the user is screwing up:
454 
455  // inverted clover term is required when applying preconditioned operator
456  if (!h_clovinv && pc_solve && pc_solution) {
457  warningQuda("Inverted clover term not loaded");
458  }
459 
460  // uninverted clover term is required when applying unpreconditioned operator,
461  // but note that dslashQuda() is always preconditioned
462  if (!h_clover && !pc_solve && !pc_solution) {
463  //warningQuda("Uninverted clover term not loaded");
464  }
465 
466  // uninverted clover term is also required for "asymmetric" preconditioning
467  if (!h_clover && pc_solve && pc_solution && asymmetric) {
468  warningQuda("Uninverted clover term not loaded");
469  }
470 
471  CloverFieldParam clover_param;
472  clover_param.nDim = 4;
473  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
474  clover_param.precision = inv_param->clover_cuda_prec;
475  clover_param.pad = inv_param->cl_pad;
476 
477  profileClover[QUDA_PROFILE_H2D].Start();
478 
479  cloverPrecise = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec,
480  inv_param->clover_order, clover_param);
481  inv_param->cloverGiB = cloverPrecise->GBytes();
482 
483  // create the mirror sloppy clover field
484  if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
485  clover_param.precision = inv_param->clover_cuda_prec_sloppy;
486  cloverSloppy = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec,
487  inv_param->clover_order, clover_param);
488  inv_param->cloverGiB += cloverSloppy->GBytes();
489  } else {
491  }
492 
493  // create the mirror preconditioner clover field
494  if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition &&
496  clover_param.precision = inv_param->clover_cuda_prec_precondition;
497  cloverPrecondition = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec,
498  inv_param->clover_order, clover_param);
499  inv_param->cloverGiB += cloverPrecondition->GBytes();
500  } else {
502  }
503  profileClover[QUDA_PROFILE_H2D].Stop();
504 
505  popVerbosity();
506 
507  profileClover[QUDA_PROFILE_TOTAL].Stop();
508 }
509 
510 void freeGaugeQuda(void)
511 {
512  if (!initialized) errorQuda("QUDA not initialized");
515  if (gaugePrecise) delete gaugePrecise;
516 
517  gaugePrecondition = NULL;
518  gaugeSloppy = NULL;
519  gaugePrecise = NULL;
520 
524 
525  gaugeLongPrecondition = NULL;
526  gaugeLongSloppy = NULL;
527  gaugeLongPrecise = NULL;
528 
531  if (gaugeFatPrecise) delete gaugeFatPrecise;
532 
533  gaugeFatPrecondition = NULL;
534  gaugeFatSloppy = NULL;
535  gaugeFatPrecise = NULL;
536 }
537 
538 
539 void freeCloverQuda(void)
540 {
541  if (!initialized) errorQuda("QUDA not initialized");
544  if (cloverPrecise) delete cloverPrecise;
545 
546  cloverPrecondition = NULL;
547  cloverSloppy = NULL;
548  cloverPrecise = NULL;
549 }
550 
551 
552 void endQuda(void)
553 {
554  profileEnd[QUDA_PROFILE_TOTAL].Start();
555 
556  if (!initialized) return;
557 
563  freeGaugeQuda();
564  freeCloverQuda();
565 
566  endBlas();
567 
568  if (streams) {
569  for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
570  delete []streams;
571  streams = NULL;
572  }
574 
576 
577  // end this CUDA context
578  cudaDeviceReset();
579 
580  initialized = false;
581 
582  comm_finalize();
583  comms_initialized = false;
584 
585  profileEnd[QUDA_PROFILE_TOTAL].Stop();
586 
587  // print out the profile information of the lifetime of the library
588  if (getVerbosity() >= QUDA_SUMMARIZE) {
589  profileInit.Print();
590  profileGauge.Print();
591  profileClover.Print();
592  profileInvert.Print();
593  profileMulti.Print();
594  profileMultiMixed.Print();
595  profileEnd.Print();
596 
597  printfQuda("\n");
599  printfQuda("\n");
600  }
601 
603 }
604 
605 
606 namespace quda {
607 
608  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
609  {
610  double kappa = inv_param->kappa;
611  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
612  kappa *= gaugePrecise->Anisotropy();
613  }
614 
615  switch (inv_param->dslash_type) {
616  case QUDA_WILSON_DSLASH:
617  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
618  break;
620  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
621  break;
624  diracParam.Ls = inv_param->Ls;
625  break;
626  case QUDA_ASQTAD_DSLASH:
627  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
628  break;
631  if (inv_param->twist_flavor == QUDA_TWIST_MINUS || inv_param->twist_flavor == QUDA_TWIST_PLUS)
632  {
633  diracParam.Ls = 1;
634  diracParam.epsilon = 0.0;
635  }
636  else
637  {
638  diracParam.Ls = 2;
639  diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
640  }
641  break;
642  default:
643  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
644  }
645 
646  diracParam.matpcType = inv_param->matpc_type;
647  diracParam.dagger = inv_param->dagger;
648  diracParam.gauge = gaugePrecise;
649  diracParam.fatGauge = gaugeFatPrecise;
650  diracParam.longGauge = gaugeLongPrecise;
651  diracParam.clover = cloverPrecise;
652  diracParam.kappa = kappa;
653  diracParam.mass = inv_param->mass;
654  diracParam.m5 = inv_param->m5;
655  diracParam.mu = inv_param->mu;
656  diracParam.verbose = getVerbosity();
657 
658  for (int i=0; i<4; i++) {
659  diracParam.commDim[i] = 1; // comms are always on
660  }
661  }
662 
663 
664  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
665  {
666  setDiracParam(diracParam, inv_param, pc);
667 
668  diracParam.gauge = gaugeSloppy;
669  diracParam.fatGauge = gaugeFatSloppy;
670  diracParam.longGauge = gaugeLongSloppy;
671  diracParam.clover = cloverSloppy;
672 
673  for (int i=0; i<4; i++) {
674  diracParam.commDim[i] = 1; // comms are always on
675  }
676 
677  }
678 
679  // The preconditioner currently mimicks the sloppy operator with no comms
680  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
681  {
682  setDiracParam(diracParam, inv_param, pc);
683 
684  diracParam.gauge = gaugePrecondition;
685  diracParam.fatGauge = gaugeFatPrecondition;
686  diracParam.longGauge = gaugeLongPrecondition;
687  diracParam.clover = cloverPrecondition;
688 
689  for (int i=0; i<4; i++) {
690  diracParam.commDim[i] = 0; // comms are always off
691  }
692 
693  }
694 
695  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
696  {
697  DiracParam diracParam;
698  DiracParam diracSloppyParam;
699  DiracParam diracPreParam;
700 
701  setDiracParam(diracParam, &param, pc_solve);
702  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
703  setDiracPreParam(diracPreParam, &param, pc_solve);
704 
705  d = Dirac::create(diracParam); // create the Dirac operator
706  dSloppy = Dirac::create(diracSloppyParam);
707  dPre = Dirac::create(diracPreParam);
708  }
709 
711  QudaMassNormalization mass_normalization, cudaColorSpinorField &b)
712  {
713  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
714  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
715  printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization);
716  double nin = norm2(b);
717  printfQuda("Mass rescale: norm of source in = %g\n", nin);
718  }
719 
720  if (dslash_type == QUDA_ASQTAD_DSLASH) {
721  if (mass_normalization != QUDA_MASS_NORMALIZATION) {
722  errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
723  }
724  return;
725  }
726 
727  // multiply the source to compensate for normalization of the Dirac operator, if necessary
728  switch (solution_type) {
729  case QUDA_MAT_SOLUTION:
730  if (mass_normalization == QUDA_MASS_NORMALIZATION ||
731  mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
732  axCuda(2.0*kappa, b);
733  }
734  break;
736  if (mass_normalization == QUDA_MASS_NORMALIZATION ||
737  mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
738  axCuda(4.0*kappa*kappa, b);
739  }
740  break;
741  case QUDA_MATPC_SOLUTION:
742  if (mass_normalization == QUDA_MASS_NORMALIZATION) {
743  axCuda(4.0*kappa*kappa, b);
744  } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
745  axCuda(2.0*kappa, b);
746  }
747  break;
749  if (mass_normalization == QUDA_MASS_NORMALIZATION) {
750  axCuda(16.0*pow(kappa,4), b);
751  } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
752  axCuda(4.0*kappa*kappa, b);
753  }
754  break;
755  default:
756  errorQuda("Solution type %d not supported", solution_type);
757  }
758 
759  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
760  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
761  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
762  printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization);
763  double nin = norm2(b);
764  printfQuda("Mass rescale: norm of source out = %g\n", nin);
765  }
766 
767  }
768 
770  QudaMassNormalization mass_normalization, double &coeff)
771  {
772  if (dslash_type == QUDA_ASQTAD_DSLASH) {
773  if (mass_normalization != QUDA_MASS_NORMALIZATION) {
774  errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
775  }
776  return;
777  }
778 
779  // multiply the source to compensate for normalization of the Dirac operator, if necessary
780  switch (solution_type) {
781  case QUDA_MAT_SOLUTION:
782  if (mass_normalization == QUDA_MASS_NORMALIZATION ||
783  mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
784  coeff *= 2.0*kappa;
785  }
786  break;
788  if (mass_normalization == QUDA_MASS_NORMALIZATION ||
789  mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
790  coeff *= 4.0*kappa*kappa;
791  }
792  break;
793  case QUDA_MATPC_SOLUTION:
794  if (mass_normalization == QUDA_MASS_NORMALIZATION) {
795  coeff *= 4.0*kappa*kappa;
796  } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
797  coeff *= 2.0*kappa;
798  }
799  break;
801  if (mass_normalization == QUDA_MASS_NORMALIZATION) {
802  coeff*=16.0*pow(kappa,4);
803  } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
804  coeff*=4.0*kappa*kappa;
805  }
806  break;
807  default:
808  errorQuda("Solution type %d not supported", solution_type);
809  }
810 
811  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
812  }
813 }
814 
815 /*void QUDA_DiracField(QUDA_DiracParam *param) {
816 
817  }*/
818 
819 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
820 {
822 
823  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
824  if (cloverPrecise == NULL && inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH)
825  errorQuda("Clover field not allocated");
826 
827  pushVerbosity(inv_param->verbosity);
829 
830  ColorSpinorParam cpuParam(h_in, inv_param->input_location, *inv_param, gaugePrecise->X(), 1);
831 
833  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
834  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
835 
836  ColorSpinorParam cudaParam(cpuParam, *inv_param);
837  cudaColorSpinorField in(*in_h, cudaParam);
838 
839  if (getVerbosity() >= QUDA_VERBOSE) {
840  double cpu = norm2(*in_h);
841  double gpu = norm2(in);
842  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
843  }
844 
845  cudaParam.create = QUDA_NULL_FIELD_CREATE;
846  cudaColorSpinorField out(in, cudaParam);
847 
848  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
849  if (parity == QUDA_EVEN_PARITY) {
850  parity = QUDA_ODD_PARITY;
851  } else {
852  parity = QUDA_EVEN_PARITY;
853  }
855  }
856  bool pc = true;
857 
858  DiracParam diracParam;
859  setDiracParam(diracParam, inv_param, pc);
860 
861  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
862  dirac->Dslash(out, in, parity); // apply the operator
863  delete dirac; // clean up
864 
865  cpuParam.v = h_out;
866 
867  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
868  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
869  *out_h = out;
870 
871  if (getVerbosity() >= QUDA_VERBOSE) {
872  double cpu = norm2(*out_h);
873  double gpu = norm2(out);
874  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
875  }
876 
877  delete out_h;
878  delete in_h;
879 
880  popVerbosity();
881 }
882 
883 
884 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
885 {
886  pushVerbosity(inv_param->verbosity);
887 
889  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
890  if (cloverPrecise == NULL && inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH)
891  errorQuda("Clover field not allocated");
893 
894  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
896 
897  ColorSpinorParam cpuParam(h_in, inv_param->input_location, *inv_param, gaugePrecise->X(), pc);
899  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
900 
901  ColorSpinorParam cudaParam(cpuParam, *inv_param);
902  cudaColorSpinorField in(*in_h, cudaParam);
903 
904  if (getVerbosity() >= QUDA_VERBOSE) {
905  double cpu = norm2(*in_h);
906  double gpu = norm2(in);
907  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
908  }
909 
910  cudaParam.create = QUDA_NULL_FIELD_CREATE;
911  cudaColorSpinorField out(in, cudaParam);
912 
913  DiracParam diracParam;
914  setDiracParam(diracParam, inv_param, pc);
915 
916  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
917  dirac->M(out, in); // apply the operator
918  delete dirac; // clean up
919 
920  double kappa = inv_param->kappa;
921  if (pc) {
922  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
923  axCuda(0.25/(kappa*kappa), out);
924  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
925  axCuda(0.5/kappa, out);
926  }
927  } else {
928  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
930  axCuda(0.5/kappa, out);
931  }
932  }
933 
934  cpuParam.v = h_out;
935 
936  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
937  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
938  *out_h = out;
939 
940  if (getVerbosity() >= QUDA_VERBOSE) {
941  double cpu = norm2(*out_h);
942  double gpu = norm2(out);
943  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
944  }
945 
946  delete out_h;
947  delete in_h;
948 
949  popVerbosity();
950 }
951 
952 
953 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
954 {
955  pushVerbosity(inv_param->verbosity);
956 
957  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);
958  if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) setKernelPackT(true);
959 
960  if (!initialized) errorQuda("QUDA not initialized");
961  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
962  if (cloverPrecise == NULL && inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH)
963  errorQuda("Clover field not allocated");
965 
966  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
968 
969  ColorSpinorParam cpuParam(h_in, inv_param->input_location, *inv_param, gaugePrecise->X(), pc);
971  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
972 
973  ColorSpinorParam cudaParam(cpuParam, *inv_param);
974  cudaColorSpinorField in(*in_h, cudaParam);
975 
976  if (getVerbosity() >= QUDA_VERBOSE) {
977  double cpu = norm2(*in_h);
978  double gpu = norm2(in);
979  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
980  }
981 
982  cudaParam.create = QUDA_NULL_FIELD_CREATE;
983  cudaColorSpinorField out(in, cudaParam);
984 
985  // double kappa = inv_param->kappa;
986  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
987 
988  DiracParam diracParam;
989  setDiracParam(diracParam, inv_param, pc);
990 
991  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
992  dirac->MdagM(out, in); // apply the operator
993  delete dirac; // clean up
994 
995  double kappa = inv_param->kappa;
996  if (pc) {
997  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
998  axCuda(1.0/pow(2.0*kappa,4), out);
999  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1000  axCuda(0.25/(kappa*kappa), out);
1001  }
1002  } else {
1003  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
1005  axCuda(0.25/(kappa*kappa), out);
1006  }
1007  }
1008 
1009  cpuParam.v = h_out;
1010 
1011  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1012  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1013  *out_h = out;
1014 
1015  if (getVerbosity() >= QUDA_VERBOSE) {
1016  double cpu = norm2(*out_h);
1017  double gpu = norm2(out);
1018  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1019  }
1020 
1021  delete out_h;
1022  delete in_h;
1023 
1024  popVerbosity();
1025 }
1026 
1029  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
1030  if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");
1031  if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");
1032  if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");
1033  cudaGauge = gaugePrecise;
1034  } else {
1035  if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");
1036  if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");
1037  if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");
1038 
1039  if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");
1040  if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");
1041  if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");
1042  cudaGauge = gaugeFatPrecise;
1043  }
1044  return cudaGauge;
1045 }
1046 
1047 
1048 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
1049 {
1050  pushVerbosity(inv_param->verbosity);
1051 
1052  if (!initialized) errorQuda("QUDA not initialized");
1053  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1054  if (cloverPrecise == NULL) errorQuda("Clover field not allocated");
1055 
1057 
1058  if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH)
1059  errorQuda("Cannot apply the clover term for a non Wilson-clover dslash");
1060 
1061  ColorSpinorParam cpuParam(h_in, inv_param->input_location, *inv_param, gaugePrecise->X(), 1);
1062 
1063  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1064  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1065  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1066 
1067  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1068  cudaColorSpinorField in(*in_h, cudaParam);
1069 
1070  if (getVerbosity() >= QUDA_VERBOSE) {
1071  double cpu = norm2(*in_h);
1072  double gpu = norm2(in);
1073  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1074  }
1075 
1076  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1077  cudaColorSpinorField out(in, cudaParam);
1078 
1079  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1080  if (parity == QUDA_EVEN_PARITY) {
1081  parity = QUDA_ODD_PARITY;
1082  } else {
1083  parity = QUDA_EVEN_PARITY;
1084  }
1086  }
1087  bool pc = true;
1088 
1089  DiracParam diracParam;
1090  setDiracParam(diracParam, inv_param, pc);
1091 
1092  DiracCloverPC dirac(diracParam); // create the Dirac operator
1093  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
1094  else dirac.CloverInv(out, in, parity);
1095 
1096  cpuParam.v = h_out;
1097 
1098  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1099  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1100  *out_h = out;
1101 
1102  if (getVerbosity() >= QUDA_VERBOSE) {
1103  double cpu = norm2(*out_h);
1104  double gpu = norm2(out);
1105  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1106  }
1107 
1108  /*for (int i=0; i<in_h->Volume(); i++) {
1109  ((cpuColorSpinorField*)out_h)->PrintVector(i);
1110  }*/
1111 
1112  delete out_h;
1113  delete in_h;
1114 
1115  popVerbosity();
1116 }
1117 
1118 
1119 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
1120 {
1121 
1123 
1124  profileInvert[QUDA_PROFILE_TOTAL].Start();
1125 
1126  if (!initialized) errorQuda("QUDA not initialized");
1127 
1128  pushVerbosity(param->verbosity);
1130 
1131  // check the gauge fields have been created
1133 
1134  checkInvertParam(param);
1135 
1136  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
1137  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
1138  // for now, though, so here we factorize everything for convenience.
1139 
1140  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
1141  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
1142  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
1143  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
1144 
1145  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
1146  if (!pc_solve) param->spinorGiB *= 2;
1147  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
1148  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
1149  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
1150  } else {
1151  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
1152  }
1153 
1154  param->secs = 0;
1155  param->gflops = 0;
1156  param->iter = 0;
1157 
1158  Dirac *d = NULL;
1159  Dirac *dSloppy = NULL;
1160  Dirac *dPre = NULL;
1161 
1162  // create the dirac operator
1163  createDirac(d, dSloppy, dPre, *param, pc_solve);
1164 
1165  Dirac &dirac = *d;
1166  Dirac &diracSloppy = *dSloppy;
1167  Dirac &diracPre = *dPre;
1168 
1169  profileInvert[QUDA_PROFILE_H2D].Start();
1170 
1171  cudaColorSpinorField *b = NULL;
1172  cudaColorSpinorField *x = NULL;
1173  cudaColorSpinorField *in = NULL;
1174  cudaColorSpinorField *out = NULL;
1175 
1176  const int *X = cudaGauge->X();
1177 
1178  // wrap CPU host side pointers
1179  ColorSpinorParam cpuParam(hp_b, param->input_location, *param, X, pc_solution);
1181  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1182 
1183  cpuParam.v = hp_x;
1185  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) : static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1186 
1187  // download source
1188  ColorSpinorParam cudaParam(cpuParam, *param);
1189  cudaParam.create = QUDA_COPY_FIELD_CREATE;
1190  b = new cudaColorSpinorField(*h_b, cudaParam);
1191 
1192  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
1193  // initial guess only supported for single-pass solvers
1195  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
1196  errorQuda("Initial guess not supported for two-pass solver");
1197  }
1198 
1199  x = new cudaColorSpinorField(*h_x, cudaParam); // solution
1200  } else { // zero initial guess
1201  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1202  x = new cudaColorSpinorField(cudaParam); // solution
1203  }
1204 
1205  profileInvert[QUDA_PROFILE_H2D].Stop();
1206 
1207  if (getVerbosity() >= QUDA_VERBOSE) {
1208  double nh_b = norm2(*h_b);
1209  double nb = norm2(*b);
1210  double nh_x = norm2(*h_x);
1211  double nx = norm2(*x);
1212  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
1213  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
1214  }
1215 
1216  setDslashTuning(param->tune, getVerbosity());
1217  setBlasTuning(param->tune, getVerbosity());
1218 
1219  dirac.prepare(in, out, *x, *b, param->solution_type);
1220  if (getVerbosity() >= QUDA_VERBOSE) {
1221  double nin = norm2(*in);
1222  double nout = norm2(*out);
1223  printfQuda("Prepared source = %g\n", nin);
1224  printfQuda("Prepared solution = %g\n", nout);
1225  }
1226 
1227  massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *in);
1228 
1229  if (getVerbosity() >= QUDA_VERBOSE) {
1230  double nin = norm2(*in);
1231  printfQuda("Prepared source post mass rescale = %g\n", nin);
1232  }
1233 
1234  // solution_type specifies *what* system is to be solved.
1235  // solve_type specifies *how* the system is to be solved.
1236  //
1237  // We have the following four cases (plus preconditioned variants):
1238  //
1239  // solution_type solve_type Effect
1240  // ------------- ---------- ------
1241  // MAT DIRECT Solve Ax=b
1242  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
1243  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
1244  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
1245  //
1246  // We generally require that the solution_type and solve_type
1247  // preconditioning match. As an exception, the unpreconditioned MAT
1248  // solution_type may be used with any solve_type, including
1249  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
1250  // preconditioned source and reconstruction of the full solution are
1251  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
1252  // respectively.
1253 
1254  if (pc_solution && !pc_solve) {
1255  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
1256  }
1257 
1258  if (!mat_solution && !pc_solution && pc_solve) {
1259  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
1260  }
1261 
1262  if (mat_solution && !direct_solve) { // prepare source: b' = A^dag b
1264  dirac.Mdag(*in, tmp);
1265  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
1266  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1267  Solver *solve = Solver::create(*param, m, mSloppy, mPre, profileInvert);
1268  (*solve)(*out, *in);
1269  copyCuda(*in, *out);
1270  delete solve;
1271  }
1272 
1273  if (direct_solve) {
1274  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1275  Solver *solve = Solver::create(*param, m, mSloppy, mPre, profileInvert);
1276  (*solve)(*out, *in);
1277  delete solve;
1278  } else {
1279  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1280  Solver *solve = Solver::create(*param, m, mSloppy, mPre, profileInvert);
1281  (*solve)(*out, *in);
1282  delete solve;
1283  }
1284 
1285  if (getVerbosity() >= QUDA_VERBOSE){
1286  double nx = norm2(*x);
1287  printfQuda("Solution = %g\n",nx);
1288  }
1289  dirac.reconstruct(*x, *b, param->solution_type);
1290 
1291  profileInvert[QUDA_PROFILE_D2H].Start();
1292  *h_x = *x;
1293  profileInvert[QUDA_PROFILE_D2H].Stop();
1294 
1295  if (getVerbosity() >= QUDA_VERBOSE){
1296  double nx = norm2(*x);
1297  double nh_x = norm2(*h_x);
1298  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
1299  }
1300 
1301  delete h_b;
1302  delete h_x;
1303  delete b;
1304  delete x;
1305 
1306  delete d;
1307  delete dSloppy;
1308  delete dPre;
1309 
1310  popVerbosity();
1311 
1312  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
1314 
1315  profileInvert[QUDA_PROFILE_TOTAL].Stop();
1316 }
1317 
1318 
1327 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
1328 {
1329  profileMulti[QUDA_PROFILE_TOTAL].Start();
1330 
1332 
1333  if (!initialized) errorQuda("QUDA not initialized");
1334  // check the gauge fields have been created
1336  checkInvertParam(param);
1337 
1338  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
1339  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
1341 
1342  pushVerbosity(param->verbosity);
1343 
1344  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
1345  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
1346  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
1347  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
1348 
1349  if (mat_solution) {
1350  errorQuda("Multi-shift solver does not support MAT or MATPC solution types");
1351  }
1352  if (direct_solve) {
1353  errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
1354  }
1355  if (pc_solution & !pc_solve) {
1356  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
1357  }
1358  if (!pc_solution & pc_solve) {
1359  errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
1360  }
1361 
1362  // No of GiB in a checkerboard of a spinor
1363  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
1364  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
1365 
1366  // **** WARNING *** this may not match implementation...
1367  if( param->inv_type == QUDA_CG_INVERTER ) {
1368  // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
1369  param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
1370  } else {
1371  errorQuda("QUDA only currently supports multi-shift CG");
1372  // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
1373  // (Jegerlehner hep-lat/9612014 eq (3.13)
1374  param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
1375  }
1376 
1377  // Timing and FLOP counters
1378  param->secs = 0;
1379  param->gflops = 0;
1380  param->iter = 0;
1381 
1382  for (int i=0; i<param->num_offset-1; i++) {
1383  for (int j=i+1; j<param->num_offset; j++) {
1384  if (param->offset[i] > param->offset[j])
1385  errorQuda("Offsets must be ordered from smallest to largest");
1386  }
1387  }
1388 
1389  // Host pointers for x, take a copy of the input host pointers
1390  void** hp_x;
1391  hp_x = new void* [ param->num_offset ];
1392 
1393  void* hp_b = _hp_b;
1394  for(int i=0;i < param->num_offset;i++){
1395  hp_x[i] = _hp_x[i];
1396  }
1397 
1398  // Create the matrix.
1399  // The way this works is that createDirac will create 'd' and 'dSloppy'
1400  // which are global. We then grab these with references...
1401  //
1402  // Balint: Isn't there a nice construction pattern we could use here? This is
1403  // expedient but yucky.
1404  // DiracParam diracParam;
1405  if (param->dslash_type == QUDA_ASQTAD_DSLASH){
1406  param->mass = sqrt(param->offset[0]/4);
1407  }
1408 
1409  Dirac *d = NULL;
1410  Dirac *dSloppy = NULL;
1411  Dirac *dPre = NULL;
1412 
1413  // create the dirac operator
1414  createDirac(d, dSloppy, dPre, *param, pc_solve);
1415  Dirac &dirac = *d;
1416  Dirac &diracSloppy = *dSloppy;
1417 
1418  cpuColorSpinorField *h_b = NULL; // Host RHS
1419  cpuColorSpinorField **h_x = NULL;
1420  cudaColorSpinorField *b = NULL; // Cuda RHS
1421  cudaColorSpinorField **x = NULL; // Cuda Solutions
1422 
1423  // Grab the dimension array of the input gauge field.
1424  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
1425  gaugeFatPrecise->X() : gaugePrecise->X();
1426 
1427  // Wrap CPU host side pointers
1428  //
1429  // Balint: This creates a ColorSpinorParam struct, from the host data pointer,
1430  // the definitions in param, the dimensions X, and whether the solution is on
1431  // a checkerboard instruction or not. These can then be used as 'instructions'
1432  // to create the actual colorSpinorField
1433  ColorSpinorParam cpuParam(hp_b, QUDA_CPU_FIELD_LOCATION, *param, X, pc_solution);
1434  h_b = new cpuColorSpinorField(cpuParam);
1435 
1436  h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
1437  for(int i=0; i < param->num_offset; i++) {
1438  cpuParam.v = hp_x[i];
1439  h_x[i] = new cpuColorSpinorField(cpuParam);
1440  }
1441 
1442 
1443  profileMulti[QUDA_PROFILE_H2D].Start();
1444  // Now I need a colorSpinorParam for the device
1445  ColorSpinorParam cudaParam(cpuParam, *param);
1446  // This setting will download a host vector
1447  cudaParam.create = QUDA_COPY_FIELD_CREATE;
1448  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
1449  profileMulti[QUDA_PROFILE_H2D].Stop();
1450 
1451  // Create the solution fields filled with zero
1452  x = new cudaColorSpinorField* [ param->num_offset ];
1453  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1454  for(int i=0; i < param->num_offset; i++) {
1455  x[i] = new cudaColorSpinorField(cudaParam);
1456  }
1457 
1458  // Check source norms
1459  if(getVerbosity() >= QUDA_VERBOSE ) {
1460  double nh_b = norm2(*h_b);
1461  double nb = norm2(*b);
1462  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
1463  }
1464 
1465  setDslashTuning(param->tune, getVerbosity());
1466  setBlasTuning(param->tune, getVerbosity());
1467 
1468  massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *b);
1469  double *unscaled_shifts = new double [param->num_offset];
1470  for(int i=0; i < param->num_offset; i++){
1471  unscaled_shifts[i] = param->offset[i];
1472  massRescaleCoeff(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, param->offset[i]);
1473  }
1474 
1475  // use multi-shift CG
1476  {
1477  DiracMdagM m(dirac), mSloppy(diracSloppy);
1478  MultiShiftCG cg_m(m, mSloppy, *param, profileMulti);
1479  cg_m(x, *b);
1480  }
1481 
1482  // experimenting with Minimum residual extrapolation
1483  /*
1484  cudaColorSpinorField **q = new cudaColorSpinorField* [ param->num_offset ];
1485  cudaColorSpinorField **z = new cudaColorSpinorField* [ param->num_offset ];
1486  cudaColorSpinorField tmp(cudaParam);
1487 
1488  for(int i=0; i < param->num_offset; i++) {
1489  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1490  q[i] = new cudaColorSpinorField(cudaParam);
1491  cudaParam.create = QUDA_COPY_FIELD_CREATE;
1492  z[i] = new cudaColorSpinorField(*x[i], cudaParam);
1493  }
1494 
1495  for(int i=0; i < param->num_offset; i++) {
1496  dirac.setMass(sqrt(param->offset[i]/4));
1497  DiracMdagM m(dirac);
1498  MinResExt mre(m, profileMulti);
1499  copyCuda(tmp, *b);
1500  mre(*x[i], tmp, z, q, param -> num_offset);
1501  dirac.setMass(sqrt(param->offset[0]/4));
1502  }
1503 
1504  for(int i=0; i < param->num_offset; i++) {
1505  delete q[i];
1506  delete z[i];
1507  }
1508  delete []q;
1509  delete []z;
1510  */
1511 
1512  // check each shift has the desired tolerance and use sequential CG to refine
1513 
1514  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
1515  cudaColorSpinorField r(*b, cudaParam);
1516  for(int i=0; i < param->num_offset; i++) {
1517  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
1518  param->true_res_hq_offset[i] : 0;
1519 
1520  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
1521  param->tol_hq_offset[i] : 0;
1522 
1523  // refine if either L2 or heavy quark residual tolerances have not been met
1524  if (param->true_res_offset[i] > param->tol_offset[i] || rsd_hq > tol_hq) {
1525  if (getVerbosity() >= QUDA_VERBOSE)
1526  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
1527  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
1528 
1529  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
1530  if (param->dslash_type == QUDA_ASQTAD_DSLASH ) {
1531  dirac.setMass(sqrt(param->offset[i]/4));
1532  diracSloppy.setMass(sqrt(param->offset[i]/4));
1533  }
1534 
1535  DiracMdagM m(dirac), mSloppy(diracSloppy);
1536 
1537  // need to curry in the shift if we are not doing staggered
1538  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
1539  m.shift = param->offset[i];
1540  mSloppy.shift = param->offset[i];
1541  }
1542 
1544  param->tol = param->tol_offset[i]; // set L2 tolerance
1545  param->tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
1546  CG cg(m, mSloppy, *param, profileMulti);
1547  cg(*x[i], *b);
1548  param->true_res_offset[i] = param->true_res;
1549  param->true_res_hq_offset[i] = param->true_res_hq;
1550 
1551  if (param->dslash_type == QUDA_ASQTAD_DSLASH ) {
1552  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
1553  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
1554  }
1555  }
1556  }
1557 
1558  // restore shifts -- avoid side effects
1559  for(int i=0; i < param->num_offset; i++) {
1560  param->offset[i] = unscaled_shifts[i];
1561  }
1562 
1563  delete [] unscaled_shifts;
1564 
1565  profileMulti[QUDA_PROFILE_D2H].Start();
1566  for(int i=0; i < param->num_offset; i++) {
1567  if (getVerbosity() >= QUDA_VERBOSE){
1568  double nx = norm2(*x[i]);
1569  printfQuda("Solution %d = %g\n", i, nx);
1570  }
1571 
1572  *h_x[i] = *x[i];
1573  }
1574  profileMulti[QUDA_PROFILE_D2H].Stop();
1575 
1576  for(int i=0; i < param->num_offset; i++){
1577  delete h_x[i];
1578  delete x[i];
1579  }
1580 
1581  delete h_b;
1582  delete b;
1583 
1584  delete [] h_x;
1585  delete [] x;
1586 
1587  delete [] hp_x;
1588 
1589  delete d;
1590  delete dSloppy;
1591  delete dPre;
1592 
1593  popVerbosity();
1594 
1595  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
1597 
1598  profileMulti[QUDA_PROFILE_TOTAL].Stop();
1599 }
1600 
1601 
1602 #ifdef GPU_FATLINK
1603 /* @method
1604  * QUDA_COMPUTE_FAT_STANDARD: standard method (default)
1605  * QUDA_COMPUTE_FAT_EXTENDED_VOLUME, extended volume method
1606  *
1607  */
1608 #include <sys/time.h>
1609 
1611 {
1612  int* X = param->X;
1613 #ifdef MULTI_GPU
1614  int Vsh_x = X[1]*X[2]*X[3]/2;
1615  int Vsh_y = X[0]*X[2]*X[3]/2;
1616  int Vsh_z = X[0]*X[1]*X[3]/2;
1617 #endif
1618  int Vsh_t = X[0]*X[1]*X[2]/2;
1619 
1620  int E[4];
1621  for (int i=0; i<4; i++) E[i] = X[i] + 4;
1622 
1623  // fat-link padding
1624  param->llfat_ga_pad = Vsh_t;
1625 
1626  // site-link padding
1627  if(method == QUDA_COMPUTE_FAT_STANDARD) {
1628 #ifdef MULTI_GPU
1629  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
1630  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
1631  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
1632  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
1633  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
1634  param->site_ga_pad = 3*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
1635 #else
1636  param->site_ga_pad = Vsh_t;
1637 #endif
1638  } else {
1639  param->site_ga_pad = (E[0]*E[1]*E[2]/2)*3;
1640  }
1641  param->ga_pad = param->site_ga_pad;
1642 
1643  // staple padding
1644  if(method == QUDA_COMPUTE_FAT_STANDARD) {
1645 #ifdef MULTI_GPU
1646  param->staple_pad = 3*(Vsh_x + Vsh_y + Vsh_z+ Vsh_t);
1647 #else
1648  param->staple_pad = 3*Vsh_t;
1649 #endif
1650  } else {
1651  param->staple_pad = (E[0]*E[1]*E[2]/2)*3;
1652  }
1653 
1654  return;
1655 }
1656 
1657 
1658 namespace quda {
1659  void computeFatLinkCore(cudaGaugeField* cudaSiteLink, double* act_path_coeff,
1660  QudaGaugeParam* qudaGaugeParam, QudaComputeFatMethod method,
1661  cudaGaugeField* cudaFatLink, struct timeval time_array[])
1662  {
1663  gettimeofday(&time_array[0], NULL);
1664 
1665  const int flag = qudaGaugeParam->preserve_gauge;
1666  GaugeFieldParam gParam(0,*qudaGaugeParam);
1667 
1668  if(method == QUDA_COMPUTE_FAT_STANDARD){
1669  for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir];
1670  }else{
1671  for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir] + 4;
1672  }
1673 
1674 
1675  static cudaGaugeField* cudaStapleField=NULL, *cudaStapleField1=NULL;
1676  if(cudaStapleField == NULL || cudaStapleField1 == NULL){
1677  gParam.pad = qudaGaugeParam->staple_pad;
1680  gParam.geometry = QUDA_SCALAR_GEOMETRY; // only require a scalar matrix field for the staple
1681  cudaStapleField = new cudaGaugeField(gParam);
1682  cudaStapleField1 = new cudaGaugeField(gParam);
1683  }
1684 
1685  gettimeofday(&time_array[1], NULL);
1686 
1687  if(method == QUDA_COMPUTE_FAT_STANDARD){
1688  llfat_cuda(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
1689  }else{ //method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME
1690  llfat_cuda_ex(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
1691  }
1692  gettimeofday(&time_array[2], NULL);
1693 
1694  if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){
1695  delete cudaStapleField; cudaStapleField = NULL;
1696  delete cudaStapleField1; cudaStapleField1 = NULL;
1697  }
1698  gettimeofday(&time_array[3], NULL);
1699 
1700  return;
1701 
1702  }
1703 } // namespace quda
1704 
1705 
1706 
1707 int
1708 computeFatLinkQuda(void* fatlink, void** sitelink, double* act_path_coeff,
1709  QudaGaugeParam* qudaGaugeParam,
1710  QudaComputeFatMethod method)
1711 {
1712 #define TDIFF_MS(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))*1000
1713 
1714  struct timeval t0;
1715  struct timeval t7, t8, t9, t10, t11;
1716 
1717  gettimeofday(&t0, NULL);
1718 
1719  static cpuGaugeField* cpuFatLink=NULL, *cpuSiteLink=NULL;
1720  static cudaGaugeField* cudaFatLink=NULL, *cudaSiteLink=NULL;
1721  int flag = qudaGaugeParam->preserve_gauge;
1722 
1723  QudaGaugeParam qudaGaugeParam_ex_buf;
1724  QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf;
1725  memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam));
1726 
1727  qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0]+4;
1728  qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1]+4;
1729  qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2]+4;
1730  qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3]+4;
1731 
1732  GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex);
1733 
1734  // fat-link padding
1735  setFatLinkPadding(method, qudaGaugeParam);
1736  qudaGaugeParam_ex->llfat_ga_pad = qudaGaugeParam->llfat_ga_pad;
1737  qudaGaugeParam_ex->staple_pad = qudaGaugeParam->staple_pad;
1738  qudaGaugeParam_ex->site_ga_pad = qudaGaugeParam->site_ga_pad;
1739 
1740  GaugeFieldParam gParam(0, *qudaGaugeParam);
1741 
1742  // create the host fatlink
1743  if(cpuFatLink == NULL){
1747  gParam.gauge= fatlink;
1748  cpuFatLink = new cpuGaugeField(gParam);
1749  if(cpuFatLink == NULL){
1750  errorQuda("ERROR: Creating cpuFatLink failed\n");
1751  }
1752  }else{
1753  cpuFatLink->setGauge((void**)fatlink);
1754  }
1755 
1756  // create the device fatlink
1757  if(cudaFatLink == NULL){
1758  gParam.pad = qudaGaugeParam->llfat_ga_pad;
1763  cudaFatLink = new cudaGaugeField(gParam);
1764  }
1765 
1766 
1767 
1768  // create the host sitelink
1769  if(cpuSiteLink == NULL){
1770  gParam.pad = 0;
1772  gParam.link_type = qudaGaugeParam->type;
1773  gParam.order = qudaGaugeParam->gauge_order;
1774  gParam.gauge = sitelink;
1775  if(method != QUDA_COMPUTE_FAT_STANDARD){
1776  for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam_ex->X[dir];
1777  }
1778  cpuSiteLink = new cpuGaugeField(gParam);
1779  if(cpuSiteLink == NULL){
1780  errorQuda("ERROR: Creating cpuSiteLink failed\n");
1781  }
1782  }else{
1783  cpuSiteLink->setGauge(sitelink);
1784  }
1785 
1786  if(cudaSiteLink == NULL){
1787  gParam.pad = qudaGaugeParam->site_ga_pad;
1789  gParam.link_type = qudaGaugeParam->type;
1790  gParam.reconstruct = qudaGaugeParam->reconstruct;
1791  cudaSiteLink = new cudaGaugeField(gParam);
1792  }
1793 
1794  initLatticeConstants(*cudaFatLink);
1795 
1796  if(method == QUDA_COMPUTE_FAT_STANDARD){
1797  llfat_init_cuda(qudaGaugeParam);
1798  gettimeofday(&t7, NULL);
1799 
1800 #ifdef MULTI_GPU
1801  if(qudaGaugeParam->gauge_order == QUDA_MILC_GAUGE_ORDER){
1802  errorQuda("Only QDP-ordered site links are supported in the multi-gpu standard fattening code\n");
1803  }
1804 #endif
1805  loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);
1806 
1807  gettimeofday(&t8, NULL);
1808  }else{
1809  llfat_init_cuda_ex(qudaGaugeParam_ex);
1810 #ifdef MULTI_GPU
1811  int R[4] = {2, 2, 2, 2}; // radius of the extended region in each dimension / direction
1812  exchange_cpu_sitelink_ex(qudaGaugeParam->X, R, (void**)cpuSiteLink->Gauge_p(),
1813  cpuSiteLink->Order(),qudaGaugeParam->cpu_prec, 0);
1814 #endif
1815  gettimeofday(&t7, NULL);
1816  loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
1817  gettimeofday(&t8, NULL);
1818  }
1819 
1820  // time the subroutines in computeFatLinkCore
1821  struct timeval time_array[4];
1822 
1823  // Actually do the fattening
1824  computeFatLinkCore(cudaSiteLink, act_path_coeff, qudaGaugeParam, method, cudaFatLink, time_array);
1825 
1826 
1827  gettimeofday(&t9, NULL);
1828 
1829  storeLinkToCPU(cpuFatLink, cudaFatLink, qudaGaugeParam);
1830 
1831  gettimeofday(&t10, NULL);
1832 
1833  if (!(flag & QUDA_FAT_PRESERVE_CPU_GAUGE) ){
1834  delete cpuFatLink; cpuFatLink = NULL;
1835  delete cpuSiteLink; cpuSiteLink = NULL;
1836  }
1837  if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){
1838  delete cudaFatLink; cudaFatLink = NULL;
1839  delete cudaSiteLink; cudaSiteLink = NULL;
1840  }
1841 
1842  gettimeofday(&t11, NULL);
1843 #ifdef DSLASH_PROFILING
1844  printfQuda("total time: %f ms, init(cuda/cpu gauge field creation,etc)=%f ms,"
1845  " sitelink cpu->gpu=%f ms, computation in gpu =%f ms, fatlink gpu->cpu=%f ms\n",
1846  TDIFF_MS(t0, t11), TDIFF_MS(t0, t7) + TDIFF_MS(time_array[0],time_array[1]), TDIFF_MS(t7, t8), TDIFF_MS(time_array[1], time_array[2]), TDIFF_MS(t9,t10));
1847  printfQuda("finally cleanup =%f ms\n", TDIFF_MS(t10, t11) + TDIFF_MS(time_array[2],time_array[3]));
1848 #endif
1849 
1850  return 0;
1851 }
1852 #endif
1853 
1854 #ifdef GPU_GAUGE_FORCE
1855 int
1856 computeGaugeForceQuda(void* mom, void* sitelink, int*** input_path_buf, int* path_length,
1857  void* loop_coeff, int num_paths, int max_length, double eb3,
1858  QudaGaugeParam* qudaGaugeParam, double* timeinfo)
1859 {
1860  struct timeval t0, t1, t2, t3;
1861 
1862  gettimeofday(&t0,NULL);
1863 
1864 #ifdef MULTI_GPU
1865  int E[4];
1866  QudaGaugeParam qudaGaugeParam_ex_buf;
1867  QudaGaugeParam* qudaGaugeParam_ex=&qudaGaugeParam_ex_buf;
1868  memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam));
1869  E[0] = qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0] + 4;
1870  E[1] = qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1] + 4;
1871  E[2] = qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2] + 4;
1872  E[3] = qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3] + 4;
1873 #endif
1874 
1875  int* X = qudaGaugeParam->X;
1876  GaugeFieldParam gParam(0, *qudaGaugeParam);
1877 #ifdef MULTI_GPU
1878  GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex);
1879  GaugeFieldParam& gParamSL = gParam_ex;
1880  int pad = E[2]*E[1]*E[0]/2;
1881 #else
1882  GaugeFieldParam& gParamSL = gParam;
1883  int pad = X[2]*X[1]*X[0]/2;
1884 #endif
1885 
1886  GaugeFieldParam& gParamMom = gParam;
1887 
1889  gParamSL.gauge = sitelink;
1890  gParamSL.pad = 0;
1891  cpuGaugeField* cpuSiteLink = new cpuGaugeField(gParamSL);
1892 
1893  gParamSL.create =QUDA_NULL_FIELD_CREATE;
1894  gParamSL.pad = pad;
1895  gParamSL.precision = qudaGaugeParam->cuda_prec;
1896  gParamSL.reconstruct = qudaGaugeParam->reconstruct;
1897  cudaGaugeField* cudaSiteLink = new cudaGaugeField(gParamSL);
1898  qudaGaugeParam->site_ga_pad = gParamSL.pad;//need to record this value
1899 
1900  gParamMom.pad = 0;
1901  gParamMom.order =QUDA_MILC_GAUGE_ORDER;
1902  gParamMom.precision = qudaGaugeParam->cpu_prec;
1904  gParamMom.reconstruct =QUDA_RECONSTRUCT_10;
1905  gParamMom.gauge=mom;
1906  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
1907  cpuGaugeField* cpuMom = new cpuGaugeField(gParamMom);
1908 
1909 
1910  gParamMom.pad = pad;
1911  gParamMom.create =QUDA_NULL_FIELD_CREATE;
1912  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
1913  gParamMom.precision = qudaGaugeParam->cuda_prec;
1914  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
1915  cudaGaugeField* cudaMom = new cudaGaugeField(gParamMom);
1916  qudaGaugeParam->mom_ga_pad = gParamMom.pad; //need to record this value
1917 
1918  initLatticeConstants(*cudaMom);
1919  checkCudaError();
1920  gauge_force_init_cuda(qudaGaugeParam, max_length);
1921 
1922 #ifdef MULTI_GPU
1923  int R[4] = {2, 2, 2, 2}; // radius of the extended region in each dimension / direction
1924  exchange_cpu_sitelink_ex(qudaGaugeParam->X, R, (void**)cpuSiteLink->Gauge_p(),
1925  cpuSiteLink->Order(), qudaGaugeParam->cpu_prec, 1);
1926  loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
1927 #else
1928  loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);
1929 #endif
1930 
1931  cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
1932 
1933  gettimeofday(&t1,NULL);
1934 
1935  gauge_force_cuda(*cudaMom, eb3, *cudaSiteLink, qudaGaugeParam, input_path_buf,
1936  path_length, loop_coeff, num_paths, max_length);
1937 
1938  gettimeofday(&t2,NULL);
1939 
1940  cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
1941 
1942  delete cpuSiteLink;
1943  delete cpuMom;
1944 
1945  delete cudaSiteLink;
1946  delete cudaMom;
1947 
1948  gettimeofday(&t3,NULL);
1949 
1950  if(timeinfo){
1951  timeinfo[0] = TDIFF(t0, t1);
1952  timeinfo[1] = TDIFF(t1, t2);
1953  timeinfo[2] = TDIFF(t2, t3);
1954  }
1955 
1956  checkCudaError();
1957  return 0;
1958 }
1959 
1960 #endif
1961 
1962 
1963 /*
1964  The following functions are for the Fortran interface.
1965 */
1966 
1967 void init_quda_(int *dev) { initQuda(*dev); }
1968 void end_quda_() { endQuda(); }
1969 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
1971 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
1972 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
1974 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
1975  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
1976 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
1977  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
1978 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
1979 { MatQuda(h_out, h_in, inv_param); }
1980 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
1981 { MatDagMatQuda(h_out, h_in, inv_param); }
1982 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
1983 { invertQuda(hp_x, hp_b, param); }
1985  *param = newQudaGaugeParam();
1986 }
1988  *param = newQudaInvertParam();
1989 }
1990 
1991 
1995 static int bqcd_rank_from_coords(const int *coords, void *fdata)
1996 {
1997  int *dims = static_cast<int *>(fdata);
1998 
1999  int rank = coords[3];
2000  for (int i = 2; i >= 0; i--) {
2001  rank = dims[i] * rank + coords[i];
2002  }
2003  return rank;
2004 }
2005 
2006 void comm_set_gridsize_(int *grid)
2007 {
2008 #ifdef MULTI_GPU
2009  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
2010 #endif
2011 }