QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
interface_quda.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <string.h>
6 #include <sys/time.h>
7 
8 #include <quda.h>
9 #include <quda_fortran.h>
10 #include <quda_internal.h>
11 #include <comm_quda.h>
12 #include <tune_quda.h>
13 #include <blas_quda.h>
14 #include <gauge_field.h>
15 #include <dirac_quda.h>
16 #include <ritz_quda.h>
17 #include <dslash_quda.h>
18 #include <invert_quda.h>
19 #include <lanczos_quda.h>
20 #include <color_spinor_field.h>
21 #include <eig_variables.h>
22 #include <clover_field.h>
23 #include <llfat_quda.h>
24 #include <fat_force_quda.h>
25 #include <hisq_links_quda.h>
26 #include <algorithm>
27 #include <staggered_oprod.h>
28 #include <ks_improved_force.h>
29 #include <ks_force_quda.h>
30 
31 
32 #ifdef NUMA_AFFINITY
33 #include <numa_affinity.h>
34 #endif
35 
36 #include <cuda.h>
37 #include "face_quda.h"
38 
39 #ifdef MULTI_GPU
40 extern void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
41  QudaPrecision gPrecision, int optflag, int geom);
42 #endif // MULTI_GPU
43 
44 #include <ks_force_quda.h>
45 
46 #ifdef GPU_GAUGE_FORCE
47 #include <gauge_force_quda.h>
48 #endif
49 #include <gauge_update_quda.h>
50 
// Maximum of two values. NOTE: arguments are evaluated more than once —
// avoid side effects in a and b.
51 #define MAX(a,b) ((a)>(b)? (a):(b))
// Elapsed seconds between two struct timeval samples: a = start, b = end.
52 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
53 
54 #define spinorSiteSize 24 // real numbers per spinor
55 
// Upper bound used when sanity-checking a user-supplied device ordinal.
56 #define MAX_GPU_NUM_PER_NODE 16
57 
58 // define newQudaGaugeParam() and newQudaInvertParam()
59 #define INIT_PARAM
60 #include "check_params.h"
61 #undef INIT_PARAM
62 
63 // define (static) checkGaugeParam() and checkInvertParam()
64 #define CHECK_PARAM
65 #include "check_params.h"
66 #undef CHECK_PARAM
67 
68 // define printQudaGaugeParam() and printQudaInvertParam()
69 #define PRINT_PARAM
70 #include "check_params.h"
71 #undef PRINT_PARAM
72 
73 #include <gauge_tools.h>
74 
76 
77 using namespace quda;
78 
// Persistent staple scratch fields; deleted and reset to NULL in endQuda().
// (Presumably reused across link-fattening calls — usage not visible in this
// chunk, confirm against the rest of the file.)
79 static cudaGaugeField* cudaStapleField = NULL;
80 static cudaGaugeField* cudaStapleField1 = NULL;
81 
82 //for MAGMA lib:
83 #include <blas_magma.h>
84 
// Set by openMagma() on first call; consulted by closeMagma().
85 static bool InitMagma = false;
86 
// Initialize the MAGMA dense-linear-algebra library on the first call;
// subsequent calls only print a notice.
87 void openMagma(){
88 
89  if(!InitMagma){
// NOTE(review): the listing omits source line 90 here — presumably the actual
// BlasMagmaArgs::OpenMagma() call, mirroring CloseMagma() in closeMagma().
// Confirm against the original file.
91  InitMagma = true;
92  }
93  else printfQuda("\nMAGMA library was already initialized..\n");
94 
95  return;
96 }
97 
98 void closeMagma(){
99 
100  if(InitMagma) BlasMagmaArgs::CloseMagma();
101  else printfQuda("\nMAGMA library was not initialized..\n");
102 
103  return;
104 }
105 
110 
111 // It's important that these alias the above so that constants are set correctly in Dirac::Dirac()
116 
117 
122 
124 
128 
132 
135 
137 
// Properties of the CUDA device selected by initQudaDevice().
138 cudaDeviceProp deviceProp;
// Stream pool of Nstream streams, allocated in the memory-init routine and
// destroyed in endQuda().
139 cudaStream_t *streams;
140 #ifdef PTHREADS
141 pthread_mutex_t pthread_mutex;
142 #endif
143 
// Set once by initQudaDevice(); guards all interface entry points.
144 static bool initialized = false;
145 
// Per-entry-point timing profiles, printed at endQuda() when verbosity is
// at least QUDA_SUMMARIZE. (The numbering gaps in this listing correspond to
// elided Doxygen comment lines in the original source.)
147 static TimeProfile profileInit("initQuda");
148 
150 static TimeProfile profileGauge("loadGaugeQuda");
151 
153 static TimeProfile profileClover("loadCloverQuda");
154 
156 static TimeProfile profileInvert("invertQuda");
157 
159 static TimeProfile profileMulti("invertMultiShiftQuda");
160 
162 static TimeProfile profileMultiMixed("invertMultiShiftMixedQuda");
163 
165 static TimeProfile profileFatLink("computeKSLinkQuda");
166 
168 static TimeProfile profileGaugeForce("computeGaugeForceQuda");
169 
171 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
172 
174 static TimeProfile profileExtendedGauge("createExtendedGaugeField");
175 
176 
178 static TimeProfile profileCloverCreate("createCloverQuda");
179 
181 static TimeProfile profileCloverDerivative("computeCloverDerivativeQuda");
182 
184 static TimeProfile profileCloverTrace("computeCloverTraceQuda");
185 
187 static TimeProfile profileStaggeredOprod("computeStaggeredOprodQuda");
188 
190 static TimeProfile profileAsqtadForce("computeAsqtadForceQuda");
191 
193 static TimeProfile profileHISQForce("computeHISQForceQuda");
194 
196 static TimeProfile profileHISQForceComplete("computeHISQForceCompleteQuda");
197 
199 static TimeProfile profileAPE("APEQuda");
200 
202 static TimeProfile profileContract("contractQuda");
203 
205 static TimeProfile profileCovDev("covDevCuda");
206 
208 static TimeProfile profileEnd("endQuda");
209 
// Forward declaration; implemented elsewhere in the quda namespace and
// called from endQuda().
210 namespace quda {
211  void printLaunchTimer();
212 }
213 
// Forward the caller's verbosity level, message prefix and output stream to
// QUDA's global output settings in one call.
214 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
215 {
216  setVerbosity(verbosity);
217  setOutputPrefix(prefix);
218  setOutputFile(outfile);
219 }
220 
221 
// Payload handed to lex_rank_from_coords() through its void* fdata argument.
222 typedef struct {
223  int ndim;
// NOTE(review): the listing omits source line 224 here. lex_rank_from_coords()
// reads md->dims[i], so a dims array member (e.g. int dims[QUDA_MAX_DIM]) is
// expected at this point — confirm against the original file.
225 } LexMapData;
226 
230 static int lex_rank_from_coords(const int *coords, void *fdata)
231 {
232  LexMapData *md = static_cast<LexMapData *>(fdata);
233 
234  int rank = coords[0];
235  for (int i = 1; i < md->ndim; i++) {
236  rank = md->dims[i] * rank + coords[i];
237  }
238  return rank;
239 }
240 
241 #ifdef QMP_COMMS
242 
// Rank-from-coordinates callback used when a QMP logical topology has been
// declared: defer entirely to QMP's own coordinate-to-node mapping.
// The fdata argument is unused here.
245 static int qmp_rank_from_coords(const int *coords, void *fdata)
246 {
247  return QMP_get_node_number_from(coords);
248 }
249 #endif
250 
251 
252 static bool comms_initialized = false;
253 
254 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
255 {
256  if (nDim != 4) {
257  errorQuda("Number of communication grid dimensions must be 4");
258  }
259 
260  LexMapData map_data;
261  if (!func) {
262 
263 #if QMP_COMMS
264  if (QMP_logical_topology_is_declared()) {
265  if (QMP_get_logical_number_of_dimensions() != 4) {
266  errorQuda("QMP logical topology must have 4 dimensions");
267  }
268  for (int i=0; i<nDim; i++) {
269  int qdim = QMP_get_logical_dimensions()[i];
270  if(qdim != dims[i]) {
271  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
272  }
273  }
274  fdata = NULL;
275  func = qmp_rank_from_coords;
276  } else {
277  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
278 #endif
279 
280  map_data.ndim = nDim;
281  for (int i=0; i<nDim; i++) {
282  map_data.dims[i] = dims[i];
283  }
284  fdata = (void *) &map_data;
285  func = lex_rank_from_coords;
286 
287 #if QMP_COMMS
288  }
289 #endif
290 
291  }
292  comm_init(nDim, dims, func, fdata);
293  comms_initialized = true;
294 }
295 
296 
// Establish a default communications grid when the application did not call
// initCommsGridQuda() itself: adopt the QMP logical topology when one is
// declared, fail for MPI builds (where the caller must initialize comms
// first), and fall back to a trivial 1x1x1x1 grid for single-GPU builds.
297 static void init_default_comms()
298 {
299 #if defined(QMP_COMMS)
300  if (QMP_logical_topology_is_declared()) {
301  int ndim = QMP_get_logical_number_of_dimensions();
302  const int *dims = QMP_get_logical_dimensions();
303  initCommsGridQuda(ndim, dims, NULL, NULL);
304  } else {
305  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
306  " and QMP logical topology has not been declared");
307  }
308 #elif defined(MPI_COMMS)
309  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
310 #else // single-GPU
311  const int dims[4] = {1, 1, 1, 1};
312  initCommsGridQuda(4, dims, NULL, NULL);
313 #endif
314 }
315 
316 
317 /*
318  * Set the device that QUDA uses.
319  */
// Selects and configures the CUDA device: picks the ordinal (from comms in
// multi-GPU builds when dev < 0), validates it, sets device flags and the
// L1-preferring cache configuration, and caches its properties in the global
// deviceProp. Idempotent: a second call returns immediately.
320 void initQudaDevice(int dev) {
321 
322  //static bool initialized = false;
323  if (initialized) return;
324  initialized = true;
325 
326 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)
327  //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
328  // not needed for CUDA >= 4.1
329  char* cni_str = getenv("CUDA_NIC_INTEROP");
330  if(cni_str == NULL){
331  errorQuda("Environment variable CUDA_NIC_INTEROP is not set");
332  }
333  int cni_int = atoi(cni_str);
334  if (cni_int != 1){
335  errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");
336  }
337 #endif
338 
339  int deviceCount;
340  cudaGetDeviceCount(&deviceCount);
341  if (deviceCount == 0) {
342  errorQuda("No CUDA devices found");
343  }
344 
// Enumerate devices (and report them at SUMMARIZE verbosity or above).
345  for(int i=0; i<deviceCount; i++) {
346  cudaGetDeviceProperties(&deviceProp, i);
347  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
348  if (getVerbosity() >= QUDA_SUMMARIZE) {
349  printfQuda("Found device %d: %s\n", i, deviceProp.name);
350  }
351  }
352 
353 #ifdef MULTI_GPU
// A negative ordinal means "derive the device from the comms layer".
354  if (dev < 0) {
355  if (!comms_initialized) {
356  errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
357  }
358  dev = comm_gpuid();
359  }
360 #else
// NOTE(review): the literal 16 duplicates MAX_GPU_NUM_PER_NODE defined above.
361  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
362 #endif
363 
364  cudaGetDeviceProperties(&deviceProp, dev);
365  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
366  if (deviceProp.major < 1) {
367  errorQuda("Device %d does not support CUDA", dev);
368  }
369 
370  if (getVerbosity() >= QUDA_SUMMARIZE) {
371  printfQuda("Using device %d: %s\n", dev, deviceProp.name);
372  }
373 #ifndef USE_QDPJIT
374  cudaSetDevice(dev);
375  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
376 #endif
377 
378 #ifdef NUMA_AFFINITY
// NOTE(review): the listing omits source line 379 here — presumably the
// opening of a conditional (the stray `}` at line 381 closes it). Confirm
// against the original file.
380  setNumaAffinity(dev);
381  }
382 #endif
383 
384  // if the device supports host-mapped memory, then enable this
385 #ifndef USE_QDPJIT
386  if(deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);
387  checkCudaError();
388 #endif
389 
390  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
391  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
392  cudaGetDeviceProperties(&deviceProp, dev);
393 }
394 
395 /*
396  * Any persistent memory allocations that QUDA uses are done here.
397  */
// NOTE(review): the listing omits source line 398 — the function signature
// (presumably `void initQudaMemory()`, per the call from initQuda()).
// Allocates the Nstream-entry stream pool (prioritized streams on CUDA >=
// 5.5, with the last stream at lowest priority) and initializes the BLAS
// subsystem. Lines 419, 421 and 425 are also elided from this listing.
399 {
400  if (!comms_initialized) init_default_comms();
401 
402  streams = new cudaStream_t[Nstream];
403 
404 #if (CUDA_VERSION >= 5050)
405  int greatestPriority;
406  int leastPriority;
407  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
408  for (int i=0; i<Nstream-1; i++) {
409  cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
410  }
411  cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);
412 #else
413  for (int i=0; i<Nstream; i++) {
414  cudaStreamCreate(&streams[i]);
415  }
416 #endif
417 
418  checkCudaError();
420 #ifdef GPU_STAGGERED_OPROD
422 #endif
423  initBlas();
424 
426 }
427 
// Top-level initialization entry point: sets up comms (if not already done
// via initCommsGridQuda()), selects the device, performs persistent memory
// allocations, and (with PTHREADS) creates the global recursive mutex.
428 void initQuda(int dev)
429 {
430  profileInit.Start(QUDA_PROFILE_TOTAL);
431 
432  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
433  if (!comms_initialized) init_default_comms();
434 
435  // set the device that QUDA uses
436  initQudaDevice(dev);
437 
438  // set the persistant memory allocations that QUDA uses (Blas, streams, etc.)
439  initQudaMemory();
440 
441 #ifdef PTHREADS
// Recursive type so the same thread may re-acquire the interface lock.
442  pthread_mutexattr_t mutex_attr;
443  pthread_mutexattr_init(&mutex_attr);
444  pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
445  pthread_mutex_init(&pthread_mutex, &mutex_attr);
446 #endif
447 
448  profileInit.Stop(QUDA_PROFILE_TOTAL);
449 }
450 
451 
// Load a host gauge field into QUDA: build a precise device copy, derive
// sloppy / preconditioner / (optionally) extended mirrors, and install them
// in the global gauge pointers selected by param->type.
// NOTE(review): this Doxygen listing elides many source lines (visible as
// numbering gaps: 460, 472-473, 477, 491, 520, 540, 564, 586, 599, ...);
// several expressions below are therefore truncated. Confirm all flagged
// spots against the original file.
452 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
453 {
454  //printfQuda("loadGaugeQuda use_resident_gauge = %d phase=%d\n",
455  //param->use_resident_gauge, param->staggered_phase_applied);
456 
457  profileGauge.Start(QUDA_PROFILE_TOTAL);
458 
459  if (!initialized) errorQuda("QUDA not initialized");
461 
462  checkGaugeParam(param);
463 
464  profileGauge.Start(QUDA_PROFILE_INIT);
465  // Set the specific input parameters and create the cpu gauge field
466  GaugeFieldParam gauge_param(h_gauge, *param);
467 
468  // if we are using half precision then we need to compute the fat
469  // link maximum while still on the cpu
470  // FIXME get a kernel for this
// NOTE(review): the condition is truncated — lines 472-473 are elided.
471  if ((param->cuda_prec == QUDA_HALF_PRECISION ||
474  param->type == QUDA_ASQTAD_FAT_LINKS)
475  gauge_param.compute_fat_link_max = true;
476 
// NOTE(review): line 477 (presumably the declaration of `in` with a
// location-dependent ternary) is elided; `in` is used below.
478  static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
479  static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
480 
481  // if not preserving then copy the gauge field passed in
482  cudaGaugeField *precise = NULL;
483 
484  // switch the parameters for creating the mirror precise cuda gauge field
485  gauge_param.create = QUDA_NULL_FIELD_CREATE;
486  gauge_param.precision = param->cuda_prec;
487  gauge_param.reconstruct = param->reconstruct;
488  gauge_param.pad = param->ga_pad;
// NOTE(review): the ternary's result operands (line 491) are elided.
489  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
490  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
492 
493  precise = new cudaGaugeField(gauge_param);
494 
495  if (param->use_resident_gauge) {
496  if(gaugePrecise == NULL) errorQuda("No resident gauge field");
497  // copy rather than point at to ensure that the padded region is filled in
498  precise->copy(*gaugePrecise);
499  precise->exchangeGhost();
500  delete gaugePrecise;
501  gaugePrecise = NULL;
502  profileGauge.Stop(QUDA_PROFILE_INIT);
503  } else {
504  profileGauge.Stop(QUDA_PROFILE_INIT);
505  profileGauge.Start(QUDA_PROFILE_H2D);
506  precise->copy(*in);
507  profileGauge.Stop(QUDA_PROFILE_H2D);
508  }
509 
510  param->gaugeGiB += precise->GBytes();
511 
512  // creating sloppy fields isn't really compute, but it is work done on the gpu
513  profileGauge.Start(QUDA_PROFILE_COMPUTE);
514 
515  // switch the parameters for creating the mirror sloppy cuda gauge field
516  gauge_param.precision = param->cuda_prec_sloppy;
517  gauge_param.reconstruct = param->reconstruct_sloppy;
518  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
519  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
521  cudaGaugeField *sloppy = NULL;
522  if (param->cuda_prec != param->cuda_prec_sloppy ||
523  param->reconstruct != param->reconstruct_sloppy) {
524  sloppy = new cudaGaugeField(gauge_param);
525 #if (__COMPUTE_CAPABILITY__ >= 200)
526  sloppy->copy(*precise);
527 #else
528  sloppy->copy(*in);
529 #endif
530  param->gaugeGiB += sloppy->GBytes();
531  } else {
// When precisions match, alias the precise field instead of copying.
532  sloppy = precise;
533  }
534 
535  // switch the parameters for creating the mirror preconditioner cuda gauge field
536  gauge_param.precision = param->cuda_prec_precondition;
537  gauge_param.reconstruct = param->reconstruct_precondition;
538  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
539  gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
541  cudaGaugeField *precondition = NULL;
542  if (param->cuda_prec_sloppy != param->cuda_prec_precondition ||
543  param->reconstruct_sloppy != param->reconstruct_precondition) {
544  precondition = new cudaGaugeField(gauge_param);
545 #if (__COMPUTE_CAPABILITY__ >= 200)
546  precondition->copy(*sloppy);
547 #else
548  precondition->copy(*in);
549 #endif
550  param->gaugeGiB += precondition->GBytes();
551  } else {
552  precondition = sloppy;
553  }
554 
555  // create an extended preconditioning field
556  cudaGaugeField* extended = NULL;
557  if(param->overlap){
558  int R[4]; // domain-overlap widths in different directions
559  for(int i=0; i<4; ++i){
560  R[i] = param->overlap*commDimPartitioned(i);
561  gauge_param.x[i] += 2*R[i];
562  }
563  // the extended field does not require any ghost padding
565  extended = new cudaGaugeField(gauge_param);
566 
567  // copy the unextended preconditioning field into the interior of the extended field
568  copyExtendedGauge(*extended, *precondition, QUDA_CUDA_FIELD_LOCATION);
569  // now perform communication and fill the overlap regions
570  extended->exchangeExtendedGhost(R);
571  }
572 
573  profileGauge.Stop(QUDA_PROFILE_COMPUTE);
574 
// Install the new fields in the globals for the requested link type.
// NOTE(review): the case labels at lines 586 and 599 (presumably
// QUDA_ASQTAD_FAT_LINKS and QUDA_ASQTAD_LONG_LINKS) are elided.
575  switch (param->type) {
576  case QUDA_WILSON_LINKS:
577  //if (gaugePrecise) errorQuda("Precise gauge field already allocated");
578  gaugePrecise = precise;
579  //if (gaugeSloppy) errorQuda("Sloppy gauge field already allocated");
580  gaugeSloppy = sloppy;
581  //if (gaugePrecondition) errorQuda("Precondition gauge field already allocated");
582  gaugePrecondition = precondition;
583 
584  if(param->overlap) gaugeExtended = extended;
585  break;
587  if (gaugeFatPrecise) errorQuda("Precise gauge fat field already allocated");
588  gaugeFatPrecise = precise;
589  if (gaugeFatSloppy) errorQuda("Sloppy gauge fat field already allocated");
590  gaugeFatSloppy = sloppy;
591  if (gaugeFatPrecondition) errorQuda("Precondition gauge fat field already allocated");
592  gaugeFatPrecondition = precondition;
593 
594  if(param->overlap){
595  if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
596  gaugeFatExtended = extended;
597  }
598  break;
600  if (gaugeLongPrecise) errorQuda("Precise gauge long field already allocated");
601  gaugeLongPrecise = precise;
602  if (gaugeLongSloppy) errorQuda("Sloppy gauge long field already allocated");
603  gaugeLongSloppy = sloppy;
604  if (gaugeLongPrecondition) errorQuda("Precondition gauge long field already allocated");
605  gaugeLongPrecondition = precondition;
606  if(param->overlap){
607  if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
608  gaugeLongExtended = extended;
609  }
610  break;
611  default:
612  errorQuda("Invalid gauge type");
613  }
614 
615 
616  profileGauge.Start(QUDA_PROFILE_FREE);
617  delete in;
618  profileGauge.Stop(QUDA_PROFILE_FREE);
619 
620  profileGauge.Stop(QUDA_PROFILE_TOTAL);
621 }
622 
// Copy the resident precise device gauge field (selected by param->type)
// back into the host field h_gauge. Only CPU output locations are supported.
// NOTE(review): the case labels at lines 641 and 644 (presumably
// QUDA_ASQTAD_FAT_LINKS and QUDA_ASQTAD_LONG_LINKS) are elided from this
// listing — confirm against the original file.
623 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
624 {
625  profileGauge.Start(QUDA_PROFILE_TOTAL);
626 
627  if (param->location != QUDA_CPU_FIELD_LOCATION)
628  errorQuda("Non-cpu output location not yet supported");
629 
630  if (!initialized) errorQuda("QUDA not initialized");
631  checkGaugeParam(param);
632 
633  // Set the specific cpu parameters and create the cpu gauge field
634  GaugeFieldParam gauge_param(h_gauge, *param);
635  cpuGaugeField cpuGauge(gauge_param);
636  cudaGaugeField *cudaGauge = NULL;
637  switch (param->type) {
638  case QUDA_WILSON_LINKS:
639  cudaGauge = gaugePrecise;
640  break;
642  cudaGauge = gaugeFatPrecise;
643  break;
645  cudaGauge = gaugeLongPrecise;
646  break;
647  default:
648  errorQuda("Invalid gauge type");
649  }
650 
651  profileGauge.Start(QUDA_PROFILE_D2H);
652  cudaGauge->saveCPUField(cpuGauge, QUDA_CPU_FIELD_LOCATION);
653  profileGauge.Stop(QUDA_PROFILE_D2H);
654 
655  profileGauge.Stop(QUDA_PROFILE_TOTAL);
656 }
657 
658 
// Load (or, when both host pointers are NULL and clover_coeff != 0, compute
// on the device) the clover term and/or its inverse, creating precise,
// sloppy and preconditioner device copies. Twisted-clover keeps separate
// direct and inverse fields.
// NOTE(review): this Doxygen listing elides numerous source lines (numbering
// gaps: 665, 695, 698, 730-731, 788, 800, 812, 823, 847, 852, 856, 861, 863,
// 868, 876, 881-882, 885, 890, 892); several statements and conditions below
// are truncated. Confirm all flagged spots against the original file.
659 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
660 {
661  profileClover.Start(QUDA_PROFILE_TOTAL);
662  bool device_calc = false; // calculate clover and inverse on the device?
663 
664  pushVerbosity(inv_param->verbosity);
666 
667  if (!initialized) errorQuda("QUDA not initialized");
668 
669  if (!h_clover && !h_clovinv) {
670  printfQuda("clover_coeff: %lf\n", inv_param->clover_coeff);
671  if(inv_param->clover_coeff != 0){
672  device_calc = true;
673  }else{
674  errorQuda("loadCloverQuda() called with neither clover term nor inverse");
675  }
676  }
677 
678 
679  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
680  errorQuda("Half precision not supported on CPU");
681  }
682  if (gaugePrecise == NULL) {
683  errorQuda("Gauge field must be loaded before clover");
684  }
685  if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) {
686  errorQuda("Wrong dslash_type in loadCloverQuda()");
687  }
688 
689  // determines whether operator is preconditioned when calling invertQuda()
690  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
691  inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
692 
693  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
// NOTE(review): the second operand of this || (line 695) is elided.
694  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
696 
// NOTE(review): the second operand of this || (line 698, presumably
// QUDA_MATPC_ODD_ODD_ASYMMETRIC) is elided.
697  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
699 
700  // We issue a warning only when it seems likely that the user is screwing up:
701 
702  // uninverted clover term is required when applying unpreconditioned operator,
703  // but note that dslashQuda() is always preconditioned
704  if (!h_clover && !pc_solve && !pc_solution) {
705  //warningQuda("Uninverted clover term not loaded");
706  }
707 
708  // uninverted clover term is also required for "asymmetric" preconditioning
709  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
710  warningQuda("Uninverted clover term not loaded");
711  }
712 
713  CloverFieldParam clover_param;
714  CloverField *in=NULL, *inInv=NULL;
715 
716  if(!device_calc){
717  // create a param for the cpu clover field
718  profileClover.Start(QUDA_PROFILE_INIT);
719  CloverFieldParam cpuParam;
720  cpuParam.nDim = 4;
721  for (int i=0; i<4; i++) cpuParam.x[i] = gaugePrecise->X()[i];
722  cpuParam.precision = inv_param->clover_cpu_prec;
723  cpuParam.order = inv_param->clover_order;
724  cpuParam.direct = h_clover ? true : false;
725  cpuParam.inverse = h_clovinv ? true : false;
726  cpuParam.clover = h_clover;
727  cpuParam.norm = 0;
728  cpuParam.cloverInv = h_clovinv;
729  cpuParam.invNorm = 0;
732  cpuParam.twisted = false;
733  cpuParam.mu2 = 0.;
734 
735  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
// Twisted clover: wrap the direct term and the inverse as two separate
// fields (the inverse carries the twisted mu^2 shift).
736  cpuParam.direct = true;
737  cpuParam.inverse = false;
738  cpuParam.cloverInv = NULL;
739  cpuParam.clover = h_clover;
740  in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
741  static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
742  static_cast<CloverField*>(new cudaCloverField(cpuParam));
743 
744  cpuParam.cloverInv = h_clovinv;
745  cpuParam.clover = NULL;
746  cpuParam.twisted = true;
747  cpuParam.direct = true;
748  cpuParam.inverse = false;
749  cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;
750 
751  inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
752  static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
753  static_cast<CloverField*>(new cudaCloverField(cpuParam));
754  } else {
755  in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
756  static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
757  static_cast<CloverField*>(new cudaCloverField(cpuParam));
758  }
759 
// Parameters for the precise device-resident clover field.
760  clover_param.nDim = 4;
761  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
762  clover_param.setPrecision(inv_param->clover_cuda_prec);
763  clover_param.pad = inv_param->cl_pad;
764  clover_param.direct = h_clover ? true : false;
765  clover_param.inverse = (h_clovinv || pc_solve) ? true : false;
766  clover_param.create = QUDA_NULL_FIELD_CREATE;
767  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
768 
769  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
770  clover_param.direct = true;
771  clover_param.inverse = false;
772  cloverPrecise = new cudaCloverField(clover_param);
773  clover_param.direct = false;
774  clover_param.inverse = true;
775  clover_param.twisted = true;
776  cloverInvPrecise = new cudaCloverField(clover_param);
777  clover_param.twisted = false;
778  } else {
779  cloverPrecise = new cudaCloverField(clover_param);
780  }
781 
782  profileClover.Stop(QUDA_PROFILE_INIT);
783 
784  profileClover.Start(QUDA_PROFILE_H2D);
785  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
786  cloverPrecise->copy(*in, false);
// NOTE(review): line 788 is elided here; also note the inverse copy reads
// from `in` rather than `inInv` — confirm against the original file.
787  cloverInvPrecise->copy(*in, true);
789  } else {
790  cloverPrecise->copy(*in, h_clovinv ? true : false);
791  }
792 
793  profileClover.Stop(QUDA_PROFILE_H2D);
794  } else {
795  profileClover.Start(QUDA_PROFILE_COMPUTE);
796 
797  createCloverQuda(inv_param);
798 
// NOTE(review): line 800 is elided (presumably the device-side inversion of
// the twisted-clover term that populates cloverInvPrecise and its TrLog).
799  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
801  if (inv_param->compute_clover_trlog) {
802  inv_param->trlogA[0] = cloverInvPrecise->TrLog()[0];
803  inv_param->trlogA[1] = cloverInvPrecise->TrLog()[1];
804  }
805  }
806  profileClover.Stop(QUDA_PROFILE_COMPUTE);
807  }
808 
809  // inverted clover term is required when applying preconditioned operator
// NOTE(review): line 812 is elided (presumably the cloverInvert() call).
810  if ((!h_clovinv && pc_solve) && inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
811  profileClover.Start(QUDA_PROFILE_COMPUTE);
813  profileClover.Stop(QUDA_PROFILE_COMPUTE);
814  if (inv_param->compute_clover_trlog) {
815  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
816  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
817  }
818  }
819 
// NOTE(review): the else-branch statement (line 823) is elided.
820  if (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)
821  inv_param->cloverGiB = cloverPrecise->GBytes();
822  else
824 
// Re-initialize clover_param for the sloppy / preconditioner copies.
825  clover_param.norm = 0;
826  clover_param.invNorm = 0;
827  clover_param.mu2 = 0.;
828  clover_param.nDim = 4;
829  for(int dir=0; dir<4; ++dir) clover_param.x[dir] = gaugePrecise->X()[dir];
830  clover_param.pad = inv_param->cl_pad;
831  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
832  clover_param.create = QUDA_NULL_FIELD_CREATE;
833  clover_param.direct = true;
834  clover_param.inverse = true;
835 
836  // create the mirror sloppy clover field
837  if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
838  profileClover.Start(QUDA_PROFILE_INIT);
839  clover_param.setPrecision(inv_param->clover_cuda_prec_sloppy);
840 
841  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
842  clover_param.direct = false;
843  clover_param.inverse = true;
844  clover_param.twisted = true;
845  clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;
846  cloverInvSloppy = new cudaCloverField(clover_param);
// NOTE(review): lines 847, 852, 856 (presumably the copy() calls populating
// the sloppy fields) are elided from this listing.
848  clover_param.direct = true;
849  clover_param.inverse = false;
850  clover_param.twisted = false;
851  cloverSloppy = new cudaCloverField(clover_param);
853  inv_param->cloverGiB += cloverSloppy->GBytes() + cloverInvSloppy->GBytes();
854  } else {
855  cloverSloppy = new cudaCloverField(clover_param);
857  inv_param->cloverGiB += cloverSloppy->GBytes();
858  }
859  profileClover.Stop(QUDA_PROFILE_INIT);
860  } else {
// NOTE(review): lines 861 and 863 (presumably aliasing the sloppy pointers
// to the precise fields) are elided.
862  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
864  }
865 
866  // create the mirror preconditioner clover field
// NOTE(review): the second operand of this && (line 868) is elided.
867  if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition &&
869  profileClover.Start(QUDA_PROFILE_INIT);
870  clover_param.setPrecision(inv_param->clover_cuda_prec_precondition);
871  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
872  clover_param.direct = true;
873  clover_param.inverse = false;
874  clover_param.twisted = false;
875  cloverPrecondition = new cudaCloverField(clover_param);
// NOTE(review): lines 876, 881-882, 885 (copies and GiB accounting) are
// elided from this listing.
877  clover_param.direct = false;
878  clover_param.inverse = true;
879  clover_param.twisted = true;
880  cloverInvPrecondition = new cudaCloverField(clover_param);
883  } else {
884  cloverPrecondition = new cudaCloverField(clover_param);
886  inv_param->cloverGiB += cloverPrecondition->GBytes();
887  }
888  profileClover.Stop(QUDA_PROFILE_INIT);
889  } else {
// NOTE(review): lines 890 and 892 (aliasing the preconditioner pointers)
// are elided.
891  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
893  }
894 
895  // need to copy back the odd inverse field into the application clover field
896  if (!h_clovinv && pc_solve && !device_calc) {
897  // copy the inverted clover term into host application order on the device
898  clover_param.setPrecision(inv_param->clover_cpu_prec);
899  clover_param.direct = false;
900  clover_param.inverse = true;
901  clover_param.order = inv_param->clover_order;
902 
903  // this isn't really "epilogue" but this label suffices
904  profileClover.Start(QUDA_PROFILE_EPILOGUE);
905  cudaCloverField hack(clover_param);
906  hack.copy(*cloverPrecise);
907  profileClover.Stop(QUDA_PROFILE_EPILOGUE);
908 
909  // copy the odd components into the host application's clover field
910  profileClover.Start(QUDA_PROFILE_D2H);
911  cudaMemcpy((char*)(in->V(false))+in->Bytes()/2, (char*)(hack.V(true))+hack.Bytes()/2,
912  in->Bytes()/2, cudaMemcpyDeviceToHost);
913  profileClover.Stop(QUDA_PROFILE_D2H);
914 
915  checkCudaError();
916  }
917 
918  if(!device_calc)
919  {
920  if (in) delete in; // delete object referencing input field
921  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv;
922  }
923 
924  popVerbosity();
925 
926  profileClover.Stop(QUDA_PROFILE_TOTAL);
927 }
928 
// Release every resident gauge field (Wilson, long, fat, smeared, extended)
// and reset the corresponding global pointers to NULL.
// NOTE(review): this listing elides lines 932-933, 942-945 and 952-953 —
// presumably the guarded deletes of the sloppy/precondition (and long-link)
// fields that precede each group of NULL assignments. Confirm against the
// original file.
929 void freeGaugeQuda(void)
930 {
931  if (!initialized) errorQuda("QUDA not initialized");
934  if (gaugePrecise) delete gaugePrecise;
935  if (gaugeExtended) delete gaugeExtended;
936 
937  gaugePrecondition = NULL;
938  gaugeSloppy = NULL;
939  gaugePrecise = NULL;
940  gaugeExtended = NULL;
941 
946 
947  gaugeLongPrecondition = NULL;
948  gaugeLongSloppy = NULL;
949  gaugeLongPrecise = NULL;
950  gaugeLongExtended = NULL;
951 
954  if (gaugeFatPrecise) delete gaugeFatPrecise;
955 
956 
957  gaugeFatPrecondition = NULL;
958  gaugeFatSloppy = NULL;
959  gaugeFatPrecise = NULL;
960  gaugeFatExtended = NULL;
961 
962  if (gaugeSmeared) delete gaugeSmeared;
963 
964  gaugeSmeared = NULL;
965  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
966  if (extendedGaugeResident) {
967  delete extendedGaugeResident;
968  extendedGaugeResident = NULL;
969  }
970 }
971 
972 // just free the sloppy fields used in mixed-precision solvers
// NOTE(review): the function signature (source line 973) is missing from this
// listing; likewise the guarded deletes at lines 976-977, 982-983 and 988-989
// that should precede each pair of NULL assignments. Confirm against the
// original file.
974 {
975  if (!initialized) errorQuda("QUDA not initialized");
978 
979  gaugePrecondition = NULL;
980  gaugeSloppy = NULL;
981 
984 
985  gaugeLongPrecondition = NULL;
986  gaugeLongSloppy = NULL;
987 
990 
991  gaugeFatPrecondition = NULL;
992  gaugeFatSloppy = NULL;
993 }
994 
995 
// Release the resident clover fields (direct and, for twisted clover, the
// separate inverse set) and reset the global pointers.
// NOTE(review): lines 999-1000 and 1008-1009 are elided from this listing —
// presumably the guarded deletes of the precondition/sloppy fields. Confirm
// against the original file.
996 void freeCloverQuda(void)
997 {
998  if (!initialized) errorQuda("QUDA not initialized");
1001  if (cloverPrecise) delete cloverPrecise;
1002 
1003  cloverPrecondition = NULL;
1004  cloverSloppy = NULL;
1005  cloverPrecise = NULL;
1006 
1007  if (cloverInvPrecise != NULL) {
1010  if (cloverInvPrecise) delete cloverInvPrecise;
1011 
1012  cloverInvPrecondition = NULL;
1013  cloverInvSloppy = NULL;
1014  cloverInvPrecise = NULL;
1015  }
1016 }
1017 
// Tear down the library: free resident fields, scratch buffers, BLAS state
// and streams, finalize comms, optionally print accumulated profiles, and
// verify all QUDA allocations were released.
// NOTE(review): this listing elides lines 1024-1030, 1037, 1047, 1050, 1053
// and 1092 (additional cleanup calls and a final report). Confirm against
// the original file.
1018 void endQuda(void)
1019 {
1020  profileEnd.Start(QUDA_PROFILE_TOTAL);
1021 
1022  if (!initialized) return;
1023 
1031  freeGaugeQuda();
1032  freeCloverQuda();
1033 
// Note: the `=NULL` after each `;` sits outside the `if` and therefore runs
// unconditionally — intended here, since resetting an already-NULL pointer
// is harmless.
1034  if(cudaStapleField) delete cudaStapleField; cudaStapleField=NULL;
1035  if(cudaStapleField1) delete cudaStapleField1; cudaStapleField1=NULL;
1036 
1038  if(momResident) delete momResident;
1039 
1040  endBlas();
1041 
1042  if (streams) {
1043  for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
1044  delete []streams;
1045  streams = NULL;
1046  }
1048 
1049 #ifdef GPU_STAGGERED_OPROD
1051 #endif
1052 
1054 
1055 #if (!defined(USE_QDPJIT) && !defined(GPU_COMMS))
1056  // end this CUDA context
1057  cudaDeviceReset();
1058 #endif
1059 
1060  initialized = false;
1061 
1062  comm_finalize();
1063  comms_initialized = false;
1064 
1065  profileEnd.Stop(QUDA_PROFILE_TOTAL);
1066 
1067  // print out the profile information of the lifetime of the library
1068  if (getVerbosity() >= QUDA_SUMMARIZE) {
1069  profileInit.Print();
1070  profileGauge.Print();
1071  profileCloverCreate.Print();
1072  profileClover.Print();
1073  profileInvert.Print();
1074  profileMulti.Print();
1075  profileMultiMixed.Print();
1076  profileFatLink.Print();
1077  profileGaugeForce.Print();
1078  profileGaugeUpdate.Print();
1079  profileExtendedGauge.Print();
1080  profileCloverDerivative.Print();
1081  profileCloverTrace.Print();
1082  profileStaggeredOprod.Print();
1083  profileAsqtadForce.Print();
1084  profileHISQForce.Print();
1085  profileContract.Print();
1086  profileCovDev.Print();
1087  profileEnd.Print();
1088 
1089  printLaunchTimer();
1090 
1091  printfQuda("\n");
1093  printfQuda("\n");
1094  }
1095 
1096  assertAllMemFree();
1097 }
1098 
1099 
1100 namespace quda {
1101 
1102  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1103  {
1104  double kappa = inv_param->kappa;
1105  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1106  kappa *= gaugePrecise->Anisotropy();
1107  }
1108 
1109  switch (inv_param->dslash_type) {
1110  case QUDA_WILSON_DSLASH:
1111  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
1112  break;
1114  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
1115  break;
1118  diracParam.Ls = inv_param->Ls;
1119  break;
1121  if(pc) {
1122  diracParam.type = QUDA_DOMAIN_WALL_4DPC_DIRAC;
1123  diracParam.Ls = inv_param->Ls;
1124  } else errorQuda("For 4D type of DWF dslash, pc must be turned on, %d", inv_param->dslash_type);
1125  break;
1127  if (inv_param->Ls > QUDA_MAX_DWF_LS)
1128  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
1129  if(pc) {
1131  diracParam.Ls = inv_param->Ls;
1132  memcpy(diracParam.b_5, inv_param->b_5, sizeof(double)*inv_param->Ls);
1133  memcpy(diracParam.c_5, inv_param->c_5, sizeof(double)*inv_param->Ls);
1134  } else errorQuda("At currently, only preconditioned Mobius DWF is supported, %d", inv_param->dslash_type);
1135  break;
1136  case QUDA_STAGGERED_DSLASH:
1137  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1138  break;
1139  case QUDA_ASQTAD_DSLASH:
1140  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
1141  break;
1144  if (inv_param->twist_flavor == QUDA_TWIST_MINUS || inv_param->twist_flavor == QUDA_TWIST_PLUS) {
1145  diracParam.Ls = 1;
1146  diracParam.epsilon = 0.0;
1147  } else {
1148  diracParam.Ls = 2;
1149  diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
1150  }
1151  break;
1154  if (inv_param->twist_flavor == QUDA_TWIST_MINUS || inv_param->twist_flavor == QUDA_TWIST_PLUS) {
1155  diracParam.Ls = 1;
1156  diracParam.epsilon = 0.0;
1157  } else {
1158  diracParam.Ls = 2;
1159  diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0;
1160  }
1161  break;
1162  default:
1163  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
1164  }
1165 
1166  diracParam.matpcType = inv_param->matpc_type;
1167  diracParam.dagger = inv_param->dagger;
1168  diracParam.gauge = gaugePrecise;
1169  diracParam.fatGauge = gaugeFatPrecise;
1170  diracParam.longGauge = gaugeLongPrecise;
1171  diracParam.clover = cloverPrecise;
1172  diracParam.cloverInv = cloverInvPrecise;
1173  diracParam.kappa = kappa;
1174  diracParam.mass = inv_param->mass;
1175  diracParam.m5 = inv_param->m5;
1176  diracParam.mu = inv_param->mu;
1177 
1178  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
1179  }
1180 
1181 
1182  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1183  {
1184  setDiracParam(diracParam, inv_param, pc);
1185 
1186  diracParam.gauge = gaugeSloppy;
1187  diracParam.fatGauge = gaugeFatSloppy;
1188  diracParam.longGauge = gaugeLongSloppy;
1189  diracParam.clover = cloverSloppy;
1190  diracParam.cloverInv = cloverInvSloppy;
1191 
1192  for (int i=0; i<4; i++) {
1193  diracParam.commDim[i] = 1; // comms are always on
1194  }
1195 
1196  }
1197 
1198  // The preconditioner currently mimicks the sloppy operator with no comms
1199  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1200  {
1201  setDiracParam(diracParam, inv_param, pc);
1202 
1203  if(inv_param->overlap){
1204  diracParam.gauge = gaugeExtended;
1205  diracParam.fatGauge = gaugeFatExtended;
1206  diracParam.longGauge = gaugeLongExtended;
1207  }else{
1208  diracParam.gauge = gaugePrecondition;
1209  diracParam.fatGauge = gaugeFatPrecondition;
1210  diracParam.longGauge = gaugeLongPrecondition;
1211  }
1212  diracParam.clover = cloverPrecondition;
1213  diracParam.cloverInv = cloverInvPrecondition;
1214 
1215  for (int i=0; i<4; i++) {
1216  diracParam.commDim[i] = 0; // comms are always off
1217  }
1218 
1219  // In the preconditioned staggered CG allow a different dlsash type in the preconditioning
1220  if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH
1222  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1223  diracParam.gauge = gaugeFatPrecondition;
1224  }
1225  }
1226 
1227 
1228  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1229  {
1230  DiracParam diracParam;
1231  DiracParam diracSloppyParam;
1232  DiracParam diracPreParam;
1233 
1234  setDiracParam(diracParam, &param, pc_solve);
1235  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1236  setDiracPreParam(diracPreParam, &param, pc_solve);
1237 
1238  d = Dirac::create(diracParam); // create the Dirac operator
1239  dSloppy = Dirac::create(diracSloppyParam);
1240  dPre = Dirac::create(diracPreParam);
1241  }
1242 
1243  static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT];
1244 
1246 
1247  double kappa5 = (0.5/(5.0 + param.m5));
1248  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1250  param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;
1251 
1253  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1254  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1255  double nin = norm2(b);
1256  printfQuda("Mass rescale: norm of source in = %g\n", nin);
1257  }
1258 
1259  // staggered dslash uses mass normalization internally
1261  switch (param.solution_type) {
1262  case QUDA_MAT_SOLUTION:
1263  case QUDA_MATPC_SOLUTION:
1264  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) axCuda(2.0*param.mass, b);
1265  break;
1268  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) axCuda(4.0*param.mass*param.mass, b);
1269  break;
1270  default:
1271  errorQuda("Not implemented");
1272  }
1273  return;
1274  }
1275 
1276  for(int i=0; i<param.num_offset; i++) {
1277  unscaled_shifts[i] = param.offset[i];
1278  }
1279 
1280  // multiply the source to compensate for normalization of the Dirac operator, if necessary
1281  switch (param.solution_type) {
1282  case QUDA_MAT_SOLUTION:
1285  axCuda(2.0*kappa, b);
1286  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1287  }
1288  break;
1292  axCuda(4.0*kappa*kappa, b);
1293  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1294  }
1295  break;
1296  case QUDA_MATPC_SOLUTION:
1298  axCuda(4.0*kappa*kappa, b);
1299  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1301  axCuda(2.0*kappa, b);
1302  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1303  }
1304  break;
1307  axCuda(16.0*pow(kappa,4), b);
1308  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 16.0*pow(kappa,4);
1310  axCuda(4.0*kappa*kappa, b);
1311  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1312  }
1313  break;
1314  default:
1315  errorQuda("Solution type %d not supported", param.solution_type);
1316  }
1317 
1318  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
1319  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1320  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1321  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1322  double nin = norm2(b);
1323  printfQuda("Mass rescale: norm of source out = %g\n", nin);
1324  }
1325 
1326  }
1327 }
1328 
1329 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
1330 {
1331  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1332  inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
1333  inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
1334 
1335  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1336  if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
1337  errorQuda("Clover field not allocated");
1338  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
1339  errorQuda("Clover field not allocated");
1340 
1341  pushVerbosity(inv_param->verbosity);
1343 
1344  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
1345 
1346  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1347  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1348  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1349 
1350  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1351  cudaColorSpinorField in(*in_h, cudaParam);
1352 
1353  if (getVerbosity() >= QUDA_VERBOSE) {
1354  double cpu = norm2(*in_h);
1355  double gpu = norm2(in);
1356  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1357  }
1358 
1359  if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&
1360  (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||
1361  inv_param->dslash_type == QUDA_ASQTAD_DSLASH) )
1362  axCuda(1.0/(2.0*inv_param->mass), in);
1363 
1364  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1365  cudaColorSpinorField out(in, cudaParam);
1366 
1367  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1368  if (parity == QUDA_EVEN_PARITY) {
1369  parity = QUDA_ODD_PARITY;
1370  } else {
1371  parity = QUDA_EVEN_PARITY;
1372  }
1374  }
1375  bool pc = true;
1376 
1377  DiracParam diracParam;
1378  setDiracParam(diracParam, inv_param, pc);
1379 
1380  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1381  if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) {
1382  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1383  cudaColorSpinorField tmp1(in, cudaParam);
1384  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
1385  dirac->Dslash(out, tmp1, parity); // apply the operator
1386  } else {
1387  dirac->Dslash(out, in, parity); // apply the operator
1388  }
1389 
1390  delete dirac; // clean up
1391 
1392  cpuParam.v = h_out;
1393 
1394  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1395  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1396  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1397  *out_h = out;
1398 
1399  if (getVerbosity() >= QUDA_VERBOSE) {
1400  double cpu = norm2(*out_h);
1401  double gpu = norm2(out);
1402  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1403  }
1404 
1405  delete out_h;
1406  delete in_h;
1407 
1408  popVerbosity();
1409 }
1410 
1412 {
1413  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH )
1414  setKernelPackT(true);
1415  else
1416  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1417 
1418  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1419 
1420  pushVerbosity(inv_param->verbosity);
1422 
1423  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
1424 
1425  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1426  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1427  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1428 
1429  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1430  cudaColorSpinorField in(*in_h, cudaParam);
1431 
1432  if (getVerbosity() >= QUDA_VERBOSE) {
1433  double cpu = norm2(*in_h);
1434  double gpu = norm2(in);
1435  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1436  }
1437 
1438  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1439  cudaColorSpinorField out(in, cudaParam);
1440 
1441  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1442  if (parity == QUDA_EVEN_PARITY) {
1443  parity = QUDA_ODD_PARITY;
1444  } else {
1445  parity = QUDA_EVEN_PARITY;
1446  }
1448  }
1449  bool pc = true;
1450 
1451  DiracParam diracParam;
1452  setDiracParam(diracParam, inv_param, pc);
1453 
1454  DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator
1455  printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);
1456  switch (test_type) {
1457  case 0:
1458  dirac.Dslash4(out, in, parity);
1459  break;
1460  case 1:
1461  dirac.Dslash5(out, in, parity);
1462  break;
1463  case 2:
1464  dirac.Dslash5inv(out, in, parity, inv_param->kappa);
1465  break;
1466  }
1467 
1468  cpuParam.v = h_out;
1469 
1470  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1471  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1472  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1473  *out_h = out;
1474 
1475  if (getVerbosity() >= QUDA_VERBOSE) {
1476  double cpu = norm2(*out_h);
1477  double gpu = norm2(out);
1478  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1479  }
1480 
1481  delete out_h;
1482  delete in_h;
1483 
1484  popVerbosity();
1485 }
1486 
1488 {
1489  if ( inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH)
1490  setKernelPackT(true);
1491  else
1492  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1493 
1494  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1495 
1496  pushVerbosity(inv_param->verbosity);
1498 
1499  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
1500 
1501  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1502  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1503  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1504 
1505  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1506  cudaColorSpinorField in(*in_h, cudaParam);
1507 
1508  if (getVerbosity() >= QUDA_VERBOSE) {
1509  double cpu = norm2(*in_h);
1510  double gpu = norm2(in);
1511  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1512  }
1513 
1514  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1515  cudaColorSpinorField out(in, cudaParam);
1516 
1517  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1518  if (parity == QUDA_EVEN_PARITY) {
1519  parity = QUDA_ODD_PARITY;
1520  } else {
1521  parity = QUDA_EVEN_PARITY;
1522  }
1524  }
1525  bool pc = true;
1526 
1527  DiracParam diracParam;
1528  setDiracParam(diracParam, inv_param, pc);
1529 
1530  DiracMobiusDomainWallPC dirac(diracParam); // create the Dirac operator
1531  double kappa5 = 0.0; // Kappa5 is dummy argument
1532  switch (test_type) {
1533  case 0:
1534  dirac.Dslash4(out, in, parity);
1535  break;
1536  case 1:
1537  dirac.Dslash5(out, in, parity);
1538  break;
1539  case 2:
1540  dirac.Dslash4pre(out, in, parity);
1541  break;
1542  case 3:
1543  dirac.Dslash5inv(out, in, parity, kappa5);
1544  break;
1545  }
1546 
1547  cpuParam.v = h_out;
1548 
1549  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1550  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1551  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1552  *out_h = out;
1553 
1554  if (getVerbosity() >= QUDA_VERBOSE) {
1555  double cpu = norm2(*out_h);
1556  double gpu = norm2(out);
1557  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1558  }
1559 
1560  delete out_h;
1561  delete in_h;
1562 
1563  popVerbosity();
1564 }
1565 
1566 
1567 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1568 {
1569  pushVerbosity(inv_param->verbosity);
1570 
1571  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1572  inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
1573  inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
1574 
1575  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1576  if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
1577  errorQuda("Clover field not allocated");
1578  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
1579  errorQuda("Clover field not allocated");
1581 
1582  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1584 
1585  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);
1586  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1587  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1588  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1589 
1590  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1591  cudaColorSpinorField in(*in_h, cudaParam);
1592 
1593  if (getVerbosity() >= QUDA_VERBOSE) {
1594  double cpu = norm2(*in_h);
1595  double gpu = norm2(in);
1596  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1597  }
1598 
1599  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1600  cudaColorSpinorField out(in, cudaParam);
1601 
1602  DiracParam diracParam;
1603  setDiracParam(diracParam, inv_param, pc);
1604 
1605  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1606  dirac->M(out, in); // apply the operator
1607  delete dirac; // clean up
1608 
1609  double kappa = inv_param->kappa;
1610  if (pc) {
1611  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
1612  axCuda(0.25/(kappa*kappa), out);
1613  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1614  axCuda(0.5/kappa, out);
1615  }
1616  } else {
1617  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
1619  axCuda(0.5/kappa, out);
1620  }
1621  }
1622 
1623  cpuParam.v = h_out;
1624 
1625  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1626  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1627  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1628  *out_h = out;
1629 
1630  if (getVerbosity() >= QUDA_VERBOSE) {
1631  double cpu = norm2(*out_h);
1632  double gpu = norm2(out);
1633  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1634  }
1635 
1636  delete out_h;
1637  delete in_h;
1638 
1639  popVerbosity();
1640 }
1641 
1642 
1643 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1644 {
1645  pushVerbosity(inv_param->verbosity);
1646 
1647  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1648  inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
1649  inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
1650 
1651  if (!initialized) errorQuda("QUDA not initialized");
1652  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1653  if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
1654  errorQuda("Clover field not allocated");
1655  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
1656  errorQuda("Clover field not allocated");
1658 
1659  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1661 
1662  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);
1663  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1664  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1665  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1666 
1667  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1668  cudaColorSpinorField in(*in_h, cudaParam);
1669 
1670  if (getVerbosity() >= QUDA_VERBOSE){
1671  double cpu = norm2(*in_h);
1672  double gpu = norm2(in);
1673  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1674  }
1675 
1676  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1677  cudaColorSpinorField out(in, cudaParam);
1678 
1679  // double kappa = inv_param->kappa;
1680  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
1681 
1682  DiracParam diracParam;
1683  setDiracParam(diracParam, inv_param, pc);
1684 
1685  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1686  dirac->MdagM(out, in); // apply the operator
1687  delete dirac; // clean up
1688 
1689  double kappa = inv_param->kappa;
1690  if (pc) {
1691  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
1692  axCuda(1.0/pow(2.0*kappa,4), out);
1693  } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1694  axCuda(0.25/(kappa*kappa), out);
1695  }
1696  } else {
1697  if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
1699  axCuda(0.25/(kappa*kappa), out);
1700  }
1701  }
1702 
1703  cpuParam.v = h_out;
1704 
1705  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1706  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1707  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1708  *out_h = out;
1709 
1710  if (getVerbosity() >= QUDA_VERBOSE){
1711  double cpu = norm2(*out_h);
1712  double gpu = norm2(out);
1713  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1714  }
1715 
1716  delete out_h;
1717  delete in_h;
1718 
1719  popVerbosity();
1720 }
1721 
1724  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
1725  if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");
1726  if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");
1727  if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");
1728  if(param->overlap){
1729  if(gaugeExtended == NULL) errorQuda("Extended gauge field doesn't exist");
1730  }
1731  cudaGauge = gaugePrecise;
1732  } else {
1733  if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");
1734  if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");
1735  if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");
1736  if(param->overlap){
1737  if(gaugeFatExtended == NULL) errorQuda("Extended gauge fat field doesn't exist");
1738  }
1739 
1740  if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");
1741  if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");
1742  if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");
1743  if(param->overlap){
1744  if(gaugeLongExtended == NULL) errorQuda("Extended gauge long field doesn't exist");
1745  }
1746  cudaGauge = gaugeFatPrecise;
1747  }
1748  return cudaGauge;
1749 }
1750 
1751 
1752 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
1753 {
1754  pushVerbosity(inv_param->verbosity);
1755 
1756  if (!initialized) errorQuda("QUDA not initialized");
1757  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1758  if (cloverPrecise == NULL) errorQuda("Clover field not allocated");
1759 
1761 
1762  if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH))
1763  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
1764 
1765  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
1766 
1767  ColorSpinorField *in_h = (inv_param->input_location == QUDA_CPU_FIELD_LOCATION) ?
1768  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1769  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1770 
1771  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1772  cudaColorSpinorField in(*in_h, cudaParam);
1773 
1774  if (getVerbosity() >= QUDA_VERBOSE) {
1775  double cpu = norm2(*in_h);
1776  double gpu = norm2(in);
1777  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1778  }
1779 
1780  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1781  cudaColorSpinorField out(in, cudaParam);
1782 
1783  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
1784  if (parity == QUDA_EVEN_PARITY) {
1785  parity = QUDA_ODD_PARITY;
1786  } else {
1787  parity = QUDA_EVEN_PARITY;
1788  }
1790  }
1791  bool pc = true;
1792 
1793  DiracParam diracParam;
1794  setDiracParam(diracParam, inv_param, pc);
1795  //FIXME: Do we need this for twisted clover???
1796  DiracCloverPC dirac(diracParam); // create the Dirac operator
1797  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
1798  else dirac.CloverInv(out, in, parity);
1799 
1800  cpuParam.v = h_out;
1801 
1802  ColorSpinorField *out_h = (inv_param->output_location == QUDA_CPU_FIELD_LOCATION) ?
1803  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
1804  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
1805  *out_h = out;
1806 
1807  if (getVerbosity() >= QUDA_VERBOSE) {
1808  double cpu = norm2(*out_h);
1809  double gpu = norm2(out);
1810  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1811  }
1812 
1813  /*for (int i=0; i<in_h->Volume(); i++) {
1814  ((cpuColorSpinorField*)out_h)->PrintVector(i);
1815  }*/
1816 
1817  delete out_h;
1818  delete in_h;
1819 
1820  popVerbosity();
1821 }
1822 
1823 
/**
 * Lanczos eigensolver driver: iterates a Ritz matrix built from the Dirac
 * operator, starting from k0 and extending the Krylov basis to m vectors.
 *
 * @param hp_Apsi   host pointer for the operator-applied vector (in/out)
 * @param hp_r      host pointer for the residual/start vector (in/out)
 * @param hp_V      host array of m eigenvector pointers (in/out)
 * @param hp_alpha  host buffer receiving the tridiagonal alpha coefficients
 * @param hp_beta   host buffer receiving the tridiagonal beta coefficients
 * @param eig_param eigensolver parameters (includes invert_param)
 *
 * NOTE(review): several lines of this function appear to have been dropped
 * by the documentation extraction (marked inline below); the code is kept
 * byte-identical rather than guessed at.
 */
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V,
    void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
{
  // NOTE(review): the declaration of `param` (QudaInvertParam*) appears to
  // have been dropped here — confirm against the repository source.
  param = eig_param->invert_param;
  setTuning(param->tune);

  // NOTE(review): continuation of this condition (and the setKernelPackT
  // call it guards) appears to be missing — extraction artifact.
  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");

  profileInvert.Start(QUDA_PROFILE_TOTAL);

  if (!initialized) errorQuda("QUDA not initialized");

  pushVerbosity(param->verbosity);

  // check the gauge fields have been created
  // NOTE(review): the checkGauge(param) call appears to be missing here.

  checkInvertParam(param);
  checkEigParam(eig_param);

  // NOTE(review): second clause of this disjunction appears to be missing.
  bool pc_solution = (param->solution_type == QUDA_MATPC_DAG_SOLUTION) ||

  // create the dirac operator
  DiracParam diracParam;
  setDiracParam(diracParam, param, pc_solution);
  Dirac *d = Dirac::create(diracParam); // create the Dirac operator

  Dirac &dirac = *d;

  profileInvert.Start(QUDA_PROFILE_H2D);

  cudaColorSpinorField *r = NULL;
  cudaColorSpinorField *Apsi = NULL;
  const int *X = cudaGauge->X();

  // wrap CPU host side pointers
  // NOTE(review): the `ColorSpinorField *h_r = (...) ?` header of this
  // ternary appears to be missing — extraction artifact.
  ColorSpinorParam cpuParam(hp_r, *param, X, pc_solution);
    static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
    static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  // NOTE(review): likewise the `ColorSpinorField *h_Apsi = (...) ?` header
  // appears to be missing here.
  cpuParam.v = hp_Apsi;
    static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
    static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  //Make Eigen vector data set
  cpuColorSpinorField **h_Eig_Vec;
  h_Eig_Vec =(cpuColorSpinorField **)safe_malloc( m*sizeof(cpuColorSpinorField*));
  for( int k = 0 ; k < m ; k++)
  {
    // wrap each host eigenvector pointer as a cpu field
    cpuParam.v = ((double**)hp_V)[k];
    h_Eig_Vec[k] = new cpuColorSpinorField(cpuParam);
  }

  // download source
  ColorSpinorParam cudaParam(cpuParam, *param);
  cudaParam.create = QUDA_COPY_FIELD_CREATE;
  r = new cudaColorSpinorField(*h_r, cudaParam);
  Apsi = new cudaColorSpinorField(*h_Apsi, cudaParam);

  double cpu;
  double gpu;

  if (getVerbosity() >= QUDA_VERBOSE) {
    cpu = norm2(*h_r);
    gpu = norm2(*r);
    printfQuda("r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
    cpu = norm2(*h_Apsi);
    gpu = norm2(*Apsi);
    printfQuda("Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
  }

  // download Eigen vector set
  cudaColorSpinorField **Eig_Vec;
  Eig_Vec = (cudaColorSpinorField **)safe_malloc( m*sizeof(cudaColorSpinorField*));

  for( int k = 0 ; k < m ; k++)
  {
    Eig_Vec[k] = new cudaColorSpinorField(*h_Eig_Vec[k], cudaParam);
    if (getVerbosity() >= QUDA_VERBOSE) {
      cpu = norm2(*h_Eig_Vec[k]);
      gpu = norm2(*Eig_Vec[k]);
      printfQuda("Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);
    }
  }
  profileInvert.Stop(QUDA_PROFILE_H2D);

  // dispatch on the requested Ritz matrix: M^dag, or M^dag M
  if(eig_param->RitzMat_lanczos == QUDA_MATPC_DAG_SOLUTION)
  {
    DiracMdag mat(dirac);
    RitzMat ritz_mat(mat,*eig_param);
    Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
    (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
    delete eig_solve;
  }
  else if(eig_param->RitzMat_lanczos == QUDA_MATPCDAG_MATPC_SOLUTION)
  {
    DiracMdagM mat(dirac);
    RitzMat ritz_mat(mat,*eig_param);
    Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
    (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
    delete eig_solve;
  }
  // NOTE(review): the `else if (...)` condition guarding this third branch
  // appears to be missing — extraction artifact.
  {
    DiracMdagM mat(dirac);
    RitzMat ritz_mat(mat,*eig_param);
    Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
    (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
    delete eig_solve;
  }
  else
  {
    errorQuda("invalid ritz matrix type\n");
    exit(0);
  }

  //Write back calculated eigen vector
  profileInvert.Start(QUDA_PROFILE_D2H);
  for( int k = 0 ; k < m ; k++)
  {
    *h_Eig_Vec[k] = *Eig_Vec[k];
  }
  *h_r = *r;
  *h_Apsi = *Apsi;
  profileInvert.Stop(QUDA_PROFILE_D2H);

  // free device and host wrapper fields
  delete h_r;
  delete h_Apsi;
  for( int k = 0 ; k < m ; k++)
  {
    delete Eig_Vec[k];
    delete h_Eig_Vec[k];
  }
  host_free(Eig_Vec);
  host_free(h_Eig_Vec);

  delete d;

  popVerbosity();

  // NOTE(review): a saveTuneCache(...) call may have been dropped here.
  profileInvert.Stop(QUDA_PROFILE_TOTAL);
}
1976 
1977 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
1978 {
1979  setTuning(param->tune);
1980 
1981  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1984 
1985  profileInvert.Start(QUDA_PROFILE_TOTAL);
1986 
1987  if (!initialized) errorQuda("QUDA not initialized");
1988 
1989  pushVerbosity(param->verbosity);
1991 
1992  // check the gauge fields have been created
1994 
1995  checkInvertParam(param);
1996 
1997  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
1998  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
1999  // for now, though, so here we factorize everything for convenience.
2000 
2001  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2003  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2005  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2006  (param->solution_type == QUDA_MATPC_SOLUTION);
2007  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2008  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2009  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2010  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2011 
2012  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2013  if (!pc_solve) param->spinorGiB *= 2;
2014  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2015  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2016  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2017  } else {
2018  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2019  }
2020 
2021  param->secs = 0;
2022  param->gflops = 0;
2023  param->iter = 0;
2024 
2025  Dirac *d = NULL;
2026  Dirac *dSloppy = NULL;
2027  Dirac *dPre = NULL;
2028 
2029  // create the dirac operator
2030  createDirac(d, dSloppy, dPre, *param, pc_solve);
2031 
2032  Dirac &dirac = *d;
2033  Dirac &diracSloppy = *dSloppy;
2034  Dirac &diracPre = *dPre;
2035 
2036  profileInvert.Start(QUDA_PROFILE_H2D);
2037 
2038  cudaColorSpinorField *b = NULL;
2039  cudaColorSpinorField *x = NULL;
2040  cudaColorSpinorField *in = NULL;
2041  cudaColorSpinorField *out = NULL;
2042 
2043  const int *X = cudaGauge->X();
2044 
2045  // wrap CPU host side pointers
2046  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
2048  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2049  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2050 
2051  cpuParam.v = hp_x;
2053  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2054  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2055 
2056  // download source
2057  ColorSpinorParam cudaParam(cpuParam, *param);
2058  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2059  b = new cudaColorSpinorField(*h_b, cudaParam);
2060 
2061  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2062  // initial guess only supported for single-pass solvers
2064  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2065  errorQuda("Initial guess not supported for two-pass solver");
2066  }
2067 
2068  x = new cudaColorSpinorField(*h_x, cudaParam); // solution
2069  } else { // zero initial guess
2070  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2071  x = new cudaColorSpinorField(cudaParam); // solution
2072  }
2073 
2074  profileInvert.Stop(QUDA_PROFILE_H2D);
2075 
2076  double nb = norm2(*b);
2077  if (nb==0.0) errorQuda("Source has zero norm");
2078 
2079  if (getVerbosity() >= QUDA_VERBOSE) {
2080  double nh_b = norm2(*h_b);
2081  double nh_x = norm2(*h_x);
2082  double nx = norm2(*x);
2083  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2084  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2085  }
2086 
2087  // rescale the source and solution vectors to help prevent the onset of underflow
2089  axCuda(1.0/sqrt(nb), *b);
2090  axCuda(1.0/sqrt(nb), *x);
2091  }
2092 
2093  massRescale(*b, *param);
2094 
2095  dirac.prepare(in, out, *x, *b, param->solution_type);
2096  if (getVerbosity() >= QUDA_VERBOSE) {
2097  double nin = norm2(*in);
2098  double nout = norm2(*out);
2099  printfQuda("Prepared source = %g\n", nin);
2100  printfQuda("Prepared solution = %g\n", nout);
2101  }
2102 
2103  if (getVerbosity() >= QUDA_VERBOSE) {
2104  double nin = norm2(*in);
2105  printfQuda("Prepared source post mass rescale = %g\n", nin);
2106  }
2107 
2108  // solution_type specifies *what* system is to be solved.
2109  // solve_type specifies *how* the system is to be solved.
2110  //
2111  // We have the following four cases (plus preconditioned variants):
2112  //
2113  // solution_type solve_type Effect
2114  // ------------- ---------- ------
2115  // MAT DIRECT Solve Ax=b
2116  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
2117  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
2118  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
2119  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
2120  //
2121  // We generally require that the solution_type and solve_type
2122  // preconditioning match. As an exception, the unpreconditioned MAT
2123  // solution_type may be used with any solve_type, including
2124  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
2125  // preconditioned source and reconstruction of the full solution are
2126  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
2127  // respectively.
2128 
2129  if (pc_solution && !pc_solve) {
2130  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2131  }
2132 
2133  if (!mat_solution && !pc_solution && pc_solve) {
2134  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2135  }
2136 
2137  if (!mat_solution && norm_error_solve) {
2138  errorQuda("Normal-error solve requires Mat solution");
2139  }
2140 
2141  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
2143  dirac.Mdag(*in, tmp);
2144  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
2145  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2146  SolverParam solverParam(*param);
2147  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2148  (*solve)(*out, *in);
2149  copyCuda(*in, *out);
2150  solverParam.updateInvertParam(*param);
2151  delete solve;
2152  }
2153 
2154  if (direct_solve) {
2155  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2156  SolverParam solverParam(*param);
2157  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2158  (*solve)(*out, *in);
2159  solverParam.updateInvertParam(*param);
2160  delete solve;
2161  } else if (!norm_error_solve) {
2162  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2163  SolverParam solverParam(*param);
2164  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2165  (*solve)(*out, *in);
2166  solverParam.updateInvertParam(*param);
2167  delete solve;
2168  } else { // norm_error_solve
2169  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2170  cudaColorSpinorField tmp(*out);
2171  SolverParam solverParam(*param);
2172  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2173  (*solve)(tmp, *in); // y = (M M^\dag) b
2174  dirac.Mdag(*out, tmp); // x = M^dag y
2175  solverParam.updateInvertParam(*param);
2176  delete solve;
2177  }
2178 
2179  if (getVerbosity() >= QUDA_VERBOSE){
2180  double nx = norm2(*x);
2181  printfQuda("Solution = %g\n",nx);
2182  }
2183  dirac.reconstruct(*x, *b, param->solution_type);
2184 
2186  // rescale the solution
2187  axCuda(sqrt(nb), *x);
2188  }
2189 
2190  profileInvert.Start(QUDA_PROFILE_D2H);
2191  *h_x = *x;
2192  profileInvert.Stop(QUDA_PROFILE_D2H);
2193 
2194  if (getVerbosity() >= QUDA_VERBOSE){
2195  double nx = norm2(*x);
2196  double nh_x = norm2(*h_x);
2197  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2198  }
2199 
2200  delete h_b;
2201  delete h_x;
2202  delete b;
2203  delete x;
2204 
2205  delete d;
2206  delete dSloppy;
2207  delete dPre;
2208 
2209  popVerbosity();
2210 
2211  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
2213 
2214  profileInvert.Stop(QUDA_PROFILE_TOTAL);
2215 }
2216 
// ---------------------------------------------------------------------------
// invertMDQuda(): solve the linear system selected by param->solution_type /
// param->solve_type for the host-side source hp_b, writing the solution to
// hp_x.  The body closely mirrors invertQuda(); the visible difference is
// that, after reconstruction, it keeps a full-site device copy of the
// solution in the global 'solutionResident' field (lines 2428-2433) —
// presumably for consumption by molecular-dynamics / force code (the "MD"
// in the name); TODO confirm against callers.
//
// NOTE(review): this listing was extracted from generated documentation and
// several hyperlinked source lines (e.g. 2228, 2240, 2242, 2285, 2290, 2301,
// 2326, 2380, 2423, 2435, 2459) were dropped, so some conditions and
// declarations below (h_b, h_x, the rescale guards, the saveTuneCache call)
// are visibly truncated.  Consult the original interface_quda.cpp before
// editing any code in this function.
// ---------------------------------------------------------------------------
2217 void invertMDQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2218 {
2219  setTuning(param->tune);
2220 
     // Domain wall needs kernel packing enabled in the time dimension.
2221  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);
2222 
2223  profileInvert.Start(QUDA_PROFILE_TOTAL);
2224 
2225  if (!initialized) errorQuda("QUDA not initialized");
2226 
2227  pushVerbosity(param->verbosity);
2229 
2230  // check the gauge fields have been created
2232 
2233  checkInvertParam(param);
2234 
2235  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2236  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2237  // for now, though, so here we factorize everything for convenience.
2238 
     // NOTE(review): the second operand of each '||' below (e.g.
     // QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_NORMOP_PC_SOLVE) is on lines elided
     // from this listing.
2239  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2241  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2243  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2244  (param->solution_type == QUDA_MATPC_SOLUTION);
2245  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2246  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2247  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2248  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2249 
     // Record an estimate of the spinor memory footprint (GiB) in
     // param->spinorGiB; the vector counts (5/7/8/9) depend on the solver
     // and on whether the source is preserved.
2250  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2251  if (!pc_solve) param->spinorGiB *= 2;
2252  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2253  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2254  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2255  } else {
2256  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2257  }
2258 
     // Reset timing / flop / iteration counters reported back to the caller.
2259  param->secs = 0;
2260  param->gflops = 0;
2261  param->iter = 0;
2262 
2263  Dirac *d = NULL;
2264  Dirac *dSloppy = NULL;
2265  Dirac *dPre = NULL;
2266 
2267  // create the dirac operator (full, sloppy and preconditioner precisions)
2268  createDirac(d, dSloppy, dPre, *param, pc_solve);
2269 
2270  Dirac &dirac = *d;
2271  Dirac &diracSloppy = *dSloppy;
2272  Dirac &diracPre = *dPre;
2273 
2274  profileInvert.Start(QUDA_PROFILE_H2D);
2275 
2276  cudaColorSpinorField *b = NULL;
2277  cudaColorSpinorField *x = NULL;
2278  cudaColorSpinorField *in = NULL;
2279  cudaColorSpinorField *out = NULL;
2280 
2281  const int *X = cudaGauge->X();
2282 
2283  // wrap CPU host side pointers
     // NOTE(review): the declarations of h_b (line 2285) and h_x (line 2290),
     // which select cpu vs cuda wrappers from param->input/output_location,
     // are elided from this listing.
2284  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
2286  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2287  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2288 
2289  cpuParam.v = hp_x;
2291  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2292  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2293 
2294  // download source
2295  ColorSpinorParam cudaParam(cpuParam, *param);
2296  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2297  b = new cudaColorSpinorField(*h_b, cudaParam);
2298 
2299  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2300  // initial guess only supported for single-pass solvers
2302  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2303  errorQuda("Initial guess not supported for two-pass solver");
2304  }
2305 
2306  x = new cudaColorSpinorField(*h_x, cudaParam); // solution
2307  } else { // zero initial guess
2308  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2309  x = new cudaColorSpinorField(cudaParam); // solution
2310  }
2311 
2312  profileInvert.Stop(QUDA_PROFILE_H2D);
2313 
2314  double nb = norm2(*b);
2315  if (nb==0.0) errorQuda("Source has zero norm");
2316 
2317  if (getVerbosity() >= QUDA_VERBOSE) {
2318  double nh_b = norm2(*h_b);
2319  double nh_x = norm2(*h_x);
2320  double nx = norm2(*x);
2321  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2322  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2323  }
2324 
2325  // rescale the source and solution vectors to help prevent the onset of underflow
     // NOTE(review): the guard on this block (line 2326, presumably testing
     // param->solver_normalization — see the matching rescale at line 2424)
     // is elided from this listing.
2327  axCuda(1.0/sqrt(nb), *b);
2328  axCuda(1.0/sqrt(nb), *x);
2329  }
2330 
2331  massRescale(*b, *param);
2332 
     // Map (x, b) onto the (possibly preconditioned) system vectors (in, out)
     // according to solution_type; reconstruct() below is the inverse step.
2333  dirac.prepare(in, out, *x, *b, param->solution_type);
2334  if (getVerbosity() >= QUDA_VERBOSE) {
2335  double nin = norm2(*in);
2336  double nout = norm2(*out);
2337  printfQuda("Prepared source = %g\n", nin);
2338  printfQuda("Prepared solution = %g\n", nout);
2339  }
2340 
2341  if (getVerbosity() >= QUDA_VERBOSE) {
2342  double nin = norm2(*in);
2343  printfQuda("Prepared source post mass rescale = %g\n", nin);
2344  }
2345 
2346  // solution_type specifies *what* system is to be solved.
2347  // solve_type specifies *how* the system is to be solved.
2348  //
2349  // We have the following four cases (plus preconditioned variants):
2350  //
2351  // solution_type solve_type Effect
2352  // ------------- ---------- ------
2353  // MAT DIRECT Solve Ax=b
2354  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
2355  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
2356  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
2357  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
2358  //
2359  // We generally require that the solution_type and solve_type
2360  // preconditioning match. As an exception, the unpreconditioned MAT
2361  // solution_type may be used with any solve_type, including
2362  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
2363  // preconditioned source and reconstruction of the full solution are
2364  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
2365  // respectively.
2366 
2367  if (pc_solution && !pc_solve) {
2368  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2369  }
2370 
2371  if (!mat_solution && !pc_solution && pc_solve) {
2372  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2373  }
2374 
2375  if (!mat_solution && norm_error_solve) {
2376  errorQuda("Normal-error solve requires Mat solution");
2377  }
2378 
2379  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
     // NOTE(review): the declaration of 'tmp' (line 2380, a copy of *in) is
     // elided from this listing.
2381  dirac.Mdag(*in, tmp);
2382  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
2383  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2384  SolverParam solverParam(*param);
2385  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2386  (*solve)(*out, *in);
     // feed the intermediate solution y back in as the source of the second solve
2387  copyCuda(*in, *out);
2388  solverParam.updateInvertParam(*param);
2389  delete solve;
2390  }
2391 
2392  if (direct_solve) {
2393  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2394  SolverParam solverParam(*param);
2395  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2396  (*solve)(*out, *in);
2397  solverParam.updateInvertParam(*param);
2398  delete solve;
2399  } else if (!norm_error_solve){
2400  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2401  SolverParam solverParam(*param);
2402  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2403  (*solve)(*out, *in);
2404  solverParam.updateInvertParam(*param);
2405  delete solve;
2406  } else { // norm_error_solve
2407  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2408  cudaColorSpinorField tmp(*out);
2409  SolverParam solverParam(*param);
2410  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2411  (*solve)(tmp, *in); // y = (M M^\dag) b
2412  dirac.Mdag(*out, tmp); // x = M^dag y
2413  solverParam.updateInvertParam(*param);
2414  delete solve;
2415  }
2416 
2417  if (getVerbosity() >= QUDA_VERBOSE){
2418  double nx = norm2(*x);
2419  printfQuda("Solution = %g\n",nx);
2420  }
2421  dirac.reconstruct(*x, *b, param->solution_type);
2422 
     // NOTE(review): the guard on this block (line 2423, the counterpart of
     // the source-normalization rescale above) is elided from this listing.
2424  // rescale the solution
2425  axCuda(sqrt(nb), *x);
2426  }
2427 
     // Keep a device-resident copy of the full solution: promote the field
     // descriptor from a single checkerboard to a full field (full site
     // subset, doubled x-dimension) before allocating.  Any previous resident
     // solution is freed first.
2428  if (solutionResident) delete solutionResident;
2429  //errorQuda("solutionResident already allocated");
2430  cudaParam.siteSubset = QUDA_FULL_SITE_SUBSET;
2431  cudaParam.x[0] *= 2;
2432  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2433  solutionResident = new cudaColorSpinorField(cudaParam);
2434 
     // NOTE(review): line 2435 (presumably the copy of *x into
     // *solutionResident) is elided from this listing — confirm upstream.
2436 
     // upload-free readback: copy the device solution into the host wrapper
2437  profileInvert.Start(QUDA_PROFILE_D2H);
2438  *h_x = *x;
2439  profileInvert.Stop(QUDA_PROFILE_D2H);
2440 
2441  if (getVerbosity() >= QUDA_VERBOSE){
2442  double nx = norm2(*x);
2443  double nh_x = norm2(*h_x);
2444  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2445  }
2446 
     // release host wrappers, device fields, and the Dirac operators
2447  delete h_b;
2448  delete h_x;
2449  delete b;
2450  delete x;
2451 
2452  delete d;
2453  delete dSloppy;
2454  delete dPre;
2455 
2456  popVerbosity();
2457 
2458  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
2460 
2461  profileInvert.Stop(QUDA_PROFILE_TOTAL);
2462 }
2463 
2464 
// ---------------------------------------------------------------------------
// invertMultiShiftQuda(): multi-shift solve of (A^dag A + offset[i]) x_i = b
// for all shifts in param->offset[0..num_offset-1], using multi-shift CG
// followed by per-shift sequential-CG refinement for any shift that missed
// its tolerance.  _hp_x is an array of num_offset host solution pointers;
// _hp_b is the host source.  Only NORMOP-style (MATDAG_MAT / MATPCDAG_MATPC)
// solves are supported; offsets must be sorted ascending.
//
// NOTE(review): this listing was extracted from generated documentation and
// several source lines (e.g. 2480-2481, 2485, 2490, 2556, 2582, 2619, 2698,
// 2757) were dropped, so some conditions and declarations below are visibly
// truncated (including wherever 'unscaled_shifts' is captured).  Consult the
// original interface_quda.cpp before editing any code in this function.
// ---------------------------------------------------------------------------
2473 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
2474 {
2475  setTuning(param->tune);
2476 
2477  profileMulti.Start(QUDA_PROFILE_TOTAL);
2478 
     // NOTE(review): the remainder of this condition and its body (lines
     // 2480-2481, presumably enabling T-dimension kernel packing) are elided
     // from this listing.
2479  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2482 
2483  if (!initialized) errorQuda("QUDA not initialized");
2484  // check the gauge fields have been created
2486  checkInvertParam(param);
2487 
2488  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
2489  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
2491 
2492  pushVerbosity(param->verbosity);
2493 
2494  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2495  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2496  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
2497  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2498 
2499  if (mat_solution) {
2500  errorQuda("Multi-shift solver does not support MAT or MATPC solution types");
2501  }
2502  if (direct_solve) {
2503  errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
2504  }
     // NOTE(review): '&' here is a bitwise AND on bools — functionally
     // equivalent for bool operands, but '&&' is almost certainly what was
     // intended (cf. the '&&' form used in invertQuda).
2505  if (pc_solution & !pc_solve) {
2506  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2507  }
2508  if (!pc_solution & pc_solve) {
2509  errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
2510  }
2511 
2512  // No of GiB in a checkerboard of a spinor
2513  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2514  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
2515 
2516  // **** WARNING *** this may not match implementation...
2517  if( param->inv_type == QUDA_CG_INVERTER ) {
2518  // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
2519  param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
2520  } else {
     // Only multi-shift CG is implemented, so any other inverter aborts here;
     // the code below this errorQuda is effectively dead.
2521  errorQuda("QUDA only currently supports multi-shift CG");
2522  // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
2523  // (Jegerlehner hep-lat/9612014 eq (3.13))
2524  param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
2525  }
2526 
2527  // Timing and FLOP counters
2528  param->secs = 0;
2529  param->gflops = 0;
2530  param->iter = 0;
2531 
     // Multi-shift CG requires the shifts in ascending order (smallest first).
2532  for (int i=0; i<param->num_offset-1; i++) {
2533  for (int j=i+1; j<param->num_offset; j++) {
2534  if (param->offset[i] > param->offset[j])
2535  errorQuda("Offsets must be ordered from smallest to largest");
2536  }
2537  }
2538 
2539  // Host pointers for x, take a copy of the input host pointers
2540  void** hp_x;
2541  hp_x = new void* [ param->num_offset ];
2542 
2543  void* hp_b = _hp_b;
2544  for(int i=0;i < param->num_offset;i++){
2545  hp_x[i] = _hp_x[i];
2546  }
2547 
2548  // Create the matrix.
2549  // The way this works is that createDirac will create 'd' and 'dSloppy'
2550  // which are global. We then grab these with references...
2551  //
2552  // Balint: Isn't there a nice construction pattern we could use here? This is
2553  // expedient but yucky.
2554  // DiracParam diracParam;
     // For staggered-type fermions the smallest shift is folded into the mass.
     // NOTE(review): this mutates the caller's param->mass; offsets are
     // restored from unscaled_shifts below, but no visible line restores the
     // mass — confirm upstream.  The '||' continuation (line 2556, presumably
     // QUDA_STAGGERED_DSLASH) is elided from this listing.
2555  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2557  param->mass = sqrt(param->offset[0]/4);
2558  }
2559 
2560  Dirac *d = NULL;
2561  Dirac *dSloppy = NULL;
2562  Dirac *dPre = NULL;
2563 
2564  // create the dirac operator
2565  createDirac(d, dSloppy, dPre, *param, pc_solve);
2566  Dirac &dirac = *d;
2567  Dirac &diracSloppy = *dSloppy;
2568 
2569  cudaColorSpinorField *b = NULL; // Cuda RHS
2570  cudaColorSpinorField **x = NULL; // Cuda Solutions
2571 
2572  // Grab the dimension array of the input gauge field.
2573  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
2574  gaugeFatPrecise->X() : gaugePrecise->X();
2575 
2576  // This creates a ColorSpinorParam struct, from the host data
2577  // pointer, the definitions in param, the dimensions X, and whether
2578  // the solution is on a checkerboard (parity) subset or not. These can
2579  // then be used as 'instructions' to create the actual
2580  // ColorSpinorField
     // NOTE(review): the declaration of h_b (line 2582, selecting cpu vs cuda
     // wrapper from param->input_location) is elided from this listing.
2581  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
2583  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2584  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2585 
2586  ColorSpinorField **h_x = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
2587  for(int i=0; i < param->num_offset; i++) {
2588  cpuParam.v = hp_x[i];
2589  h_x[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?
2590  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2591  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2592  }
2593 
2594  profileMulti.Start(QUDA_PROFILE_H2D);
2595  // Now I need a colorSpinorParam for the device
2596  ColorSpinorParam cudaParam(cpuParam, *param);
2597  // This setting will download a host vector
2598  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2599  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
2600  profileMulti.Stop(QUDA_PROFILE_H2D);
2601 
2602  // Create the solution fields filled with zero
2603  x = new cudaColorSpinorField* [ param->num_offset ];
2604  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2605  for(int i=0; i < param->num_offset; i++) {
2606  x[i] = new cudaColorSpinorField(cudaParam);
2607  }
2608 
2609  // Check source norms
2610  double nb = norm2(*b);
     // NOTE(review): this guards the *source* norm — the message should
     // probably read "Source has zero norm" (cf. invertQuda).
2611  if (nb==0.0) errorQuda("Solution has zero norm");
2612 
2613  if(getVerbosity() >= QUDA_VERBOSE ) {
2614  double nh_b = norm2(*h_b);
2615  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2616  }
2617 
2618  // rescale the source vector to help prevent the onset of underflow
     // NOTE(review): the guard on this block (line 2619, presumably testing
     // param->solver_normalization — see the matching rescale at line 2724)
     // is elided from this listing.
2620  axCuda(1.0/sqrt(nb), *b);
2621  }
2622 
2623  massRescale(*b, *param);
2624 
2625  // use multi-shift CG
2626  {
2627  DiracMdagM m(dirac), mSloppy(diracSloppy);
2628  SolverParam solverParam(*param);
2629  MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
2630  cg_m(x, *b);
2631  solverParam.updateInvertParam(*param);
2632  }
2633 
2634  // experimenting with Minimum residual extrapolation
2635  /*
2636  cudaColorSpinorField **q = new cudaColorSpinorField* [ param->num_offset ];
2637  cudaColorSpinorField **z = new cudaColorSpinorField* [ param->num_offset ];
2638  cudaColorSpinorField tmp(cudaParam);
2639 
2640  for(int i=0; i < param->num_offset; i++) {
2641  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2642  q[i] = new cudaColorSpinorField(cudaParam);
2643  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2644  z[i] = new cudaColorSpinorField(*x[i], cudaParam);
2645  }
2646 
2647  for(int i=0; i < param->num_offset; i++) {
2648  dirac.setMass(sqrt(param->offset[i]/4));
2649  DiracMdagM m(dirac);
2650  MinResExt mre(m, profileMulti);
2651  copyCuda(tmp, *b);
2652  mre(*x[i], tmp, z, q, param -> num_offset);
2653  dirac.setMass(sqrt(param->offset[0]/4));
2654  }
2655 
2656  for(int i=0; i < param->num_offset; i++) {
2657  delete q[i];
2658  delete z[i];
2659  }
2660  delete []q;
2661  delete []z;
2662  */
2663 
2664  // check each shift has the desired tolerance and use sequential CG to refine
2665 
2666  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2667  cudaColorSpinorField r(*b, cudaParam);
2668  for(int i=0; i < param->num_offset; i++) {
     // Heavy-quark residual/tolerance only apply when that residual type was
     // requested; otherwise 0/0 makes the hq test below a no-op.
2669  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
2670  param->true_res_hq_offset[i] : 0;
2671  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
2672  param->tol_hq_offset[i] : 0;
2673 
2674  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
2675  if (param->tol_offset[i] > 0 && (param->true_res_offset[i] > param->tol_offset[i] || rsd_hq > tol_hq)) {
2676  if (getVerbosity() >= QUDA_VERBOSE)
2677  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
2678  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
2679 
2680  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
2681  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2682  param->dslash_type == QUDA_STAGGERED_DSLASH) {
2683  dirac.setMass(sqrt(param->offset[i]/4));
2684  diracSloppy.setMass(sqrt(param->offset[i]/4));
2685  }
2686 
2687  DiracMdagM m(dirac), mSloppy(diracSloppy);
2688 
2689  // need to curry in the shift if we are not doing staggered
2690  if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
2691  param->dslash_type != QUDA_STAGGERED_DSLASH) {
2692  m.shift = param->offset[i];
2693  mSloppy.shift = param->offset[i];
2694  }
2695 
2696  SolverParam solverParam(*param);
2697  solverParam.iter = 0;
     // NOTE(review): line 2698 (presumably setting solverParam.use_init_guess
     // so the refinement starts from the multi-shift result) is elided from
     // this listing.
2699  solverParam.tol = param->tol_offset[i]; // set L2 tolerance
2700  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
2701 
2702  CG cg(m, mSloppy, solverParam, profileMulti);
2703  cg(*x[i], *b);
2704 
2705  solverParam.true_res_offset[i] = solverParam.true_res;
2706  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
2707  solverParam.updateInvertParam(*param,i);
2708 
2709  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2710  param->dslash_type == QUDA_STAGGERED_DSLASH) {
2711  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
2712  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
2713  }
2714  }
2715  }
2716 
2717  // restore shifts -- avoid side effects
     // NOTE(review): 'unscaled_shifts' must have been captured before the
     // offsets were rescaled; that code is not visible in this listing.
2718  for(int i=0; i < param->num_offset; i++) {
2719  param->offset[i] = unscaled_shifts[i];
2720  }
2721 
2722  profileMulti.Start(QUDA_PROFILE_D2H);
2723  for(int i=0; i < param->num_offset; i++) {
2724  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
2725  axCuda(sqrt(nb), *x[i]);
2726  }
2727 
2728  if (getVerbosity() >= QUDA_VERBOSE){
2729  double nx = norm2(*x[i]);
2730  printfQuda("Solution %d = %g\n", i, nx);
2731  }
2732 
     // copy each device solution back into its host wrapper
2733  *h_x[i] = *x[i];
2734  }
2735  profileMulti.Stop(QUDA_PROFILE_D2H);
2736 
     // release per-shift host wrappers and device solutions, then the rest
2737  for(int i=0; i < param->num_offset; i++){
2738  delete h_x[i];
2739  delete x[i];
2740  }
2741 
2742  delete h_b;
2743  delete b;
2744 
2745  delete [] h_x;
2746  delete [] x;
2747 
2748  delete [] hp_x;
2749 
2750  delete d;
2751  delete dSloppy;
2752  delete dPre;
2753 
2754  popVerbosity();
2755 
2756  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
2758 
2759  profileMulti.Stop(QUDA_PROFILE_TOTAL);
2760 }
2761 
2762 
2763 /*
2764  * Hacked multi-shift solver for Wilson RHMC molecular dynamics
2765  * FIXME!!
2766  */
2767 void invertMultiShiftMDQuda(void **_hp_xe, void **_hp_xo, void **_hp_ye, void **_hp_yo,
2768  void *_hp_b, QudaInvertParam *param)
2769 {
2770  setTuning(param->tune);
2771 
2772  profileMulti.Start(QUDA_PROFILE_TOTAL);
2773 
2774  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);
2775 
2776  if (!initialized) errorQuda("QUDA not initialized");
2777  // check the gauge fields have been created
2779  checkInvertParam(param);
2780 
2781  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
2782  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
2784 
2785  pushVerbosity(param->verbosity);
2786 
2787  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2788  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2789  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
2790  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2791 
2792  if (mat_solution) {
2793  errorQuda("Multi-shift solver does not support MAT or MATPC solution types");
2794  }
2795  if (direct_solve) {
2796  errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
2797  }
2798  if (pc_solution & !pc_solve) {
2799  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2800  }
2801  if (!pc_solution & pc_solve) {
2802  errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
2803  }
2804 
2805  // No of GiB in a checkerboard of a spinor
2806  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2807  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
2808 
2809  // **** WARNING *** this may not match implementation...
2810  if( param->inv_type == QUDA_CG_INVERTER ) {
2811  // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
2812  param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
2813  } else {
2814  errorQuda("QUDA only currently supports multi-shift CG");
2815  // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
2816  // (Jegerlehner hep-lat/9612014 eq (3.13)
2817  param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
2818  }
2819 
2820  // Timing and FLOP counters
2821  param->secs = 0;
2822  param->gflops = 0;
2823  param->iter = 0;
2824 
2825  for (int i=0; i<param->num_offset-1; i++) {
2826  for (int j=i+1; j<param->num_offset; j++) {
2827  if (param->offset[i] > param->offset[j])
2828  errorQuda("Offsets must be ordered from smallest to largest");
2829  }
2830  }
2831 
2832  // Host pointers for x, take a copy of the input host pointers
2833  void **hp_xe = new void* [ param->num_offset ];
2834  void **hp_xo = new void* [ param->num_offset ];
2835  void **hp_ye = new void* [ param->num_offset ];
2836  void **hp_yo = new void* [ param->num_offset ];
2837 
2838  void* hp_b = _hp_b;
2839  for(int i=0;i < param->num_offset;i++){
2840  hp_xe[i] = _hp_xe[i];
2841  hp_xo[i] = _hp_xo[i];
2842  hp_ye[i] = _hp_ye[i];
2843  hp_yo[i] = _hp_yo[i];
2844  }
2845 
2846  // Create the matrix.
2847  // The way this works is that createDirac will create 'd' and 'dSloppy'
2848  // which are global. We then grab these with references...
2849  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2851  param->mass = sqrt(param->offset[0]/4);
2852  }
2853 
2854  Dirac *d = NULL;
2855  Dirac *dSloppy = NULL;
2856  Dirac *dPre = NULL;
2857 
2858  // create the dirac operator
2859  createDirac(d, dSloppy, dPre, *param, pc_solve);
2860  Dirac &dirac = *d;
2861  Dirac &diracSloppy = *dSloppy;
2862 
2863  cudaColorSpinorField *b = NULL; // Cuda RHS
2864  cudaColorSpinorField **xe = NULL; // Cuda Solutions
2865  cudaColorSpinorField *xo, *ye, *yo = NULL; // Cuda Solutions
2866 
2867  // Grab the dimension array of the input gauge field.
2868  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
2869  gaugeFatPrecise->X() : gaugePrecise->X();
2870 
2871  // This creates a ColorSpinorParam struct, from the host data
2872  // pointer, the definitions in param, the dimensions X, and whether
2873  // the solution is on a checkerboard instruction or not. These can
2874  // then be used as 'instructions' to create the actual
2875  // ColorSpinorField
2876  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
2878  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2879  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2880 
2881  ColorSpinorField **h_xe = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
2882  ColorSpinorField **h_xo = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
2883  ColorSpinorField **h_ye = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
2884  ColorSpinorField **h_yo = new ColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
2885  for(int i=0; i < param->num_offset; i++) {
2886  cpuParam.v = hp_xe[i];
2887  h_xe[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?
2888  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2889  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2890 
2891  cpuParam.v = hp_xo[i];
2892  h_xo[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?
2893  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2894  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2895 
2896  cpuParam.v = hp_ye[i];
2897  h_ye[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?
2898  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2899  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2900 
2901  cpuParam.v = hp_yo[i];
2902  h_yo[i] = (param->output_location == QUDA_CPU_FIELD_LOCATION) ?
2903  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2904  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2905  }
2906 
2907  profileMulti.Start(QUDA_PROFILE_H2D);
2908  // Now I need a colorSpinorParam for the device
2909  ColorSpinorParam cudaParam(cpuParam, *param);
2910  // This setting will download a host vector
2911  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2912  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
2913  profileMulti.Stop(QUDA_PROFILE_H2D);
2914 
2915  // Create the solution fields filled with zero
2916  xe = new cudaColorSpinorField* [ param->num_offset ];
2917  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2918  for(int i=0; i < param->num_offset; i++) {
2919  xe[i] = new cudaColorSpinorField(cudaParam);
2920  }
2921 
2922  xo = new cudaColorSpinorField(cudaParam);
2923  ye = new cudaColorSpinorField(cudaParam);
2924  yo = new cudaColorSpinorField(cudaParam);
2925 
2926  // Check source norms
2927  double nb = norm2(*b);
2928  if (nb==0.0) errorQuda("Solution has zero norm");
2929 
2930  if(getVerbosity() >= QUDA_VERBOSE ) {
2931  double nh_b = norm2(*h_b);
2932  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2933  }
2934 
2935  // rescale the source vector to help prevent the onset of underflow
2937  axCuda(1.0/sqrt(nb), *b);
2938  }
2939 
2940  massRescale(*b, *param);
2941 
2942  // use multi-shift CG
2943  {
2944  DiracMdagM m(dirac), mSloppy(diracSloppy);
2945  SolverParam solverParam(*param);
2946  MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
2947  cg_m(xe, *b);
2948  solverParam.updateInvertParam(*param);
2949  }
2950 
2951  // check each shift has the desired tolerance and use sequential CG to refine
2952 
2953  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2954  cudaColorSpinorField r(*b, cudaParam);
2955  for(int i=0; i < param->num_offset; i++) {
2956  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
2957  param->true_res_hq_offset[i] : 0;
2958 
2959  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
2960  param->tol_hq_offset[i] : 0;
2961 
2962  // refine if either L2 or heavy quark residual tolerances have not been met
2963  if (param->true_res_offset[i] > param->tol_offset[i] || rsd_hq > tol_hq) {
2964  if (getVerbosity() >= QUDA_VERBOSE)
2965  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
2966  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
2967 
2968  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
2969  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2970  param->dslash_type == QUDA_STAGGERED_DSLASH) {
2971  dirac.setMass(sqrt(param->offset[i]/4));
2972  diracSloppy.setMass(sqrt(param->offset[i]/4));
2973  }
2974 
2975  DiracMdagM m(dirac), mSloppy(diracSloppy);
2976 
2977  // need to curry in the shift if we are not doing staggered
2978  if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
2979  param->dslash_type != QUDA_STAGGERED_DSLASH) {
2980  m.shift = param->offset[i];
2981  mSloppy.shift = param->offset[i];
2982  }
2983 
2984  SolverParam solverParam(*param);
2986  solverParam.tol = param->tol_offset[i]; // set L2 tolerance
2987  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
2988 
2989  CG cg(m, mSloppy, solverParam, profileMulti);
2990  cg(*xe[i], *b);
2991 
2992  solverParam.updateInvertParam(*param);
2993  param->true_res_offset[i] = param->true_res;
2994  param->true_res_hq_offset[i] = param->true_res_hq;
2995 
2996  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
2997  param->dslash_type == QUDA_STAGGERED_DSLASH) {
2998  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
2999  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3000  }
3001  }
3002  }
3003 
3004  // restore shifts -- avoid side effects
3005  for(int i=0; i < param->num_offset; i++) {
3006  param->offset[i] = unscaled_shifts[i];
3007  }
3008 
3009  profileMulti.Start(QUDA_PROFILE_D2H);
3010  for(int i=0; i < param->num_offset; i++) {
3011  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
3012  axCuda(sqrt(nb), *xe[i]);
3013  }
3014 
3015  if (getVerbosity() >= QUDA_VERBOSE){
3016  double nx = norm2(*xe[i]);
3017  printfQuda("Solution %d = %g\n", i, nx);
3018  }
3019 
3020  dirac.Dslash(*xo, *xe[i], QUDA_ODD_PARITY);
3021  dirac.M(*ye, *xe[i]);
3022  dirac.Dagger(QUDA_DAG_YES);
3023  dirac.Dslash(*yo, *ye, QUDA_ODD_PARITY);
3024  dirac.Dagger(QUDA_DAG_NO);
3025 
3026  *h_xe[i] = *xe[i];
3027  *h_xo[i] = *xo;
3028  *h_ye[i] = *ye;
3029  *h_yo[i] = *yo;
3030  }
3031  profileMulti.Stop(QUDA_PROFILE_D2H);
3032 
3033  for(int i=0; i < param->num_offset; i++){
3034  delete h_xe[i];
3035  delete h_xo[i];
3036  delete h_ye[i];
3037  delete h_yo[i];
3038  delete xe[i];
3039  }
3040 
3041  delete h_b;
3042  delete b;
3043 
3044  delete [] h_xe;
3045  delete [] h_xo;
3046  delete [] h_ye;
3047  delete [] h_yo;
3048 
3049  delete [] xe;
3050  delete xo;
3051  delete ye;
3052  delete yo;
3053 
3054  delete [] hp_xe;
3055  delete [] hp_xo;
3056  delete [] hp_ye;
3057  delete [] hp_yo;
3058 
3059  delete d;
3060  delete dSloppy;
3061  delete dPre;
3062 
3063  popVerbosity();
3064 
3065  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
3067 
3068  profileMulti.Stop(QUDA_PROFILE_TOTAL);
3069 }
3070 
/**
 * Incremental eigCG (deflated CG) solver interface.
 *
 * Solves the normal system for one right-hand side, accumulating approximate
 * eigenvectors across successive calls so later solves start deflated.
 *
 * @param _h_x           host pointer to the solution spinor (read as initial
 *                       guess when use_init_guess is set, written on exit)
 * @param _h_b           host pointer to the source spinor
 * @param param          solver/operator parameters; secs/gflops/iter and
 *                       rhs_idx are updated in place
 * @param _h_u           host buffer that receives the Ritz vectors on the
 *                       last RHS (may be NULL to skip the store)
 * @param inv_eigenvals  host buffer for the inverse eigenvalues, written
 *                       together with the Ritz vectors
 * @param last_rhs       nonzero on the final right-hand side; triggers
 *                       StoreRitzVecs and CleanResources
 *
 * NOTE(review): this listing has lost a handful of original lines (see the
 * inline notes below); the truncated conditions must be restored from the
 * canonical source before this text is compiled.
 */
void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h_u, double *inv_eigenvals, int last_rhs)
{
  setTuning(param->tune);

  // MAGMA backs the small dense (Rayleigh-Ritz) eigenproblems; open it once
  if(!InitMagma) openMagma();

  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) setKernelPackT(true);

  profileInvert.Start(QUDA_PROFILE_TOTAL);

  if (!initialized) errorQuda("QUDA not initialized");

  pushVerbosity(param->verbosity);

  // check the gauge fields have been created

  checkInvertParam(param);

  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
  // for now, though, so here we factorize everything for convenience.

  // NOTE(review): the continuation of the pc_solution condition was lost in
  // extraction (the expression below ends in a dangling '||') — restore the
  // second QUDA_MATPCDAG_MATPC_SOLUTION-style term from the original source.
  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
    (param->solve_type == QUDA_NORMOP_PC_SOLVE);
  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
    (param->solution_type == QUDA_MATPC_SOLUTION);
  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
    (param->solve_type == QUDA_DIRECT_PC_SOLVE);

  // report expected device spinor memory footprint (GiB) back to the caller;
  // eigCG variants keep a different number of work vectors than plain CG
  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
  if (!pc_solve) param->spinorGiB *= 2;
  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
    param->spinorGiB *= ((param->inv_type == QUDA_EIGCG_INVERTER || param->inv_type == QUDA_INC_EIGCG_INVERTER) ? 5 : 7)/(double)(1<<30);
  } else {
    param->spinorGiB *= ((param->inv_type == QUDA_EIGCG_INVERTER || param->inv_type == QUDA_INC_EIGCG_INVERTER) ? 8 : 9)/(double)(1<<30);
  }

  // reset the per-call statistics that the solver will accumulate
  param->secs = 0;
  param->gflops = 0;
  param->iter = 0;

  DiracParam diracParam;
  DiracParam diracSloppyParam;
  //DiracParam diracDeflateParam;
  DiracParam diracHalfPrecParam;//sloppy precision for initCG
  setDiracParam(diracParam, param, pc_solve);
  setDiracSloppyParam(diracSloppyParam, param, pc_solve);

  // NOTE(review): the guard condition for this error was lost in extraction —
  // it should test that the half-precision (precondition) gauge field exists.
  {
    errorQuda("\nInitCG requires sloppy gauge field in half precision. It seems that the half precision field is not loaded,\n please check you cuda_prec_precondition parameter.\n");
  }

  setDiracParam(diracHalfPrecParam, param, pc_solve);

  // the half-precision operator reuses the "precondition" copies of the fields
  diracHalfPrecParam.gauge = gaugePrecondition;
  diracHalfPrecParam.fatGauge = gaugeFatPrecondition;
  diracHalfPrecParam.longGauge = gaugeLongPrecondition;

  diracHalfPrecParam.clover = cloverPrecondition;
  diracHalfPrecParam.cloverInv = cloverInvPrecondition;

  for (int i=0; i<4; i++) {
    diracHalfPrecParam.commDim[i] = 1; // comms are on.
  }

  Dirac *d = Dirac::create(diracParam); // create the Dirac operator
  Dirac *dSloppy = Dirac::create(diracSloppyParam);
  //Dirac *dDeflate = Dirac::create(diracPreParam);
  Dirac *dHalfPrec = Dirac::create(diracHalfPrecParam);

  Dirac &dirac = *d;
  //Dirac &diracSloppy = param->rhs_idx < param->deflation_grid ? *d : *dSloppy; //hack!!!
  //Dirac &diracSloppy = param->rhs_idx < param->deflation_grid ? *dSloppy : *dHalfPrec;
  Dirac &diracSloppy = *dSloppy;
  Dirac &diracHalf = *dHalfPrec;
  Dirac &diracDeflate = *d;//full precision deflation
  //Dirac &diracHalfPrec = *dHalfPrec;

  profileInvert.Start(QUDA_PROFILE_H2D);

  cudaColorSpinorField *b = NULL;
  cudaColorSpinorField *x = NULL;
  cudaColorSpinorField *in = NULL;
  cudaColorSpinorField *out = NULL;

  const int *X = cudaGauge->X();

  // wrap CPU host side pointers
  ColorSpinorParam cpuParam(_h_b, *param, X, pc_solution);
  // NOTE(review): the declaration line "ColorSpinorField *h_b = (... ) ?" was
  // lost in extraction; h_b is used below so it must be restored.
  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  cpuParam.v = _h_x;
  // NOTE(review): likewise the "ColorSpinorField *h_x = (... ) ?" line is missing here.
  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));

  // download source
  ColorSpinorParam cudaParam(cpuParam, *param);
  cudaParam.create = QUDA_COPY_FIELD_CREATE;
  b = new cudaColorSpinorField(*h_b, cudaParam);

  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
    // initial guess only supported for single-pass solvers
    // NOTE(review): the opening "if ((param->solution_type == ...) &&" of this
    // condition was lost in extraction.
    (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
      errorQuda("Initial guess not supported for two-pass solver");
    }

    x = new cudaColorSpinorField(*h_x, cudaParam); // solution
  } else { // zero initial guess
    cudaParam.create = QUDA_ZERO_FIELD_CREATE;
    x = new cudaColorSpinorField(cudaParam); // solution
  }

  profileInvert.Stop(QUDA_PROFILE_H2D);

  double nb = norm2(*b);
  if (nb==0.0) errorQuda("Source has zero norm");

  if (getVerbosity() >= QUDA_VERBOSE) {
    double nh_b = norm2(*h_b);
    double nh_x = norm2(*h_x);
    double nx = norm2(*x);
    printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
    printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
  }

  // rescale the source and solution vectors to help prevent the onset of underflow
  // NOTE(review): the guard "if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {"
  // was lost in extraction; the closing brace below belongs to it.
  axCuda(1.0/sqrt(nb), *b);
  axCuda(1.0/sqrt(nb), *x);
  }

  massRescale(*b, *param);

  // map (x, b) onto the actual vectors the solver operates on (in/out)
  dirac.prepare(in, out, *x, *b, param->solution_type);

  if (getVerbosity() >= QUDA_VERBOSE) {
    double nin = norm2(*in);
    double nout = norm2(*out);
    printfQuda("Prepared source = %g\n", nin);
    printfQuda("Prepared solution = %g\n", nout);
  }

  if (getVerbosity() >= QUDA_VERBOSE) {
    double nin = norm2(*in);
    printfQuda("Prepared source post mass rescale = %g\n", nin);
  }

  // sanity check of the eigenvector search space configuration
  if (param->max_search_dim == 0 || param->nev == 0 || (param->max_search_dim < param->nev))
    errorQuda("\nIncorrect eigenvector space setup...\n");

  if (pc_solution && !pc_solve) {
    errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
  }

  if (!mat_solution && !pc_solution && pc_solve) {
    errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
  }

  if (mat_solution && !direct_solve) { // prepare source: b' = A^dag b
    // NOTE(review): the declaration of 'tmp' (a copy of *in) was lost in extraction.
    dirac.Mdag(*in, tmp);
  }

  // NOTE(review): the condition selecting the (inc-)eigCG inverter was lost in
  // extraction — the unguarded brace below and the trailing 'else' both
  // belong to it.
  {
    DiracMdagM m(dirac), mSloppy(diracSloppy), mHalf(diracHalf), mDeflate(diracDeflate);
    SolverParam solverParam(*param);

    DeflatedSolver *solve = DeflatedSolver::create(solverParam, m, mSloppy, mHalf, mDeflate, profileInvert);

    (*solve)(out, in);//run solver

    solverParam.updateInvertParam(*param);//will update rhs_idx as well...

    if(last_rhs)
    {
      if(_h_u) solve->StoreRitzVecs(_h_u, inv_eigenvals, X, param, param->nev);
      printfQuda("\nDelete incremental EigCG solver resources...\n");
      //clean resources:
      solve->CleanResources();
      //
      printfQuda("\n...done.\n");
    }

    delete solve;
  }
  else
  {
    errorQuda("\nUnknown deflated solver...\n");
  }

  if (getVerbosity() >= QUDA_VERBOSE){
    double nx = norm2(*x);
    printfQuda("Solution = %g\n",nx);
  }
  // map the solver output back onto the full solution vector
  dirac.reconstruct(*x, *b, param->solution_type);

  // NOTE(review): the guard "if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {"
  // was lost in extraction here as well.
  // rescale the solution
  axCuda(sqrt(nb), *x);
  }

  profileInvert.Start(QUDA_PROFILE_D2H);
  *h_x = *x; // upload solution to the host wrapper (device->host copy)
  profileInvert.Stop(QUDA_PROFILE_D2H);

  if (getVerbosity() >= QUDA_VERBOSE){
    double nx = norm2(*x);
    double nh_x = norm2(*h_x);
    printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
  }

  // release host wrappers and device fields
  delete h_b;
  delete h_x;
  delete b;
  delete x;

  delete d;
  delete dSloppy;
// delete dDeflate;
  delete dHalfPrec;

  popVerbosity();

  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted

  profileInvert.Stop(QUDA_PROFILE_TOTAL);
}
3314 
3315 
3316 #ifdef GPU_FATLINK
3317 /* @method
3318  * QUDA_COMPUTE_FAT_STANDARD: standard method (default)
3319  * QUDA_COMPUTE_FAT_EXTENDED_VOLUME, extended volume method
3320  *
3321  */
3322 #include <sys/time.h>
3323 
// NOTE(review): the signature line for this helper was lost in extraction —
// from its writes it is the fat-link padding setup, presumably
// void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam* param);
// confirm against the original source.
//
// Computes the ghost-zone padding required by the fat-link construction and
// stores it into param->llfat_ga_pad, param->site_ga_pad, param->ga_pad and
// param->staple_pad. The standard method pads by the checkerboarded face
// volumes; the extended-volume method pads by a slice of the (X+4)^4 volume.
{
  int* X = param->X;
#ifdef MULTI_GPU
  // half (checkerboard) face volumes for the x, y and z faces
  int Vsh_x = X[1]*X[2]*X[3]/2;
  int Vsh_y = X[0]*X[2]*X[3]/2;
  int Vsh_z = X[0]*X[1]*X[3]/2;
#endif
  int Vsh_t = X[0]*X[1]*X[2]/2;

  // extended local volume: two ghost sites on each side in every dimension
  int E[4];
  for (int i=0; i<4; i++) E[i] = X[i] + 4;

  // fat-link padding
  param->llfat_ga_pad = Vsh_t;

  // site-link padding
  if(method == QUDA_COMPUTE_FAT_STANDARD) {
#ifdef MULTI_GPU
    // largest checkerboarded 2-d slice, needed for the corner exchanges
    int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
    param->site_ga_pad = 3*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
#else
    param->site_ga_pad = Vsh_t;
#endif
  } else {
    param->site_ga_pad = (E[0]*E[1]*E[2]/2)*3;
  }
  param->ga_pad = param->site_ga_pad;

  // staple padding
  if(method == QUDA_COMPUTE_FAT_STANDARD) {
#ifdef MULTI_GPU
    param->staple_pad = 3*(Vsh_x + Vsh_y + Vsh_z+ Vsh_t);
#else
    param->staple_pad = 3*Vsh_t;
#endif
  } else {
    param->staple_pad = (E[0]*E[1]*E[2]/2)*3;
  }

  return;
}
3370 
3371 
namespace quda {
  /**
   * Core fat-link (and optional long-link) computation.
   *
   * Dispatches to llfat_cuda (standard method, local volume) or
   * llfat_cuda_ex (extended-volume method, X+4 in each dimension), using two
   * file-scope staple work fields that are lazily allocated and optionally
   * cached across calls.
   *
   * @param cudaSiteLink   input site links, already resident on the device
   * @param act_path_coeff action path coefficients for the fattening
   * @param qudaGaugeParam gauge parameters; staple_pad is read here
   * @param method         QUDA_COMPUTE_FAT_STANDARD or extended-volume
   * @param cudaFatLink    output fat links (device)
   * @param cudaLongLink   output long links (device; may be unused/NULL by caller)
   * @param profile        timing profile updated with INIT/COMPUTE/FREE phases
   */
  void computeFatLinkCore(cudaGaugeField* cudaSiteLink, double* act_path_coeff,
                          QudaGaugeParam* qudaGaugeParam, QudaComputeFatMethod method,
                          cudaGaugeField* cudaFatLink, cudaGaugeField* cudaLongLink,
                          TimeProfile &profile)
  {

    profile.Start(QUDA_PROFILE_INIT);
    const int flag = qudaGaugeParam->preserve_gauge;
    GaugeFieldParam gParam(0,*qudaGaugeParam);

    // the staple fields live on the local volume for the standard method and
    // the extended volume for the extended method
    if (method == QUDA_COMPUTE_FAT_STANDARD) {
      for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir];
    } else {
      for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir] + 4;
    }

    // lazily create the two staple work fields (file-scope, reused across calls)
    if (cudaStapleField == NULL || cudaStapleField1 == NULL) {
      gParam.pad = qudaGaugeParam->staple_pad;
      gParam.geometry = QUDA_SCALAR_GEOMETRY; // only require a scalar matrix field for the staple
      // NOTE(review): the gParam.create / precision / order assignments that
      // originally sat in this #ifdef were lost in extraction.
#ifdef MULTI_GPU
#else
#endif
      cudaStapleField = new cudaGaugeField(gParam);
      cudaStapleField1 = new cudaGaugeField(gParam);
    }
    profile.Stop(QUDA_PROFILE_INIT);

    profile.Start(QUDA_PROFILE_COMPUTE);
    if (method == QUDA_COMPUTE_FAT_STANDARD) {
      llfat_cuda(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
    } else { //method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME
      llfat_cuda_ex(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
    }
    profile.Stop(QUDA_PROFILE_COMPUTE);

    profile.Start(QUDA_PROFILE_FREE);
    // only release the staples if the caller did not ask to preserve them
    if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){
      delete cudaStapleField; cudaStapleField = NULL;
      delete cudaStapleField1; cudaStapleField1 = NULL;
    }
    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
} // namespace quda
3423 
3424 
namespace quda {
  // Give the fat-link code its own instantiation of the dslash initialization
  // helpers (e.g. initLatticeConstants used below) by textually including them
  // inside a dedicated namespace.
  namespace fatlink {
#include <dslash_init.cuh>
  }
}
3430 
3431 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)
3432 {
3433  profileFatLink.Start(QUDA_PROFILE_TOTAL);
3434  profileFatLink.Start(QUDA_PROFILE_INIT);
3435  // Initialize unitarization parameters
3436  if(ulink){
3437  const double unitarize_eps = 1e-14;
3438  const double max_error = 1e-10;
3439  const int reunit_allow_svd = 1;
3440  const int reunit_svd_only = 0;
3441  const double svd_rel_error = 1e-6;
3442  const double svd_abs_error = 1e-6;
3443  quda::setUnitarizeLinksConstants(unitarize_eps, max_error,
3444  reunit_allow_svd, reunit_svd_only,
3445  svd_rel_error, svd_abs_error);
3446  }
3447 
3448  cudaGaugeField* cudaFatLink = NULL;
3449  cudaGaugeField* cudaLongLink = NULL;
3450  cudaGaugeField* cudaUnitarizedLink = NULL;
3451  cudaGaugeField* cudaInLinkEx = NULL;
3452 
3453  QudaGaugeParam qudaGaugeParam_ex_buf;
3454  QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf;
3455  memcpy(qudaGaugeParam_ex, param, sizeof(QudaGaugeParam));
3456  for(int dir=0; dir<4; ++dir){ qudaGaugeParam_ex->X[dir] = param->X[dir]+4; }
3457 
3458  // fat-link padding
3459  setFatLinkPadding(method, param);
3460  qudaGaugeParam_ex->llfat_ga_pad = param->llfat_ga_pad;
3461  qudaGaugeParam_ex->staple_pad = param->staple_pad;
3462  qudaGaugeParam_ex->site_ga_pad = param->site_ga_pad;
3463 
3464  GaugeFieldParam gParam(0, *param);
3466  // create the host fatlink
3470  gParam.gauge = fatlink;
3472  gParam.gauge = longlink;
3473  cpuGaugeField cpuLongLink(gParam);
3474  gParam.gauge = ulink;
3475  cpuGaugeField cpuUnitarizedLink(gParam);
3476 
3477  // create the device fatlink
3478  gParam.pad = param->llfat_ga_pad;
3483  cudaFatLink = new cudaGaugeField(gParam);
3484  if(longlink) cudaLongLink = new cudaGaugeField(gParam);
3485  if(ulink){
3486  cudaUnitarizedLink = new cudaGaugeField(gParam);
3488  }
3489  // create the host sitelink
3490  gParam.pad = 0;
3492  gParam.link_type = param->type;
3494  gParam.gauge = inlink;
3495  cpuGaugeField cpuInLink(gParam);
3496 
3497 
3498  gParam.pad = param->site_ga_pad;
3500  gParam.link_type = param->type;
3501  gParam.reconstruct = param->reconstruct;
3503  cudaGaugeField* cudaInLink = new cudaGaugeField(gParam);
3504 
3505  if(method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME){
3506  for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam_ex->X[dir];
3508  cudaInLinkEx = new cudaGaugeField(gParam);
3509  }
3510 
3511  profileFatLink.Stop(QUDA_PROFILE_INIT);
3512  fatlink::initLatticeConstants(*cudaFatLink, profileFatLink);
3513  profileFatLink.Start(QUDA_PROFILE_INIT);
3514 
3515  cudaGaugeField* inlinkPtr;
3516  if(method == QUDA_COMPUTE_FAT_STANDARD){
3517  llfat_init_cuda(param);
3518  param->ga_pad = param->site_ga_pad;
3519  inlinkPtr = cudaInLink;
3520  }else{
3521  llfat_init_cuda_ex(qudaGaugeParam_ex);
3522  inlinkPtr = cudaInLinkEx;
3523  }
3524  profileFatLink.Stop(QUDA_PROFILE_INIT);
3525 
3526  profileFatLink.Start(QUDA_PROFILE_H2D);
3527  cudaInLink->loadCPUField(cpuInLink, QUDA_CPU_FIELD_LOCATION);
3528  profileFatLink.Stop(QUDA_PROFILE_H2D);
3529 
3530  if(method != QUDA_COMPUTE_FAT_STANDARD){
3531  profileFatLink.Start(QUDA_PROFILE_COMMS);
3532  copyExtendedGauge(*cudaInLinkEx, *cudaInLink, QUDA_CUDA_FIELD_LOCATION);
3533 #ifdef MULTI_GPU
3534  int R[4] = {2, 2, 2, 2};
3535  cudaInLinkEx->exchangeExtendedGhost(R,true); // instead of exchange_cpu_sitelink_ex
3536 #endif
3537  profileFatLink.Stop(QUDA_PROFILE_COMMS);
3538  } // Initialise and load siteLinks
3539 
3540  quda::computeFatLinkCore(inlinkPtr, const_cast<double*>(path_coeff), param, method, cudaFatLink, cudaLongLink, profileFatLink);
3541 
3542  if(ulink){
3543  profileFatLink.Start(QUDA_PROFILE_INIT);
3544  int num_failures=0;
3545  int* num_failures_dev;
3546  cudaMalloc((void**)&num_failures_dev, sizeof(int));
3547  cudaMemset(num_failures_dev, 0, sizeof(int));
3548  if(num_failures_dev == NULL) errorQuda("cudaMalloc fialed for dev_pointer\n");
3549  profileFatLink.Stop(QUDA_PROFILE_INIT);
3550 
3551  profileFatLink.Start(QUDA_PROFILE_COMPUTE);
3552  quda::unitarizeLinksCuda(*param, *cudaFatLink, cudaUnitarizedLink, num_failures_dev); // unitarize on the gpu
3553  profileFatLink.Stop(QUDA_PROFILE_COMPUTE);
3554 
3555 
3556  profileFatLink.Start(QUDA_PROFILE_D2H);
3557  cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
3558  profileFatLink.Stop(QUDA_PROFILE_D2H);
3559  cudaFree(num_failures_dev);
3560  if(num_failures>0){
3561  errorQuda("Error in the unitarization component of the hisq fattening\n");
3562  }
3563  profileFatLink.Start(QUDA_PROFILE_D2H);
3564  cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, QUDA_CPU_FIELD_LOCATION);
3565  profileFatLink.Stop(QUDA_PROFILE_D2H);
3566  }
3567 
3568  profileFatLink.Start(QUDA_PROFILE_D2H);
3569  if(fatlink) cudaFatLink->saveCPUField(cpuFatLink, QUDA_CPU_FIELD_LOCATION);
3570  if(longlink) cudaLongLink->saveCPUField(cpuLongLink, QUDA_CPU_FIELD_LOCATION);
3571  profileFatLink.Stop(QUDA_PROFILE_D2H);
3572 
3573  profileFatLink.Start(QUDA_PROFILE_FREE);
3574  if(longlink) delete cudaLongLink;
3575  delete cudaFatLink;
3576  delete cudaInLink;
3577  delete cudaUnitarizedLink;
3578  if(cudaInLinkEx) delete cudaInLinkEx;
3579  profileFatLink.Stop(QUDA_PROFILE_FREE);
3580 
3581  profileFatLink.Stop(QUDA_PROFILE_TOTAL);
3582 
3583  return;
3584 }
3585 
3586 #endif // GPU_FATLINK
3587 
// NOTE(review): the signature line of this helper was lost in extraction —
// it takes a gauge-field parameter struct (param.x[] is read) and returns an
// int pad; confirm the exact name/signature against the original source.
//
// Returns the gauge-field padding: the largest checkerboarded face volume of
// the local lattice when running multi-GPU, and zero for a single GPU (no
// ghost exchange needed).
  int pad = 0;
#ifdef MULTI_GPU
  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
  int face_size[4];
  // face in dimension 'dir' has volume/X[dir] sites; /2 for one checkerboard parity
  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
  pad = *std::max_element(face_size, face_size+4);
#endif

  return pad;
}
3599 
3600 #ifdef GPU_GAUGE_FORCE
namespace quda {
  // Per-module instantiation of the dslash initialization helpers for the
  // gauge-force code (provides gaugeforce::initLatticeConstants used below).
  namespace gaugeforce {
#include <dslash_init.cuh>
  }
}
3606 #endif
3607 
3608 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
3609  double* loop_coeff, int num_paths, int max_length, double eb3,
3610  QudaGaugeParam* qudaGaugeParam, double* timeinfo)
3611 {
3612 
3613  /*printfQuda("GaugeForce: use_resident_gauge = %d, make_resident_gauge = %d\n",
3614  qudaGaugeParam->use_resident_gauge, qudaGaugeParam->make_resident_gauge);
3615  printfQuda("GaugeForce: use_resident_mom = %d, make_resident_mom = %d\n",
3616  qudaGaugeParam->use_resident_mom, qudaGaugeParam->make_resident_mom);*/
3617 
3618 #ifdef GPU_GAUGE_FORCE
3619  profileGaugeForce.Start(QUDA_PROFILE_TOTAL);
3620  profileGaugeForce.Start(QUDA_PROFILE_INIT);
3621 
3622  checkGaugeParam(qudaGaugeParam);
3623 
3624  GaugeFieldParam gParam(0, *qudaGaugeParam);
3626  gParam.pad = 0;
3627 
3628 #ifdef MULTI_GPU
3629  GaugeFieldParam gParamEx(gParam);
3630  for (int d=0; d<4; d++) gParamEx.x[d] = gParam.x[d] + 4;
3631 #endif
3632 
3634  gParam.gauge = siteLink;
3635  cpuGaugeField *cpuSiteLink = new cpuGaugeField(gParam);
3636 
3637  cudaGaugeField* cudaSiteLink = NULL;
3638 
3639  if (qudaGaugeParam->use_resident_gauge) {
3640  if (!gaugePrecise) errorQuda("No resident gauge field to use");
3641  cudaSiteLink = gaugePrecise;
3642  profileGaugeForce.Stop(QUDA_PROFILE_INIT);
3643  printfQuda("GaugeForce: Using resident gauge field\n");
3644  } else {
3645  gParam.create = QUDA_NULL_FIELD_CREATE;
3646  gParam.reconstruct = qudaGaugeParam->reconstruct;
3647  gParam.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||
3648  qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?
3650 
3651  cudaSiteLink = new cudaGaugeField(gParam);
3652  profileGaugeForce.Stop(QUDA_PROFILE_INIT);
3653 
3654  profileGaugeForce.Start(QUDA_PROFILE_H2D);
3655  cudaSiteLink->loadCPUField(*cpuSiteLink, QUDA_CPU_FIELD_LOCATION);
3656  profileGaugeForce.Stop(QUDA_PROFILE_H2D);
3657  }
3658 
3659  profileGaugeForce.Start(QUDA_PROFILE_INIT);
3660 
3661 #ifndef MULTI_GPU
3662  cudaGaugeField *cudaGauge = cudaSiteLink;
3663  qudaGaugeParam->site_ga_pad = gParam.pad; //need to set this value
3664 #else
3665 
3666  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
3667  gParamEx.reconstruct = qudaGaugeParam->reconstruct;
3668  gParamEx.order = (qudaGaugeParam->reconstruct == QUDA_RECONSTRUCT_NO ||
3669  qudaGaugeParam->cuda_prec == QUDA_DOUBLE_PRECISION) ?
3671  qudaGaugeParam->site_ga_pad = gParamEx.pad;//need to set this value
3672 
3673  cudaGaugeField *cudaGauge = new cudaGaugeField(gParamEx);
3674 
3675  copyExtendedGauge(*cudaGauge, *cudaSiteLink, QUDA_CUDA_FIELD_LOCATION);
3676  int R[4] = {2, 2, 2, 2}; // radius of the extended region in each dimension / direction
3677 
3678  profileGaugeForce.Stop(QUDA_PROFILE_INIT);
3679 
3680  profileGaugeForce.Start(QUDA_PROFILE_COMMS);
3681  cudaGauge->exchangeExtendedGhost(R);
3682  profileGaugeForce.Stop(QUDA_PROFILE_COMMS);
3683  profileGaugeForce.Start(QUDA_PROFILE_INIT);
3684 #endif
3685 
3686  GaugeFieldParam &gParamMom = gParam;
3687  gParamMom.order = qudaGaugeParam->gauge_order;
3688  // FIXME - test program always uses MILC for mom but can use QDP for gauge
3689  if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;
3690  gParamMom.precision = qudaGaugeParam->cpu_prec;
3691  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
3692  gParamMom.create = QUDA_REFERENCE_FIELD_CREATE;
3693  gParamMom.gauge=mom;
3694  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
3695  else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3696 
3697  cpuGaugeField* cpuMom = new cpuGaugeField(gParamMom);
3698 
3699  cudaGaugeField* cudaMom = NULL;
3700  if (qudaGaugeParam->use_resident_mom) {
3701  if (!gaugePrecise) errorQuda("No resident momentum field to use");
3702  cudaMom = momResident;
3703  printfQuda("GaugeForce: Using resident mom field\n");
3704  profileGaugeForce.Stop(QUDA_PROFILE_INIT);
3705  } else {
3706  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
3707  gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
3708  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3709  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
3710  gParamMom.precision = qudaGaugeParam->cuda_prec;
3711  cudaMom = new cudaGaugeField(gParamMom);
3712  profileGaugeForce.Stop(QUDA_PROFILE_INIT);
3713 
3714  profileGaugeForce.Start(QUDA_PROFILE_H2D);
3715  cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
3716  profileGaugeForce.Stop(QUDA_PROFILE_H2D);
3717  }
3718 
3719  gaugeforce::initLatticeConstants(*cudaMom, profileGaugeForce);
3720 
3721  profileGaugeForce.Start(QUDA_PROFILE_CONSTANT);
3722  qudaGaugeParam->mom_ga_pad = gParamMom.pad; //need to set this (until we use order classes)
3723  gauge_force_init_cuda(qudaGaugeParam, max_length);
3724  profileGaugeForce.Stop(QUDA_PROFILE_CONSTANT);
3725 
3726  // actually do the computation
3727  profileGaugeForce.Start(QUDA_PROFILE_COMPUTE);
3728  gauge_force_cuda(*cudaMom, eb3, *cudaGauge, qudaGaugeParam, input_path_buf,
3729  path_length, loop_coeff, num_paths, max_length);
3730  profileGaugeForce.Stop(QUDA_PROFILE_COMPUTE);
3731 
3732  // still need to copy this back even when preserving
3733  profileGaugeForce.Start(QUDA_PROFILE_D2H);
3734  cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
3735  profileGaugeForce.Stop(QUDA_PROFILE_D2H);
3736 
3737  profileGaugeForce.Start(QUDA_PROFILE_FREE);
3738  if (qudaGaugeParam->make_resident_gauge) {
3739  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
3740  gaugePrecise = cudaSiteLink;
3741  } else {
3742  delete cudaSiteLink;
3743  }
3744 
3745  if (qudaGaugeParam->make_resident_mom) {
3746  if (momResident && momResident != cudaMom) delete momResident;
3747  momResident = cudaMom;
3748  } else {
3749  delete cudaMom;
3750  }
3751 
3752  delete cpuSiteLink;
3753  delete cpuMom;
3754 
3755 #ifdef MULTI_GPU
3756  delete cudaGauge;
3757 #endif
3758  profileGaugeForce.Stop(QUDA_PROFILE_FREE);
3759 
3760  profileGaugeForce.Stop(QUDA_PROFILE_TOTAL);
3761 
3762  if(timeinfo){
3763  timeinfo[0] = profileGaugeForce.Last(QUDA_PROFILE_H2D);
3764  timeinfo[1] = profileGaugeForce.Last(QUDA_PROFILE_COMPUTE);
3765  timeinfo[2] = profileGaugeForce.Last(QUDA_PROFILE_D2H);
3766  }
3767 
3768  checkCudaError();
3769 #else
3770  errorQuda("Gauge force has not been built");
3771 #endif // GPU_GAUGE_FORCE
3772  return 0;
3773 }
3774 
3775 
// NOTE(review): the signature line was lost in extraction — from the body
// (reads invertParam->clover_cuda_prec etc.) this is
// void createCloverQuda(QudaInvertParam* invertParam); confirm against the
// original source.
//
// Creates the resident clover field(s) from the resident gauge field.
// Lazily allocates cloverPrecise (and, for twisted-clover, cloverInvPrecise),
// builds an extended gauge field with a two-site ghost border, and computes
// the clover term on the device. The extended gauge field is cached in
// extendedGaugeResident on exit.
{
  profileCloverCreate.Start(QUDA_PROFILE_TOTAL);
  profileCloverCreate.Start(QUDA_PROFILE_INIT);
  if(!cloverPrecise){
    printfQuda("About to create cloverPrecise\n");
    CloverFieldParam cloverParam;
    cloverParam.nDim = 4;
    for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir];
    cloverParam.setPrecision(invertParam->clover_cuda_prec);
    cloverParam.pad = invertParam->cl_pad;
    cloverParam.direct = true;
    cloverParam.inverse = true;
    cloverParam.norm = 0;
    cloverParam.invNorm = 0;
    cloverParam.twisted = false;
    cloverParam.create = QUDA_NULL_FIELD_CREATE;
    cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET;
    // NOTE(review): this second setPrecision overrides the clover_cuda_prec
    // setting above with cuda_prec — presumably intentional, but verify.
    cloverParam.setPrecision(invertParam->cuda_prec);
    if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
    {
      // twisted clover keeps the direct term and the (twisted) inverse term
      // in two separate fields
      cloverParam.direct = true;
      cloverParam.inverse = false;
      cloverPrecise = new cudaCloverField(cloverParam);
      cloverParam.inverse = true;
      cloverParam.direct = false;
      cloverParam.twisted = true;
      cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu;
      cloverInvPrecise = new cudaCloverField(cloverParam); //FIXME Only with tmClover
    } else {
      cloverPrecise = new cudaCloverField(cloverParam);
    }
  }

  int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction
  int y[4];
  for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 2*R[dir];
  int pad = 0;
  // clover creation not supported from 8-reconstruct presently so convert to 12
  // NOTE(review): the declaration of 'recon' (the adjusted reconstruct type)
  // and the trailing constructor arguments were lost in extraction.
  GaugeFieldParam gParamEx(y, gaugePrecise->Precision(), recon, pad,
  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
  gParamEx.order = gaugePrecise->Order();
  gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
  gParamEx.t_boundary = gaugePrecise->TBoundary();
  gParamEx.nFace = 1;
  for (int d=0; d<4; d++) gParamEx.r[d] = R[d];

  cudaGaugeField *cudaGaugeExtended = NULL;
  // NOTE(review): the "if (extendedGaugeResident) {" guard was lost in
  // extraction; the else-branch below allocates a fresh extended field.
  cudaGaugeExtended = extendedGaugeResident;
  profileCloverCreate.Stop(QUDA_PROFILE_INIT);
  } else {
    cudaGaugeExtended = new cudaGaugeField(gParamEx);

    // copy gaugePrecise into the extended device gauge field
#if 1
    profileCloverCreate.Stop(QUDA_PROFILE_INIT);
    profileCloverCreate.Start(QUDA_PROFILE_COMMS);
    cudaGaugeExtended->exchangeExtendedGhost(R,true);
    profileCloverCreate.Stop(QUDA_PROFILE_COMMS);
#else

    gParam.nFace = 1;

    // create an extended gauge field on the host
    for(int dir=0; dir<4; ++dir) gParam.x[dir] += 4;
    cpuGaugeField cpuGaugeExtended(gParam);
    cudaGaugeExtended->saveCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);

    profileCloverCreate.Stop(QUDA_PROFILE_INIT);
    // communicate data
    profileCloverCreate.Start(QUDA_PROFILE_COMMS);
    //exchange_cpu_sitelink_ex(const_cast<int*>(gaugePrecise->X()), R, (void**)cpuGaugeExtended.Gauge_p(),
    // cpuGaugeExtended.Order(),cpuGaugeExtended.Precision(), 0, 4);
    cpuGaugeExtended.exchangeExtendedGhost(R,true);

    cudaGaugeExtended->loadCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);
    profileCloverCreate.Stop(QUDA_PROFILE_COMMS);
#endif
  }

  profileCloverCreate.Start(QUDA_PROFILE_COMPUTE);
#ifdef MULTI_GPU
  computeClover(*cloverPrecise, *cudaGaugeExtended, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);
#else
#endif

  if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
#ifdef MULTI_GPU
    computeClover(*cloverInvPrecise, *cudaGaugeExtended, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); //FIXME Only with tmClover
#else
    computeClover(*cloverInvPrecise, *gaugePrecise, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); //FIXME Only with tmClover
#endif

  profileCloverCreate.Stop(QUDA_PROFILE_COMPUTE);

  profileCloverCreate.Stop(QUDA_PROFILE_TOTAL);

  // FIXME always preserve the extended gauge
  extendedGaugeResident = cudaGaugeExtended;

  return;
}
3890 
// Allocate a device (cuda) gauge field with the requested geometry and,
// when a non-NULL host pointer is supplied, initialize it from the
// MILC-ordered host data. Returns an opaque pointer to the new
// cudaGaugeField; the caller owns it (cf. the destructor wrapper below).
// geometry: 1 -> scalar geometry, 4 -> vector geometry; anything else aborts.
// NOTE(review): doxygen lines 3903/3913 (presumably
// gParam.create = QUDA_REFERENCE_FIELD_CREATE for the host wrapper) are
// missing from this extraction -- confirm against the repository.
3891 void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
3892 {
3893 
3894  GaugeFieldParam gParam(0,*param);
3895  if(geometry == 1){
3896  gParam.geometry = QUDA_SCALAR_GEOMETRY;
3897  }else if(geometry == 4){
3898  gParam.geometry = QUDA_VECTOR_GEOMETRY;
3899  }else{
3900  errorQuda("Only scalar and vector geometries are supported\n");
3901  }
3902  gParam.pad = 0;
3904  gParam.gauge = gauge;
3905  gParam.link_type = QUDA_GENERAL_LINKS;
3906 
3907 
 // device field is zero-initialized in FLOAT2 order
3908  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
3909  gParam.create = QUDA_ZERO_FIELD_CREATE;
3910  cudaGaugeField* cudaGauge = new cudaGaugeField(gParam);
 // only upload host data if a source pointer was actually provided
3911  if(gauge){
3912  gParam.order = QUDA_MILC_GAUGE_ORDER;
3914  cpuGaugeField cpuGauge(gParam);
3915  cudaGauge->loadCPUField(cpuGauge,QUDA_CPU_FIELD_LOCATION);
3916  }
3917  return cudaGauge;
3918 }
3919 
3920 
// Download a device gauge field (opaque handle inGauge) into a
// MILC-ordered host buffer `gauge`. Geometry is taken from the device
// field, so this works for both scalar and vector geometry fields.
// NOTE(review): doxygen lines 3928/3932 are missing from this extraction
// (likely gParam.create = QUDA_REFERENCE_FIELD_CREATE) -- confirm upstream.
3921 void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){
3922 
3923  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
3924 
 // wrap the caller's host buffer so saveCPUField can write into it directly
3925  GaugeFieldParam gParam(0,*param);
3926  gParam.geometry = cudaGauge->Geometry();
3927  gParam.pad = 0;
3929  gParam.gauge = gauge;
3930  gParam.link_type = QUDA_GENERAL_LINKS;
3931  gParam.order = QUDA_MILC_GAUGE_ORDER;
3933 
3934  cpuGaugeField cpuGauge(gParam);
3935  cudaGauge->saveCPUField(cpuGauge,QUDA_CPU_FIELD_LOCATION);
3936 }
3937 
3938 
// Create an extended (border/halo-carrying) device gauge field. If a host
// pointer is given, the interior is uploaded from it and the radius-2 halo
// is filled via ghost-zone exchange; otherwise a zeroed extended field is
// returned for the caller to populate. May return the resident extended
// gauge field instead of allocating when param->use_resident_gauge is set
// (vector geometry only). Caller owns the returned cudaGaugeField unless
// it is the resident one.
// NOTE(review): doxygen line 3950 -- the declaration of `geom`
// (QudaFieldGeometry) -- is missing from this extraction, as are
// lines 3968-3969; confirm against the repository.
3939 void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
3940 {
3941  profileExtendedGauge.Start(QUDA_PROFILE_TOTAL);
3942 
 // fast path: reuse the already-extended resident gauge field
3943  if (param->use_resident_gauge && extendedGaugeResident && geometry == 4) {
3944  profileExtendedGauge.Stop(QUDA_PROFILE_TOTAL);
3945  return extendedGaugeResident;
3946  }
3947 
3948  profileExtendedGauge.Start(QUDA_PROFILE_INIT);
3949 
3951  if (geometry == 1) {
3952  geom = QUDA_SCALAR_GEOMETRY;
3953  } else if(geometry == 4) {
3954  geom = QUDA_VECTOR_GEOMETRY;
3955  } else {
3956  errorQuda("Only scalar and vector geometries are supported");
3957  }
3958 
3959  cpuGaugeField* cpuGauge = NULL;
3960  cudaGaugeField* cudaGauge = NULL;
3961 
3962 
3963  // Create the unextended cpu field
3964  GaugeFieldParam gParam(0, *param);
3965  gParam.order = QUDA_MILC_GAUGE_ORDER;
3966  gParam.pad = 0;
3967  gParam.link_type = param->type;
3970  gParam.gauge = gauge;
3971  gParam.geometry = geom;
3972 
3973  if(gauge){
3974  cpuGauge = new cpuGaugeField(gParam);
3975  // Create the unextended GPU field
3976  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
3977  gParam.create = QUDA_NULL_FIELD_CREATE;
3978  cudaGauge = new cudaGaugeField(gParam);
3979  profileExtendedGauge.Stop(QUDA_PROFILE_INIT);
3980 
3981  // load the data into the unextended device field
3982  profileExtendedGauge.Start(QUDA_PROFILE_H2D);
3983  cudaGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
3984  profileExtendedGauge.Stop(QUDA_PROFILE_H2D);
3985 
3986  profileExtendedGauge.Start(QUDA_PROFILE_INIT);
3987  }
3988 
 // the extended field is 4 sites larger in every dimension (2-site halo per side)
3989  QudaGaugeParam param_ex;
3990  memcpy(&param_ex, param, sizeof(QudaGaugeParam));
3991  for(int dir=0; dir<4; ++dir) param_ex.X[dir] = param->X[dir]+4;
3992  GaugeFieldParam gParam_ex(0, param_ex);
3993  gParam_ex.link_type = param->type;
3994  gParam_ex.geometry = geom;
3995  gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
3996  gParam_ex.create = QUDA_ZERO_FIELD_CREATE;
3997  gParam_ex.pad = 0;
3998  gParam_ex.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
3999  // create the extended gauge field
4000  cudaGaugeField* cudaGaugeEx = new cudaGaugeField(gParam_ex);
4001 
4002  // copy data from the interior into the border region
4003  if(gauge) copyExtendedGauge(*cudaGaugeEx, *cudaGauge, QUDA_CUDA_FIELD_LOCATION);
4004 
4005  profileExtendedGauge.Stop(QUDA_PROFILE_INIT);
4006  if(gauge){
4007  int R[4] = {2,2,2,2};
4008  // communicate
4009  profileExtendedGauge.Start(QUDA_PROFILE_COMMS);
4010  cudaGaugeEx->exchangeExtendedGhost(R, true);
4011  profileExtendedGauge.Stop(QUDA_PROFILE_COMMS);
 // the unextended staging fields are no longer needed
4012  if (cpuGauge) delete cpuGauge;
4013  if (cudaGauge) delete cudaGauge;
4014  }
4015  profileExtendedGauge.Stop(QUDA_PROFILE_TOTAL);
4016 
4017  return cudaGaugeEx;
4018 }
4019 
4020 // extend field on the GPU
4021 void extendGaugeField(void* out, void* in){
4022  cudaGaugeField* inGauge = reinterpret_cast<cudaGaugeField*>(in);
4023  cudaGaugeField* outGauge = reinterpret_cast<cudaGaugeField*>(out);
4024 
4025  copyExtendedGauge(*outGauge, *inGauge, QUDA_CUDA_FIELD_LOCATION);
4026 
4027  int R[4] = {2,2,2,2};
4028  outGauge->exchangeExtendedGhost(R,true);
4029 
4030  return;
4031 }
4032 
4033 
4034 
// NOTE(review): the signature line (doxygen line 4035) was lost in this
// extraction; the visible body frees a device gauge field previously
// handed out as an opaque void* handle -- confirm the function name
// against the repository before relying on this comment.
4036  cudaGaugeField* g = reinterpret_cast<cudaGaugeField*>(gauge);
 // cast the opaque handle back to its concrete type and release it
4037  delete g;
4038 }
4039 
4040 
// NOTE(review): the opening signature line (doxygen line 4041, presumably
// "void computeCloverTraceQuda(void* out,") and line 4054 (the actual
// sigma-trace call site) are missing from this extraction. As visible, the
// body only checks that the resident clover field exists; the compute call
// itself is commented out at line 4055 -- confirm upstream.
4042  void *clov,
4043  int mu,
4044  int nu,
4045  int dim[4])
4046 {
4047 
4048  profileCloverTrace.Start(QUDA_PROFILE_TOTAL);
4049 
4050 
4051  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(out);
4052 
 // require a resident clover field before attempting the trace
4053  if(cloverPrecise){
4055  //computeCloverSigmaTrace(*cudaGauge, cudaClover, mu, nu, QUDA_CUDA_FIELD_LOCATION);
4056  }else{
4057  errorQuda("cloverPrecise not set\n");
4058  }
4059  profileCloverTrace.Stop(QUDA_PROFILE_TOTAL);
4060  return;
4061 }
4062 
4063 
// Compute the derivative of the clover term on the device.
// All three field arguments (out, gauge, oprod) are opaque handles to
// cudaGaugeField objects already resident on the GPU; the host-field setup
// below is dead unless USE_EXTENDED_VOLUME is undefined (and it is force-
// defined just below, so the #ifndef branches never compile).
// NOTE(review): the opening signature line (doxygen line 4064) and the
// parameter line declaring `parity` (doxygen line 4069) are missing from
// this extraction -- `parity` used at the cloverDerivative call is
// presumably a QudaParity parameter; confirm upstream.
4065  void* gauge,
4066  void* oprod,
4067  int mu, int nu,
4068  double coeff,
4070  QudaGaugeParam* param,
4071  int conjugate)
4072 {
4073  profileCloverDerivative.Start(QUDA_PROFILE_TOTAL);
4074 
4075  checkGaugeParam(param);
4076 
4077  profileCloverDerivative.Start(QUDA_PROFILE_INIT);
 // force the extended-volume path: the #ifndef blocks below are disabled
4078 #ifndef USE_EXTENDED_VOLUME
4079 #define USE_EXTENDED_VOLUME
4080 #endif
4081 
4082  // create host fields
4083  GaugeFieldParam gParam(0, *param);
4084  gParam.order = QUDA_MILC_GAUGE_ORDER;
4085  gParam.pad = 0;
4086  gParam.geometry = QUDA_SCALAR_GEOMETRY;
4087  gParam.link_type = QUDA_GENERAL_LINKS;
4089  // gParam.gauge = out;
4090  // cpuGaugeField cpuOut(gParam);
4091 #ifndef USE_EXTENDED_VOLUME
4092  gParam.geometry = QUDA_SCALAR_GEOMETRY;
4093  gParam.link_type = QUDA_GENERAL_LINKS;
4094  gParam.gauge = oprod;
4095  cpuGaugeField cpuOprod(gParam);
4096 
4097  gParam.geometry = QUDA_VECTOR_GEOMETRY;
4098  gParam.link_type = QUDA_SU3_LINKS;
4099  gParam.gauge = gauge;
4100  cpuGaugeField cpuGauge(gParam);
4101 #endif
4102 
4103  /*
4104  // create device fields
4105  gParam.geometry = QUDA_SCALAR_GEOMETRY;
4106  gParam.link_type = QUDA_GENERAL_LINKS;
4107  gParam.create = QUDA_NULL_FIELD_CREATE;
4108  // gParam.pad = getGaugePadding(gParam);
4109  gParam.pad = 0;
4110  gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4111  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4112  gParam.create = QUDA_ZERO_FIELD_CREATE;
4113  // cudaGaugeField cudaOut(gParam);
4114  */
4115 
4116 #ifndef USE_EXTENDED_VOLUME
4117  cudaGaugeField cudaOprod(gParam);
4118 
4119  gParam.geometry = QUDA_VECTOR_GEOMETRY;
4120  gParam.link_type = QUDA_SU3_LINKS;
4121  cudaGaugeField cudaGauge(gParam);
4122 #endif
4123  profileCloverDerivative.Stop(QUDA_PROFILE_INIT);
4124 
 // arguments arrive as opaque device-field handles; no H2D transfer needed
4125  cudaGaugeField* cudaOut = reinterpret_cast<cudaGaugeField*>(out);
4126  cudaGaugeField* gPointer = reinterpret_cast<cudaGaugeField*>(gauge);
4127  cudaGaugeField* oPointer = reinterpret_cast<cudaGaugeField*>(oprod);
4128 
4129 
4130  profileCloverDerivative.Start(QUDA_PROFILE_COMPUTE);
4131  cloverDerivative(*cudaOut, *gPointer, *oPointer, mu, nu, coeff, parity, conjugate);
4132  profileCloverDerivative.Stop(QUDA_PROFILE_COMPUTE);
4133 
4134 
 // D2H section is currently a no-op: the result stays on the device
4135  profileCloverDerivative.Start(QUDA_PROFILE_D2H);
4136 
4137  // saveGaugeField(out, cudaOut, param);
4138  // cudaOut->saveCPUField(cpuOut, QUDA_CPU_FIELD_LOCATION);
4139  profileCloverDerivative.Stop(QUDA_PROFILE_D2H);
4140  checkCudaError();
4141 
4142  // delete cudaOut;
4143 
4144  profileCloverDerivative.Stop(QUDA_PROFILE_TOTAL);
4145 
4146  return;
4147 }
4148 
// Staggered (KS) quark-field outer-product computation.
// CURRENTLY A NO-OP STUB: the entire implementation is commented out below,
// so callers (e.g. computeStaggeredForceQuda) get no outer product from this
// routine. The commented body shows the intended even/odd-parity
// computeStaggeredOprod calls.
// NOTE(review): the final parameter line (doxygen line 4153, presumably
// "QudaPrecision prec)") is missing from this extraction.
4149 void computeKSOprodQuda(void* oprod,
4150  void* fermion,
4151  double coeff,
4152  int X[4],
4154 
4155 {
4156 /*
4157  using namespace quda;
4158 
4159  cudaGaugeField* cudaOprod;
4160  cudaColorSpinorField* cudaQuark;
4161 
4162  const int Ls = 1;
4163  const int Ninternal = 6;
4164 #ifdef BUILD_TIFR_INTERFACE
4165  const int Nface = 1;
4166 #else
4167  const int Nface = 3;
4168 #endif
4169  FaceBuffer fB(X, 4, Ninternal, Nface, prec, Ls);
4170  cudaOprod = reinterpret_cast<cudaGaugeField*>(oprod);
4171  cudaQuark = reinterpret_cast<cudaColorSpinorField*>(fermion);
4172 
4173  double new_coeff[2] = {0,0};
4174  new_coeff[0] = coeff;
4175  // Operate on even-parity sites
4176  computeStaggeredOprod(*cudaOprod, *cudaOprod, *cudaQuark, fB, 0, new_coeff);
4177 
4178  // Operator on odd-parity sites
4179  computeStaggeredOprod(*cudaOprod, *cudaOprod, *cudaQuark, fB, 1, new_coeff);
4180 
4181 */
4182  return;
4183 }
4184 
// Compute the staggered fermion force using the resident solution and
// resident momentum fields; both pointer arguments are overridden by the
// resident fields (errorQuda aborts if either is absent). On completion the
// resident solution is destroyed.
// NOTE(review): doxygen lines 4206-4207 (the declaration/initialization of
// `oParam`, presumably GaugeFieldParam oParam(gaugePrecise->X(), ...)) and
// line 4224 (likely the completeKSForce call that actually accumulates into
// `mom`) are missing from this extraction -- as visible, `mom` is computed
// but never used, and computeKSOprodQuda is currently a stub; confirm
// against the repository.
4185 void computeStaggeredForceQuda(void* cudaMom, void* qudaQuark, double coeff)
4186 {
 // the resident solution, if present, replaces the caller-supplied quark field
4187  bool use_resident_solution = false;
4188  if (solutionResident) {
4189  qudaQuark = solutionResident;
4190  use_resident_solution = true;
4191  } else {
4192  errorQuda("No input quark field defined");
4193  }
4194 
4195  if (momResident) {
4196  cudaMom = momResident;
4197  } else {
4198  errorQuda("No input momentum defined");
4199  }
4200 
4201  if (!gaugePrecise) {
4202  errorQuda("No resident gauge field");
4203  }
4204 
4205  int pad = 0;
4208  oParam.create = QUDA_ZERO_FIELD_CREATE;
4209  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4210  oParam.siteSubset = QUDA_FULL_SITE_SUBSET;
4211  oParam.t_boundary = QUDA_PERIODIC_T;
4212  oParam.nFace = 1;
4213 
4214  // create temporary field for quark-field outer product
4215  cudaGaugeField cudaOprod(oParam);
4216 
4217  // compute quark-field outer product
4218  computeKSOprodQuda(&cudaOprod, qudaQuark, coeff,
4219  const_cast<int*>(gaugePrecise->X()),
4220  gaugePrecise->Precision());
4221 
4222  cudaGaugeField* mom = reinterpret_cast<cudaGaugeField*>(cudaMom);
4223 
4225 
 // the resident solution is single-use: free it once consumed
4226  if (use_resident_solution) {
4227  delete solutionResident;
4228  solutionResident = NULL;
4229  }
4230 
4231  return;
4232 }
4233 
4234 
// Compute the asqtad fermion force from one-link and Naik outer-product
// sources plus the fat-link gauge field, accumulating into the MILC-layout
// host momentum. Accumulates flop counts into *flops. Requires the library
// to be built with GPU_HISQ_FORCE; otherwise aborts via errorQuda.
// In the MULTI_GPU path, computation runs on radius-2 extended fields
// (_ex variants) with ghost-zone exchanges between stages.
// NOTE(review): several doxygen lines are missing from this extraction
// (4266, 4268-4269, 4274-4275, 4289-4290, 4295, 4297-4298, 4304-4305,
// 4311-4312, 4325, 4329-4330, 4377) -- these include reconstruct/create/
// precision assignments, the initial gauge H2D load, the extended-gauge
// ghost exchange, and the final momentum D2H save; confirm upstream.
4235 void computeAsqtadForceQuda(void* const milc_momentum,
4236  long long *flops,
4237  const double act_path_coeff[6],
4238  const void* const one_link_src[4],
4239  const void* const naik_src[4],
4240  const void* const link,
4241  const QudaGaugeParam* gParam)
4242 {
4243 
4244 #ifdef GPU_HISQ_FORCE
4245  long long partialFlops;
4246  using namespace quda::fermion_force;
4247  profileAsqtadForce.Start(QUDA_PROFILE_TOTAL);
4248  profileAsqtadForce.Start(QUDA_PROFILE_INIT);
4249 
4250  cudaGaugeField *cudaGauge = NULL;
4251  cpuGaugeField *cpuGauge = NULL;
4252  cudaGaugeField *cudaInForce = NULL;
4253  cpuGaugeField *cpuOneLinkInForce = NULL;
4254  cpuGaugeField *cpuNaikInForce = NULL;
4255  cudaGaugeField *cudaOutForce = NULL;
4256  cudaGaugeField *cudaMom = NULL;
4257  cpuGaugeField *cpuMom = NULL;
4258 
4259 #ifdef MULTI_GPU
4260  cudaGaugeField *cudaGauge_ex = NULL;
4261  cudaGaugeField *cudaInForce_ex = NULL;
4262  cudaGaugeField *cudaOutForce_ex = NULL;
4263 #endif
4264 
4265  GaugeFieldParam param(0, *gParam);
4267  param.anisotropy = 1.0;
4270  param.t_boundary = QUDA_PERIODIC_T;
4271  param.nFace = 1;
4272 
4273  param.link_type = QUDA_GENERAL_LINKS;
4276 
4277  // create host fields
4278  param.gauge = (void*)link;
4279  cpuGauge = new cpuGaugeField(param);
4280 
 // outer-product sources arrive in QDP order (one pointer per direction)
4281  param.order = QUDA_QDP_GAUGE_ORDER;
4282  param.gauge = (void*)one_link_src;
4283  cpuOneLinkInForce = new cpuGaugeField(param);
4284 
4285  param.gauge = (void*)naik_src;
4286  cpuNaikInForce = new cpuGaugeField(param);
4287 
4288  param.order = QUDA_MILC_GAUGE_ORDER;
4291  param.gauge = milc_momentum;
4292  cpuMom = new cpuGaugeField(param);
4293 
4294  // create device fields
4296  param.link_type = QUDA_GENERAL_LINKS;
4299 
4300  cudaGauge = new cudaGaugeField(param);
4301  cudaInForce = new cudaGaugeField(param);
4302  cudaOutForce = new cudaGaugeField(param);
4303 
4306  cudaMom = new cudaGaugeField(param);
4307 
4308 #ifdef MULTI_GPU
 // extended fields: +4 sites per dimension (2-site halo on each side)
4309  for(int dir=0; dir<4; ++dir) param.x[dir] += 4;
4310  param.link_type = QUDA_GENERAL_LINKS;
4313 
4314  cudaGauge_ex = new cudaGaugeField(param);
4315  cudaInForce_ex = new cudaGaugeField(param);
4316  cudaOutForce_ex = new cudaGaugeField(param);
4317 #endif
4318  profileAsqtadForce.Stop(QUDA_PROFILE_INIT);
4319 
4320 #ifdef MULTI_GPU
4321  int R[4] = {2, 2, 2, 2};
4322 #endif
4323 
4324  profileAsqtadForce.Start(QUDA_PROFILE_H2D);
4326  profileAsqtadForce.Stop(QUDA_PROFILE_H2D);
4327 #ifdef MULTI_GPU
4328  cudaMemset((void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());
4331 #endif
4332 
 // stage 1 input: one-link outer product
4333  profileAsqtadForce.Start(QUDA_PROFILE_H2D);
4334  cudaInForce->loadCPUField(*cpuOneLinkInForce, QUDA_CPU_FIELD_LOCATION);
4335  profileAsqtadForce.Stop(QUDA_PROFILE_H2D);
4336 #ifdef MULTI_GPU
4337  cudaMemset((void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());
4338  copyExtendedGauge(*cudaInForce_ex, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);
4339  cudaInForce_ex->exchangeExtendedGhost(R,true);
4340 #endif
4341 
4342  cudaMemset((void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4343  profileAsqtadForce.Start(QUDA_PROFILE_COMPUTE);
4344 #ifdef MULTI_GPU
4345  cudaMemset((void**)(cudaOutForce_ex->Gauge_p()), 0, cudaOutForce_ex->Bytes());
4346  hisqStaplesForceCuda(act_path_coeff, *gParam, *cudaInForce_ex, *cudaGauge_ex, cudaOutForce_ex, &partialFlops);
4347  *flops += partialFlops;
4348 #else
4349  hisqStaplesForceCuda(act_path_coeff, *gParam, *cudaInForce, *cudaGauge, cudaOutForce, &partialFlops);
4350  *flops += partialFlops;
4351 #endif
4352  profileAsqtadForce.Stop(QUDA_PROFILE_COMPUTE);
4353 
 // stage 2 input: Naik (3-link) outer product reuses cudaInForce
4354  profileAsqtadForce.Start(QUDA_PROFILE_H2D);
4355  cudaInForce->loadCPUField(*cpuNaikInForce, QUDA_CPU_FIELD_LOCATION);
4356 #ifdef MULTI_GPU
4357  copyExtendedGauge(*cudaInForce_ex, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);
4358  cudaInForce_ex->exchangeExtendedGhost(R,true);
4359 #endif
4360  profileAsqtadForce.Stop(QUDA_PROFILE_H2D);
4361 
4362  profileAsqtadForce.Start(QUDA_PROFILE_COMPUTE);
4363 #ifdef MULTI_GPU
4364  hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *cudaInForce_ex, *cudaGauge_ex, cudaOutForce_ex, &partialFlops);
4365  *flops += partialFlops;
4366  completeKSForce(*cudaMom, *cudaOutForce_ex, *cudaGauge_ex, QUDA_CUDA_FIELD_LOCATION, &partialFlops);
4367  *flops += partialFlops;
4368 #else
4369  hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *cudaInForce, *cudaGauge, cudaOutForce, &partialFlops);
4370  *flops += partialFlops;
4371  hisqCompleteForceCuda(*gParam, *cudaOutForce, *cudaGauge, cudaMom, &partialFlops);
4372  *flops += partialFlops;
4373 #endif
4374  profileAsqtadForce.Stop(QUDA_PROFILE_COMPUTE);
4375 
4376  profileAsqtadForce.Start(QUDA_PROFILE_D2H);
4378  profileAsqtadForce.Stop(QUDA_PROFILE_D2H);
4379 
4380  profileAsqtadForce.Start(QUDA_PROFILE_FREE);
4381  delete cudaInForce;
4382  delete cudaOutForce;
4383  delete cudaGauge;
4384  delete cudaMom;
4385 #ifdef MULTI_GPU
4386  delete cudaInForce_ex;
4387  delete cudaOutForce_ex;
4388  delete cudaGauge_ex;
4389 #endif
4390 
4391  delete cpuGauge;
4392  delete cpuOneLinkInForce;
4393  delete cpuNaikInForce;
4394  delete cpuMom;
4395 
4396  profileAsqtadForce.Stop(QUDA_PROFILE_FREE);
4397 
4398  profileAsqtadForce.Stop(QUDA_PROFILE_TOTAL);
4399  return;
4400 
4401 #else
4402  errorQuda("HISQ force has not been built");
4403 #endif
4404 
4405 }
4406 
// UNIMPLEMENTED STUB: intended to compute the complete HISQ force by first
// forming the staggered outer products from the quark fields and then
// delegating to computeHISQForceQuda; the sketched call sequence is kept in
// the comment below but nothing is executed -- this function is a no-op.
4407 void
4408 computeHISQForceCompleteQuda(void* const milc_momentum,
4409  const double level2_coeff[6],
4410  const double fat7_coeff[6],
4411  void** quark_array,
4412  int num_terms,
4413  double** quark_coeff,
4414  const void* const w_link,
4415  const void* const v_link,
4416  const void* const u_link,
4417  const QudaGaugeParam* gParam)
4418 {
4419 
4420 /*
4421  void* oprod[2];
4422 
4423  computeStaggeredOprodQuda(void** oprod,
4424  void** fermion,
4425  int num_terms,
4426  double** coeff,
4427  QudaGaugeParam* gParam)
4428 
4429  computeHISQForceQuda(milc_momentum,
4430  level2_coeff,
4431  fat7_coeff,
4432  staple_src,
4433  one_link_src,
4434  naik_src,
4435  w_link,
4436  v_link,
4437  u_link,
4438  gParam);
4439 
4440 */
4441  return;
4442 }
4443 
4444 
4445 
4446 
// Compute the full HISQ fermion force: level-2 (asqtad-like) staple and
// Naik terms on the fine (w) links, unitarization of the intermediate
// force with failure detection, then the fat7 staple term on the u links,
// finishing with force completion into the MILC-layout host momentum.
// Accumulates flop counts into *flops. Requires GPU_HISQ_FORCE; aborts
// otherwise. In the MULTI_GPU path all compute runs on radius-2 extended
// fields with ghost exchanges between stages.
// NOTE(review): numerous doxygen lines are missing from this extraction
// (4473, 4475-4476, 4480-4481, 4486-4487, 4494, 4496-4497, 4505, 4512,
// 4514, 4518, 4523-4524, 4561, 4566, 4620, 4624, 4656, 4660, 4674) --
// these include reconstruct/create/precision assignments, the w/v/u link
// H2D loads, and the final momentum D2H save; confirm upstream.
4448 computeHISQForceQuda(void* const milc_momentum,
4449  long long *flops,
4450  const double level2_coeff[6],
4451  const double fat7_coeff[6],
4452  const void* const staple_src[4],
4453  const void* const one_link_src[4],
4454  const void* const naik_src[4],
4455  const void* const w_link,
4456  const void* const v_link,
4457  const void* const u_link,
4458  const QudaGaugeParam* gParam)
4459 {
4460 #ifdef GPU_HISQ_FORCE
4461 
4462  long long partialFlops;
4463 
4464  using namespace quda::fermion_force;
4465  profileHISQForce.Start(QUDA_PROFILE_TOTAL);
4466  profileHISQForce.Start(QUDA_PROFILE_INIT);
4467 
4468  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4469  // You have to look at the MILC routine to understand the following
4470  // Basically, I have already absorbed the one-link coefficient
4471 
4472  GaugeFieldParam param(0, *gParam);
4474  param.order = QUDA_MILC_GAUGE_ORDER;
4477  param.gauge = (void*)milc_momentum;
4478  cpuGaugeField* cpuMom = new cpuGaugeField(param);
4479 
4482  cudaGaugeField* cudaMom = new cudaGaugeField(param);
4483 
 // host wrappers around the three link fields (MILC order)
4484  param.order = QUDA_MILC_GAUGE_ORDER;
4485  param.link_type = QUDA_GENERAL_LINKS;
4488  param.gauge = (void*)w_link;
4489  cpuGaugeField cpuWLink(param);
4490  param.gauge = (void*)v_link;
4491  cpuGaugeField cpuVLink(param);
4492  param.gauge = (void*)u_link;
4493  cpuGaugeField cpuULink(param);
4495 
4498  cudaGaugeField* cudaGauge = new cudaGaugeField(param);
4499 
4500  cpuGaugeField* cpuStapleForce;
4501  cpuGaugeField* cpuOneLinkForce;
4502  cpuGaugeField* cpuNaikForce;
4503 
 // outer-product sources arrive in QDP order (one pointer per direction)
4504  param.order = QUDA_QDP_GAUGE_ORDER;
4506  param.gauge = (void*)staple_src;
4507  cpuStapleForce = new cpuGaugeField(param);
4508  param.gauge = (void*)one_link_src;
4509  cpuOneLinkForce = new cpuGaugeField(param);
4510  param.gauge = (void*)naik_src;
4511  cpuNaikForce = new cpuGaugeField(param);
4512 
4515  param.link_type = QUDA_GENERAL_LINKS;
4516  param.precision = gParam->cpu_prec;
4517 
4519  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4520 
4521 #ifdef MULTI_GPU
 // extended fields: +4 sites per dimension (2-site halo on each side)
4522  for(int dir=0; dir<4; ++dir) param.x[dir] += 4;
4525  cudaGaugeField* cudaGaugeEx = new cudaGaugeField(param);
4526  cudaGaugeField* cudaInForceEx = new cudaGaugeField(param);
4527  cudaGaugeField* cudaOutForceEx = new cudaGaugeField(param);
4528  cudaGaugeField* gaugePtr = cudaGaugeEx;
4529  cudaGaugeField* inForcePtr = cudaInForceEx;
4530  cudaGaugeField* outForcePtr = cudaOutForceEx;
4531 #else
4532  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4533  cudaGaugeField* gaugePtr = cudaGauge;
4534  cudaGaugeField* inForcePtr = cudaInForce;
4535  cudaGaugeField* outForcePtr = cudaOutForce;
4536 #endif
4537 
4538 
4539  {
4540  // default settings for the unitarization
4541  const double unitarize_eps = 1e-14;
4542  const double hisq_force_filter = 5e-5;
4543  const double max_det_error = 1e-10;
4544  const bool allow_svd = true;
4545  const bool svd_only = false;
4546  const double svd_rel_err = 1e-8;
4547  const double svd_abs_err = 1e-8;
4548 
4549  setUnitarizeForceConstants(unitarize_eps,
4550  hisq_force_filter,
4551  max_det_error,
4552  allow_svd,
4553  svd_only,
4554  svd_rel_err,
4555  svd_abs_err);
4556  }
4557  profileHISQForce.Stop(QUDA_PROFILE_INIT);
4558 
4559 
4560  profileHISQForce.Start(QUDA_PROFILE_H2D);
4562  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4563 #ifdef MULTI_GPU
4564  int R[4] = {2, 2, 2, 2};
4565  profileHISQForce.Start(QUDA_PROFILE_COMMS)
4567  cudaGaugeEx->exchangeExtendedGhost(R,true);
4568  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4569 #endif
4570 
 // stage 1 input: staple outer product
4571  profileHISQForce.Start(QUDA_PROFILE_H2D);
4572  cudaInForce->loadCPUField(*cpuStapleForce, QUDA_CPU_FIELD_LOCATION);
4573  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4574 #ifdef MULTI_GPU
4575  profileHISQForce.Start(QUDA_PROFILE_COMMS);
4576  copyExtendedGauge(*cudaInForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);
4577  cudaInForceEx->exchangeExtendedGhost(R,true);
4578  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4579  profileHISQForce.Start(QUDA_PROFILE_H2D);
4580  cudaInForce->loadCPUField(*cpuOneLinkForce, QUDA_CPU_FIELD_LOCATION);
4581  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4582  profileHISQForce.Start(QUDA_PROFILE_COMMS);
 // the one-link term seeds the output force accumulator
4583  copyExtendedGauge(*cudaOutForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);
4584  cudaOutForceEx->exchangeExtendedGhost(R,true);
4585  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4586 #else
4587  profileHISQForce.Start(QUDA_PROFILE_H2D);
4588  cudaOutForce->loadCPUField(*cpuOneLinkForce, QUDA_CPU_FIELD_LOCATION);
4589  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4590 #endif
4591 
4592  profileHISQForce.Start(QUDA_PROFILE_COMPUTE);
4593  hisqStaplesForceCuda(act_path_coeff, *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);
4594  *flops += partialFlops;
4595  profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);
4596 
4597  // Load naik outer product
4598  profileHISQForce.Start(QUDA_PROFILE_H2D);
4599  cudaInForce->loadCPUField(*cpuNaikForce, QUDA_CPU_FIELD_LOCATION);
4600  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4601 #ifdef MULTI_GPU
4602  profileHISQForce.Start(QUDA_PROFILE_COMMS);
4603  copyExtendedGauge(*cudaInForceEx, *cudaInForce, QUDA_CUDA_FIELD_LOCATION);
4604  cudaInForceEx->exchangeExtendedGhost(R,true);
4605  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4606 #endif
4607 
4608  // Compute Naik three-link term
4609  profileHISQForce.Start(QUDA_PROFILE_COMPUTE);
4610  hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);
4611  *flops += partialFlops;
4612  profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);
4613 #ifdef MULTI_GPU
4614  profileHISQForce.Start(QUDA_PROFILE_COMMS);
4615  cudaOutForceEx->exchangeExtendedGhost(R,true);
4616  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4617 #endif
4618  // load v-link
4619  profileHISQForce.Start(QUDA_PROFILE_H2D);
4621  profileHISQForce.Stop(QUDA_PROFILE_H2D);
4622 #ifdef MULTI_GPU
4623  profileHISQForce.Start(QUDA_PROFILE_COMMS);
4625  cudaGaugeEx->exchangeExtendedGhost(R,true);
4626  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4627 #endif
4628  // Done with cudaInForce. It becomes the output force. Oops!
4629  profileHISQForce.Start(QUDA_PROFILE_INIT);
4630  int numFailures = 0;
4631  int* numFailuresDev;
4632 
 // device-side counter for unitarization failures
4633  if(cudaMalloc((void**)&numFailuresDev, sizeof(int)) == cudaErrorMemoryAllocation){
4634  errorQuda("cudaMalloc failed for numFailuresDev\n");
4635  }
4636  cudaMemset(numFailuresDev, 0, sizeof(int));
4637  profileHISQForce.Stop(QUDA_PROFILE_INIT);
4638 
4639 
4640  profileHISQForce.Start(QUDA_PROFILE_COMPUTE);
4641  unitarizeForceCuda(*outForcePtr, *gaugePtr, inForcePtr, numFailuresDev, &partialFlops);
4642  *flops += partialFlops;
4643  profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);
4644  profileHISQForce.Start(QUDA_PROFILE_D2H);
4645  cudaMemcpy(&numFailures, numFailuresDev, sizeof(int), cudaMemcpyDeviceToHost);
4646  profileHISQForce.Stop(QUDA_PROFILE_D2H);
4647  cudaFree(numFailuresDev);
4648 
4649  if(numFailures>0){
4650  errorQuda("Error in the unitarization component of the hisq fermion force\n");
4651  exit(1);
4652  }
 // reset the output accumulator before the fat7 stage
4653  cudaMemset((void**)(outForcePtr->Gauge_p()), 0, outForcePtr->Bytes());
4654  // read in u-link
4655  profileHISQForce.Start(QUDA_PROFILE_COMPUTE);
4657  profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);
4658 #ifdef MULTI_GPU
4659  profileHISQForce.Start(QUDA_PROFILE_COMMS);
4661  cudaGaugeEx->exchangeExtendedGhost(R,true);
4662  profileHISQForce.Stop(QUDA_PROFILE_COMMS);
4663 #endif
4664  // Compute Fat7-staple term
4665  profileHISQForce.Start(QUDA_PROFILE_COMPUTE);
4666  hisqStaplesForceCuda(fat7_coeff, *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);
4667  *flops += partialFlops;
4668  hisqCompleteForceCuda(*gParam, *outForcePtr, *gaugePtr, cudaMom, &partialFlops);
4669  *flops += partialFlops;
4670  profileHISQForce.Stop(QUDA_PROFILE_COMPUTE);
4671 
4672  profileHISQForce.Start(QUDA_PROFILE_D2H);
4673  // Close the paths, make anti-hermitian, and store in compressed format
4675  profileHISQForce.Stop(QUDA_PROFILE_D2H);
4676 
4677 
4678 
4679  profileHISQForce.Start(QUDA_PROFILE_FREE);
4680 
4681  delete cpuStapleForce;
4682  delete cpuOneLinkForce;
4683  delete cpuNaikForce;
4684  delete cpuMom;
4685 
4686  delete cudaInForce;
4687  delete cudaGauge;
4688  delete cudaMom;
4689 
4690 #ifdef MULTI_GPU
4691  delete cudaInForceEx;
4692  delete cudaOutForceEx;
4693  delete cudaGaugeEx;
4694 #else
4695  delete cudaOutForce;
4696 #endif
4697  profileHISQForce.Stop(QUDA_PROFILE_FREE);
4698  profileHISQForce.Stop(QUDA_PROFILE_TOTAL);
4699  return;
4700 #else
4701  errorQuda("HISQ force has not been built");
4702 #endif
4703 }
4704 
// Compute the staggered quark outer products for num_terms quark fields,
// accumulating the one-hop term into oprod[0] and the three-hop term into
// oprod[1] (both QDP-ordered host fields). Each host quark field stores the
// even-parity spinor followed immediately by the odd-parity spinor; both
// parities are processed per term. Results are copied back to the host on
// completion.
// NOTE(review): doxygen lines 4723, 4725 and 4727 are missing from this
// extraction (presumably oParam.create = QUDA_REFERENCE_FIELD_CREATE and
// the reconstruct/precision assignments); confirm upstream.
4705 void computeStaggeredOprodQuda(void** oprod,
4706  void** fermion,
4707  int num_terms,
4708  double** coeff,
4709  QudaGaugeParam* param)
4710 {
4711  using namespace quda;
4712  profileStaggeredOprod.Start(QUDA_PROFILE_TOTAL);
4713 
4714  checkGaugeParam(param);
4715 
4716  profileStaggeredOprod.Start(QUDA_PROFILE_INIT);
4717  GaugeFieldParam oParam(0, *param);
4718 
4719  oParam.nDim = 4;
4720  oParam.nFace = 0;
4721  // create the host outer-product field
4722  oParam.pad = 0;
4724  oParam.link_type = QUDA_GENERAL_LINKS;
4726  oParam.order = QUDA_QDP_GAUGE_ORDER;
4728  oParam.gauge = oprod[0];
4729  oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; // no need for ghost exchange here
4730  cpuGaugeField cpuOprod0(oParam);
4731 
4732  oParam.gauge = oprod[1];
4733  cpuGaugeField cpuOprod1(oParam);
4734 
4735  // create the device outer-product field
4736  oParam.create = QUDA_ZERO_FIELD_CREATE;
4737  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4738  cudaGaugeField cudaOprod0(oParam);
4739  cudaGaugeField cudaOprod1(oParam);
4740  profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);
4741 
4742  //initLatticeConstants(cudaOprod0, profileStaggeredOprod);
4743 
 // upload any previously accumulated outer products so we add to them
4744  profileStaggeredOprod.Start(QUDA_PROFILE_H2D);
4745  cudaOprod0.loadCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);
4746  cudaOprod1.loadCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);
4747  profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);
4748 
4749 
4750  profileStaggeredOprod.Start(QUDA_PROFILE_INIT);
4751 
4752 
4753 
 // single-parity (checkerboard) quark-field descriptor: x[0] is halved below
4754  ColorSpinorParam qParam;
4755  qParam.nColor = 3;
4756  qParam.nSpin = 1;
4757  qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
4758  qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;
4759  qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
4760  qParam.nDim = 4;
4761  qParam.precision = oParam.precision;
4762  qParam.pad = 0;
4763  for(int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4764  qParam.x[0] /= 2;
4765 
4766  // create the device quark field
4767  qParam.create = QUDA_NULL_FIELD_CREATE;
4768  qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
4769  cudaColorSpinorField cudaQuarkEven(qParam);
4770  cudaColorSpinorField cudaQuarkOdd(qParam);
4771 
4772  // create the host quark field
4773  qParam.create = QUDA_REFERENCE_FIELD_CREATE;
4774  qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;
4775 
4776  const int Ls = 1;
4777  const int Ninternal = 6;
 // separate face buffers for the even- and odd-parity passes
4778  FaceBuffer faceBuffer1(cudaOprod0.X(), 4, Ninternal, 3, cudaOprod0.Precision(), Ls);
4779  FaceBuffer faceBuffer2(cudaOprod0.X(), 4, Ninternal, 3, cudaOprod0.Precision(), Ls);
4780  profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);
4781 
4782  // loop over different quark fields
4783  for(int i=0; i<num_terms; ++i){
4784 
4785  // Wrap the even-parity MILC quark field
4786  profileStaggeredOprod.Start(QUDA_PROFILE_INIT);
4787  qParam.v = fermion[i];
4788  cpuColorSpinorField cpuQuarkEven(qParam); // create host quark field
 // odd-parity data directly follows the even-parity data in host memory
4789  qParam.v = (char*)fermion[i] + cpuQuarkEven.RealLength()*cpuQuarkEven.Precision();
4790  cpuColorSpinorField cpuQuarkOdd(qParam); // create host field
4791  profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);
4792 
4793  profileStaggeredOprod.Start(QUDA_PROFILE_H2D);
4794  cudaQuarkEven = cpuQuarkEven;
4795  cudaQuarkOdd = cpuQuarkOdd;
4796  profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);
4797 
4798 
4799  profileStaggeredOprod.Start(QUDA_PROFILE_COMPUTE);
4800  // Operate on even-parity sites
4801  computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer1, 0, coeff[i]);
4802 
4803  // Operate on odd-parity sites
4804  computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer2, 1, coeff[i]);
4805  profileStaggeredOprod.Stop(QUDA_PROFILE_COMPUTE);
4806  }
4807 
4808 
4809  // copy the outer product field back to the host
4810  profileStaggeredOprod.Start(QUDA_PROFILE_D2H);
4811  cudaOprod0.saveCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);
4812  cudaOprod1.saveCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);
4813  profileStaggeredOprod.Stop(QUDA_PROFILE_D2H);
4814 
4815 
4816  profileStaggeredOprod.Stop(QUDA_PROFILE_TOTAL);
4817 
4818  checkCudaError();
4819  return;
4820 }
4821 
4822 
4823 /*
4824  void computeStaggeredOprodQuda(void** oprod,
4825  void** fermion,
4826  int num_terms,
4827  double** coeff,
4828  QudaGaugeParam* param)
4829  {
4830  using namespace quda;
4831  profileStaggeredOprod.Start(QUDA_PROFILE_TOTAL);
4832 
4833  checkGaugeParam(param);
4834 
4835  profileStaggeredOprod.Start(QUDA_PROFILE_INIT);
4836  GaugeFieldParam oParam(0, *param);
4837 
4838  oParam.nDim = 4;
4839  oParam.nFace = 0;
4840 // create the host outer-product field
4841 oParam.pad = 0;
4842 oParam.create = QUDA_REFERENCE_FIELD_CREATE;
4843 oParam.link_type = QUDA_GENERAL_LINKS;
4844 oParam.reconstruct = QUDA_RECONSTRUCT_NO;
4845 oParam.order = QUDA_QDP_GAUGE_ORDER;
4846 oParam.gauge = oprod[0];
4847 cpuGaugeField cpuOprod0(oParam);
4848 
4849 oParam.gauge = oprod[1];
4850 cpuGaugeField cpuOprod1(oParam);
4851 
4852 // create the device outer-product field
4853 oParam.create = QUDA_ZERO_FIELD_CREATE;
4854 oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4855 cudaGaugeField cudaOprod0(oParam);
4856 cudaGaugeField cudaOprod1(oParam);
4857 initLatticeConstants(cudaOprod0, profileStaggeredOprod);
4858 
4859 profileStaggeredOprod.Stop(QUDA_PROFILE_INIT);
4860 
4861 
4862 profileStaggeredOprod.Start(QUDA_PROFILE_H2D);
4863 cudaOprod0.loadCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);
4864 cudaOprod1.loadCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);
4865 profileStaggeredOprod.Stop(QUDA_PROFILE_H2D);
4866 
4867 
4868 
4869 ColorSpinorParam qParam;
4870 qParam.nColor = 3;
4871 qParam.nSpin = 1;
4872 qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
4873 qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;
4874 qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
4875 qParam.nDim = 4;
4876 qParam.precision = oParam.precision;
4877 qParam.pad = 0;
4878 for(int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4879 
4880 // create the device quark field
4881 qParam.create = QUDA_NULL_FIELD_CREATE;
4882 qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
4883 cudaColorSpinorField cudaQuark(qParam);
4884 
4885 
4886 cudaColorSpinorField**dQuark = new cudaColorSpinorField*[num_terms];
4887 for(int i=0; i<num_terms; ++i){
4888 dQuark[i] = new cudaColorSpinorField(qParam);
4889 }
4890 
4891 double* new_coeff = new double[num_terms];
4892 
4893 // create the host quark field
4894 qParam.create = QUDA_REFERENCE_FIELD_CREATE;
4895 qParam.fieldOrder = QUDA_SPACE_COLOR_SPIN_FIELD_ORDER;
4896 for(int i=0; i<num_terms; ++i){
4897  qParam.v = fermion[i];
4898  cpuColorSpinorField cpuQuark(qParam);
4899  *(dQuark[i]) = cpuQuark;
4900  new_coeff[i] = coeff[i][0];
4901 }
4902 
4903 
4904 
4905 // loop over different quark fields
4906 for(int i=0; i<num_terms; ++i){
4907  computeKSOprodQuda(&cudaOprod0, dQuark[i], new_coeff[i], oParam.x, oParam.precision);
4908 }
4909 
4910 
4911 
4912 // copy the outer product field back to the host
4913 profileStaggeredOprod.Start(QUDA_PROFILE_D2H);
4914 cudaOprod0.saveCPUField(cpuOprod0,QUDA_CPU_FIELD_LOCATION);
4915 cudaOprod1.saveCPUField(cpuOprod1,QUDA_CPU_FIELD_LOCATION);
4916 profileStaggeredOprod.Stop(QUDA_PROFILE_D2H);
4917 
4918 
4919 for(int i=0; i<num_terms; ++i){
4920  delete dQuark[i];
4921 }
4922 delete[] dQuark;
4923 delete[] new_coeff;
4924 
4925 profileStaggeredOprod.Stop(QUDA_PROFILE_TOTAL);
4926 
4927 checkCudaError();
4928 return;
4929 }
4930 */
4931 
4932 
4933 
4934 
4935 
4937  void* momentum,
4938  double dt,
4939  int conj_mom,
4940  int exact,
4941  QudaGaugeParam* param)
4942 {
4943  profileGaugeUpdate.Start(QUDA_PROFILE_TOTAL);
4944 
4945  checkGaugeParam(param);
4946 
4947  profileGaugeUpdate.Start(QUDA_PROFILE_INIT);
4948  GaugeFieldParam gParam(0, *param);
4949 
4950  // create the host fields
4951  gParam.pad = 0;
4953  gParam.link_type = QUDA_SU3_LINKS;
4955  gParam.gauge = gauge;
4957  cpuGaugeField *cpuGauge = new cpuGaugeField(gParam);
4958 
4959  if (gParam.order == QUDA_TIFR_GAUGE_ORDER) {
4961  } else {
4963  }
4965 
4966  gParam.gauge = momentum;
4967 
4968  cpuGaugeField *cpuMom = new cpuGaugeField(gParam);
4969 
4970  // create the device fields
4971  gParam.create = QUDA_NULL_FIELD_CREATE;
4972  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4975 
4976  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : NULL;
4977 
4978  gParam.pad = param->ga_pad;
4979  gParam.link_type = QUDA_SU3_LINKS;
4980  gParam.reconstruct = param->reconstruct;
4981 
4982  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : NULL;
4983  cudaGaugeField *cudaOutGauge = new cudaGaugeField(gParam);
4984 
4985  profileGaugeUpdate.Stop(QUDA_PROFILE_INIT);
4986 
4987  profileGaugeUpdate.Start(QUDA_PROFILE_H2D);
4988 
4989  /*printfQuda("UpdateGaugeFieldQuda use_resident_gauge = %d, make_resident_gauge = %d\n",
4990  param->use_resident_gauge, param->make_resident_gauge);
4991  printfQuda("UpdateGaugeFieldQuda use_resident_mom = %d, make_resident_mom = %d\n",
4992  param->use_resident_mom, param->make_resident_mom);*/
4993 
4994  if (!param->use_resident_gauge) { // load fields onto the device
4995  cudaInGauge->loadCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
4996  } else { // or use resident fields already present
4997  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
4998  cudaInGauge = gaugePrecise;
4999  gaugePrecise = NULL;
5000  }
5001 
5002  if (!param->use_resident_mom) {
5003  cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
5004  } else {
5005  if (!momResident) errorQuda("No resident mom field allocated");
5006  cudaMom = momResident;
5007  momResident = NULL;
5008  }
5009 
5010  profileGaugeUpdate.Stop(QUDA_PROFILE_H2D);
5011 
5012  // perform the update
5013  profileGaugeUpdate.Start(QUDA_PROFILE_COMPUTE);
5014  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
5015  (bool)conj_mom, (bool)exact);
5016  profileGaugeUpdate.Stop(QUDA_PROFILE_COMPUTE);
5017 
5018  // copy the gauge field back to the host
5019  profileGaugeUpdate.Start(QUDA_PROFILE_D2H);
5020  cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
5021  profileGaugeUpdate.Stop(QUDA_PROFILE_D2H);
5022 
5023  profileGaugeUpdate.Stop(QUDA_PROFILE_TOTAL);
5024 
5025  if (param->make_resident_gauge) {
5026  if (gaugePrecise != NULL) delete gaugePrecise;
5027  gaugePrecise = cudaOutGauge;
5028  } else {
5029  delete cudaOutGauge;
5030  }
5031 
5032  if (param->make_resident_mom) {
5033  if (momResident != NULL && momResident != cudaMom) delete momResident;
5034  momResident = cudaMom;
5035  } else {
5036  delete cudaMom;
5037  }
5038 
5039  delete cudaInGauge;
5040  delete cpuMom;
5041  delete cpuGauge;
5042 
5043  checkCudaError();
5044  return;
5045 }
5046 
5047 
5048 
5049 
5050 /*
5051  The following functions are for the Fortran interface.
5052  */
5053 
// ---------------------------------------------------------------------------
// Fortran interface wrappers.
//
// Fortran passes every argument by reference, so each shim below dereferences
// the scalar arguments and forwards to the corresponding C-API entry point
// (initQuda, loadGaugeQuda, invertQuda, ...).  The trailing underscore in the
// names matches the symbol-mangling convention of common Fortran compilers.
// NOTE(review): the non-consecutive embedded line numbers (5056, 5059-5060,
// 5063 are absent) indicate lines dropped by the documentation extraction —
// presumably blank or declaration lines; verify against the repository source.
// ---------------------------------------------------------------------------
5054 void init_quda_(int *dev) { initQuda(*dev); }
5055 void init_quda_device_(int *dev) { initQudaDevice(*dev); }
5057 void end_quda_() { endQuda(); }
5058 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
5061 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
5062 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
// Single-parity Wilson/clover dslash and clover-term application; the parity
// selector (and the inverse flag for the clover term) arrive by reference.
5064 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
5065  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
5066 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
5067  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
// Full-operator applications: M and M^dag M.
5068 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
5069 { MatQuda(h_out, h_in, inv_param); }
5070 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
5071 { MatDagMatQuda(h_out, h_in, inv_param); }
// Solver entry points; the multi-shift variant takes an array of solution
// pointers, one per shift, bounded by QUDA_MAX_MULTI_SHIFT.
5072 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
5073 { invertQuda(hp_x, hp_b, param); }
5074 void invert_md_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
5075 { invertMDQuda(hp_x, hp_b, param); }
5076 void invert_multishift_quda_(void *hp_x[QUDA_MAX_MULTI_SHIFT], void *hp_b, QudaInvertParam *param)
5077 { invertMultiShiftQuda(hp_x, hp_b, param); }
5079  *param = newQudaGaugeParam();
5080 }
5082  *param = newQudaInvertParam();
5083 }
5084 
5085 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
5086  bool *conj_mom, bool *exact,
5087  QudaGaugeParam *param) {
5088  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
5089 }
5090 
5091 int compute_gauge_force_quda_(void *mom, void *gauge, int *input_path_buf, int *path_length,
5092  double *loop_coeff, int *num_paths, int *max_length, double *dt,
5093  QudaGaugeParam *param) {
5094 
5095  // fortran uses multi-dimensional arrays which we have convert into an array of pointers to pointers
5096  const int dim = 4;
5097  int ***input_path = (int***)safe_malloc(dim*sizeof(int**));
5098  for (int i=0; i<dim; i++) {
5099  input_path[i] = (int**)safe_malloc(*num_paths*sizeof(int*));
5100  for (int j=0; j<*num_paths; j++) {
5101  input_path[i][j] = (int*)safe_malloc(path_length[j]*sizeof(int));
5102  for (int k=0; k<path_length[j]; k++) {
5103  input_path[i][j][k] = input_path_buf[(i* (*num_paths) + j)* (*max_length) + k];
5104  }
5105  }
5106  }
5107 
5108  computeGaugeForceQuda(mom, gauge, input_path, path_length, loop_coeff, *num_paths, *max_length, *dt, param, 0);
5109 
5110  for (int i=0; i<dim; i++) {
5111  for (int j=0; j<*num_paths; j++) { host_free(input_path[i][j]); }
5112  host_free(input_path[i]);
5113  }
5114  host_free(input_path);
5115 
5116  return 0;
5117 }
5118 
// Fortran entry point for the staggered fermion force: dereferences the
// by-reference coefficient and forwards to computeStaggeredForceQuda().
5119 void compute_staggered_force_quda_(void* cudaMom, void* qudaQuark, double *coeff) {
5120  computeStaggeredForceQuda(cudaMom, qudaQuark, *coeff);
5121 }
5122 
5123 // apply the staggered phases
5125  printfQuda("applying staggered phase\n");
5126  if (gaugePrecise) {
5128  } else {
5129  errorQuda("No persistent gauge field");
5130  }
5131 }
5132 
5133 // remove the staggered phases
5135  printfQuda("removing staggered phase\n");
5136  if (gaugePrecise) {
5138  } else {
5139  errorQuda("No persistent gauge field");
5140  }
5141  cudaDeviceSynchronize();
5142 }
5143 
5147 #ifdef MULTI_GPU
// Map a 4-d grid coordinate to a BQCD process rank.
//
// BQCD orders ranks with the t coordinate (index 3) slowest and the x
// coordinate (index 0) fastest, i.e. the rank is the Horner evaluation
//   ((coords[3]*dims[2] + coords[2])*dims[1] + coords[1])*dims[0] + coords[0].
// fdata carries the int[4] grid-dimension array threaded through
// initCommsGridQuda() by comm_set_gridsize_().
static int bqcd_rank_from_coords(const int *coords, void *fdata)
{
  const int *grid = static_cast<const int *>(fdata);

  int r = coords[3];
  r = r * grid[2] + coords[2];
  r = r * grid[1] + coords[1];
  r = r * grid[0] + coords[0];
  return r;
}
5158 #endif
5159 
// Fortran entry point that initializes the 4-d communications grid.  The
// grid-dimension array doubles as the map data handed to
// bqcd_rank_from_coords, which defines the BQCD coordinate->rank ordering.
// Compiles to a no-op in single-GPU builds.
5160 void comm_set_gridsize_(int *grid)
5161 {
5162 #ifdef MULTI_GPU
5163  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
5164 #endif
5165 }
5166 
5170 void set_kernel_pack_t_(int* pack)
5171 {
5172  bool pack_ = *pack ? true : false;
5173  setKernelPackT(pack_);
5174 }
5175 
// Compute the plaquette of the resident gauge field.
// Single-GPU builds measure gaugePrecise directly; multi-GPU builds need a
// border-extended copy (each dimension grown by 4 sites) whose ghost zones
// are filled via exchangeExtendedGhost, cached in extendedGaugeResident so
// subsequent calls skip the construction.
// NOTE(review): the embedded line numbers jump (5189-5190, 5199, 5207 are
// absent), so the gParamEx declaration, the copy into the extended field, and
// the final return statement are missing from this extraction — the commented
// lines 5180/5205 suggest the return calls quda::plaquette; verify against
// the repository source before relying on this listing.
5176 double plaqCuda ()
5177 {
5178  cudaGaugeField *data = NULL;
5179  #ifndef MULTI_GPU
5180 // return quda::plaquette(*gaugePrecise, QUDA_CUDA_FIELD_LOCATION);
5181  data = gaugePrecise;
5182  #else
5183  if (extendedGaugeResident) {
5184  data = extendedGaugeResident;
5185  } else {
// Build the extended field: local extent + 4 in every dimension.
5186  int y[4];
5187  for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;
5188  int pad = 0;
// gParamEx mirrors the precise field's layout (declaration not visible here).
5191  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
5192  gParamEx.order = gaugePrecise->Order();
5193  gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
5194  gParamEx.t_boundary = gaugePrecise->TBoundary();
5195  gParamEx.nFace = 1;
5196 
5197  data = new cudaGaugeField(gParamEx);
5198 
5200  int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction
5201  data->exchangeExtendedGhost(R,true);
5202  extendedGaugeResident = data;
5203  cudaDeviceSynchronize();
5204  }
5205 // return quda::plaquette(*extendedGaugeResident, QUDA_CUDA_FIELD_LOCATION);
5206  #endif
5208 }
5209 
// Apply nSteps iterations of APE smearing with parameter alpha to the
// resident gauge field, accumulating the result in gaugeSmeared and printing
// the plaquette before and after.  Requires a previously loaded gauge field
// (gaugePrecise); errors out otherwise.  Multi-GPU builds operate on
// border-extended fields (extent + 4 per dimension, halo radius R = 2) and
// re-exchange ghost zones each iteration.
// NOTE(review): the embedded line numbers jump in several places (5224-5225,
// 5234, 5236, 5250-5255, 5257, 5261-5267, 5277, 5290 are absent), so the
// gParamEx/gParam declarations, the initial copy into gaugeSmeared, and the
// per-iteration extended-copy call are missing from this extraction — verify
// against the repository source before relying on this listing.
5210 void performAPEnStep(unsigned int nSteps, double alpha)
5211 {
5212  profileAPE.Start(QUDA_PROFILE_TOTAL);
5213 
5214  if (gaugePrecise == NULL) {
5215  errorQuda("Gauge field must be loaded");
5216  }
5217 
// Lazily build the cached extended copy of the precise field (multi-GPU only).
5218 #ifdef MULTI_GPU
5219  if (extendedGaugeResident == NULL)
5220  {
5221  int y[4];
5222  for(int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;
5223  int pad = 0;
5226  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
5227  gParamEx.order = gaugePrecise->Order();
5228  gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
5229  gParamEx.t_boundary = gaugePrecise->TBoundary();
5230  gParamEx.nFace = 1;
5231 
5232  extendedGaugeResident = new cudaGaugeField(gParamEx);
5233 
5235  int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction
5237  }
5238 #endif
5239 
// Extent of the working fields: extended in multi-GPU builds, local otherwise.
5240  int pad = 0;
5241  int y[4];
5242 
5243 #ifdef MULTI_GPU
5244  int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction
5245  for (int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir] + 4;
5246 #else
5247  for (int dir=0; dir<4; ++dir) y[dir] = gaugePrecise->X()[dir];
5248 #endif
5249 
5256  gParam.nFace = 1;
5258 
// First call: allocate gaugeSmeared (initialization lines not visible here).
5259  if (gaugeSmeared == NULL) {
5260 // gaugeSmeared = new cudaGaugeField(gParamEx);
5262  #ifdef MULTI_GPU
5265  #else
5267  #endif
5268  }
5269 
// Scratch field: each APE step reads the previous iterate from here.
5270  cudaGaugeField *cudaGaugeTemp = NULL;
5271  cudaGaugeTemp = new cudaGaugeField(gParam);
5272 
5273  printfQuda("Plaquette after 0 APE steps: %le\n", plaquette(*gaugeSmeared, QUDA_CUDA_FIELD_LOCATION));
5274 
5275  for (unsigned int i=0; i<nSteps; i++) {
5276  #ifdef MULTI_GPU
// Refresh halos of the scratch copy before smearing (copy line not visible).
5278  cudaGaugeTemp->exchangeExtendedGhost(R,true);
5279  APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha, QUDA_CUDA_FIELD_LOCATION);
5280 // gaugeSmeared->exchangeExtendedGhost(R,true); FIXME I'm not entirely sure whether I can remove this...
5281  #else
5282  cudaGaugeTemp->copy(*gaugeSmeared);
5283  APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha, QUDA_CUDA_FIELD_LOCATION);
5284  #endif
5285  }
5286 
5287  delete cudaGaugeTemp;
5288 
5289  #ifdef MULTI_GPU
5291  #endif
5292 
5293  printfQuda("Plaquette after %d APE steps: %le\n", nSteps, plaquette(*gaugeSmeared, QUDA_CUDA_FIELD_LOCATION));
5294 
5295  profileAPE.Stop(QUDA_PROFILE_TOTAL);
5296 }
5297 
5298 //#include"contractions.cpp" Contraction interface, to be added soon
void new_quda_invert_param_(QudaInvertParam *param)
QudaCloverFieldOrder order
Definition: clover_field.h:21
QudaGaugeParam gauge_param
Definition: dslash_test.cpp:37
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void destroyQudaGaugeField(void *gauge)
double secs
Definition: quda.h:183
QudaTboundary t_boundary
Definition: gauge_field.h:18
double Tadpole() const
Definition: gauge_field.h:171
QudaDiracFieldOrder dirac_order
Definition: quda.h:156
QudaMassNormalization mass_normalization
Definition: quda.h:146
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:134
void destroyStaggeredOprodEvents()
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
void comm_finalize(void)
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:27
double plaqCuda()
cpuGaugeField * cpuOprod
void freeCloverQuda(void)
void * createExtendedGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
int compute_gauge_force_quda_(void *mom, void *gauge, int *input_path_buf, int *path_length, double *loop_coeff, int *num_paths, int *max_length, double *dt, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
double b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:94
void invert_multishift_quda_(void *hp_x[QUDA_MAX_MULTI_SHIFT], void *hp_b, QudaInvertParam *param)
void computeKSOprodQuda(void *oprod, void *fermion, double coeff, int X[4], QudaPrecision prec)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
QudaFieldLocation clover_location
Definition: quda.h:160
void llfat_cuda(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
QudaSolveType solve_type
Definition: quda.h:143
void computeCloverTraceQuda(void *out, void *clov, int mu, int nu, int dim[4])
enum QudaPrecision_s QudaPrecision
void freeGaugeQuda(void)
void saveGaugeField(void *gauge, void *inGauge, QudaGaugeParam *param)
int commDimPartitioned(int dir)
int ga_pad
Definition: quda.h:53
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
int y[4]
void invertMDQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
int make_resident_mom
Definition: quda.h:72
virtual void StoreRitzVecs(void *host_buffer, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources=false)=0
double mu
Definition: quda.h:97
void * V(bool inverse=false)
Definition: clover_field.h:59
QudaTune tune
Definition: quda.h:185
int VolumeCB() const
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
QudaLinkType type
Definition: quda.h:35
void unitarizeForceCuda(cudaGaugeField &cudaOldForce, cudaGaugeField &cudaGauge, cudaGaugeField *cudaNewForce, int *unitarization_failed, long long *flops=NULL)
double kappa
Definition: quda.h:89
size_t Bytes() const
Definition: clover_field.h:67
void Clover(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void Print()
Definition: timer.cpp:6
void saveCPUField(cpuGaugeField &, const QudaFieldLocation &) const
#define errorQuda(...)
Definition: util_quda.h:73
QudaDslashType dslash_type
Definition: quda.h:85
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error, bool check_unitarization=true)
const int * X() const
QudaReconstructType reconstruct_precondition
Definition: quda.h:49
QudaFieldCreate create
Definition: clover_field.h:22
QudaInverterType inv_type
Definition: quda.h:86
Fortran interface functions.
void computeHISQForceCompleteQuda(void *const milc_momentum, const double level2_coeff[6], const double fat7_coeff[6], void **quark_array, int num_terms, double **quark_coeff, const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
QudaPrecision cuda_prec
Definition: quda.h:152
double c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:95
#define host_free(ptr)
Definition: malloc_quda.h:29
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaGaugeField *& gaugeFatExtended
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:38
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void CloverInv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
cudaColorSpinorField * tmp1
Definition: dslash_test.cpp:41
cudaGaugeField * gaugeLongPrecise
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
const int Nstream
double trlogA[2]
Definition: quda.h:172
void assertAllMemFree()
Definition: malloc.cpp:294
QudaPrecision precision
Definition: lattice_field.h:41
virtual void reconstruct(cudaColorSpinorField &x, const cudaColorSpinorField &b, const QudaSolutionType) const =0
QudaDagType dagger
Definition: quda.h:145
int test_type
Definition: test_util.cpp:1564
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
void free_clover_quda_(void)
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
double plaquette(const GaugeField &data, QudaFieldLocation location)
Definition: gauge_plaq.cu:242
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param)
virtual void CleanResources()=0
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
double true_res
Definition: quda.h:105
void computeFatLinkCore(cudaGaugeField *cudaSiteLink, double *act_path_coeff, QudaGaugeParam *qudaGaugeParam, QudaComputeFatMethod method, cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, TimeProfile &profile)
cpuGaugeField * cpuMom
int comm_gpuid(void)
Definition: comm_mpi.cpp:92
void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam *param)
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void llfat_init_cuda_ex(QudaGaugeParam *param_ex)
#define spinorSiteSize
#define Vsh_t
Definition: llfat_core.h:4
void setUnitarizeLinksPadding(int input_padding, int output_padding)
size_t GBytes() const
cudaColorSpinorField * solutionResident
int compute_clover_trlog
Definition: quda.h:171
QudaSiteSubset siteSubset
Definition: lattice_field.h:42
cudaGaugeField * cudaGauge
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:163
cudaGaugeField * gaugeLongExtended
QudaFieldLocation input_location
Definition: quda.h:82
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:140
void * longlink[4]
cudaGaugeField * gauge
Definition: dirac_quda.h:30
#define Vsh_z
Definition: llfat_core.h:3
cudaCloverField * cloverPrecondition
QudaUseInitGuess use_init_guess
Definition: quda.h:167
int Ls
Definition: test_util.cpp:40
void init_quda_(int *dev)
int getGaugePadding(GaugeFieldParam &param)
QudaGaugeParam param
Definition: pack_test.cpp:17
void openMagma()
int llfat_ga_pad
Definition: quda.h:58
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_mpi.cpp:31
cudaColorSpinorField & Odd() const
QudaSolutionType solution_type
Definition: quda.h:142
QudaSolverNormalization solver_normalization
Definition: quda.h:147
int numa_affinity_enabled
int E[4]
virtual void Dslash(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const =0
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:38
QudaPrecision clover_cuda_prec
Definition: quda.h:162
cudaCloverField * cloverInvPrecise
QudaPrecision Precision() const
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void initQuda(int dev)
void computeAsqtadForceQuda(void *const milc_momentum, long long *flops, const double act_path_coeff[6], const void *const one_link_src[4], const void *const naik_src[4], const void *const link, const QudaGaugeParam *gParam)
QudaInvertParam * invert_param
Definition: quda.h:261
void setTuning(QudaTune tune)
Definition: util_quda.cpp:33
cudaDeviceProp deviceProp
double spinorGiB
Definition: quda.h:180
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
cudaColorSpinorField * tmp
QudaFieldLocation output_location
Definition: quda.h:83
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:164
int site_ga_pad
Definition: quda.h:55
void apply_staggered_phase_quda_()
VOLATILE spinorFloat kappa
void Dagger(QudaDagType dag)
Definition: dirac_quda.h:140
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:113
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:194
double m5
Definition: quda.h:91
void llfat_cuda_ex(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
cpuGaugeField * cpuFatLink
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
cudaCloverField * cloverSloppy
QudaVerbosity verbosity
Definition: quda.h:174
void free_gauge_quda_()
static void flushPinnedCache()
double tol_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:131
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:42
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:137
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
QudaDagType dagger
Definition: dirac_quda.h:29
cpuColorSpinorField * in
cudaGaugeField *& gaugeFatPrecondition
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
void setPrecision(QudaPrecision precision)
Definition: clover_field.h:23
double gflops
Definition: quda.h:182
static Solver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
Definition: solver.cpp:12
void hisqCompleteForceCuda(const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
static void freeBuffer(int index=0)
QudaPrecision cuda_prec_precondition
Definition: quda.h:48
void free_sloppy_gauge_quda_()
int staple_pad
Definition: quda.h:57
QudaCloverFieldOrder clover_order
Definition: quda.h:166
void createDslashEvents()
Definition: dslash_quda.cu:108
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
int preserve_gauge
Definition: quda.h:62
void remove_staggered_phase_quda_()
void invert_md_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
QudaGaugeFieldOrder order
Definition: gauge_field.h:15
cudaCloverField * cloverInvPrecondition
virtual void MdagM(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
#define warningQuda(...)
Definition: util_quda.h:84
double true_res_hq
Definition: quda.h:106
void performAPEnStep(unsigned int nSteps, double alpha)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
Definition: dirac_quda.h:26
QudaDiracType type
Definition: dirac_quda.h:21
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235
__constant__ double coeff
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
int max_search_dim
Definition: quda.h:242
Dirac * dirac
Definition: dslash_test.cpp:45
void loadCPUField(const cpuGaugeField &, const QudaFieldLocation &)
static void CloseMagma()
Definition: blas_magma.cpp:58
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:28
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:110
void comm_set_gridsize_(int *grid)
double offset[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:128
void gauge_force_cuda(cudaGaugeField &cudaMom, double eb3, cudaGaugeField &cudaSiteLink, QudaGaugeParam *param, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
void initQudaMemory()
QudaSolutionType RitzMat_lanczos
Definition: quda.h:262
void init_quda_memory_()
cudaCloverField * cloverPrecise
void compute_staggered_force_quda_(void *cudaMom, void *qudaQuark, double *coeff)
enum QudaParity_s QudaParity
cpuGaugeField * cpuGauge
QudaReconstructType reconstruct
Definition: quda.h:43
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
QudaPrecision cuda_prec
Definition: quda.h:42
void extendGaugeField(void *out, void *in)
int X[4]
Definition: quda.h:29
cudaCloverField * cloverInv
Definition: dirac_quda.h:34
void computeStaggeredOprod(cudaGaugeField &out, cudaColorSpinorField &in, FaceBuffer &facebuffer, const unsigned int parity, const double coeff, const unsigned int displacement)
double mass
Definition: quda.h:88
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
static Eig_Solver * create(QudaEigParam &param, RitzMat &ritz_mat, TimeProfile &profile)
Definition: eig_solver.cpp:12
void unitarizeLinksCuda(const QudaGaugeParam &param, cudaGaugeField &infield, cudaGaugeField *outfield, int *num_failures)
void computeCloverSigmaTrace(GaugeField &gauge, const CloverField &clover, int dir1, int dir2, QudaFieldLocation location)
cudaGaugeField * cudaMom
void computeStaggeredForceQuda(void *cudaMom, void *qudaQuark, double coeff)
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void invertMultiShiftMDQuda(void **_hp_xe, void **_hp_xo, void **_hp_ye, void **_hp_yo, void *_hp_b, QudaInvertParam *param)
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
Definition: malloc_quda.h:25
void copy(const CloverField &src, bool inverse=true)
int dims[QUDA_MAX_DIM]
int x[4]
double shift
Shift term added onto operator (M^dag M + shift)
Definition: dirac_quda.h:643
void setMass(double mass)
Definition: dirac_quda.h:132
void pushVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:52
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
QudaGhostExchange ghostExchange
Definition: gauge_field.h:40
#define checkCudaErrorNoSync()
Definition: util_quda.h:94
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
void Dslash4pre(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha, QudaFieldLocation location)
Definition: gauge_ape.cu:497
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:162
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const staple_src[4], const void *const one_link_src[4], const void *const naik_src[4], const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h_u, double *inv_eigenvals, int last_rhs)
cudaGaugeField * cudaGauge_ex
QudaFieldLocation location
Definition: quda.h:27
QudaInvertParam inv_param
Definition: dslash_test.cpp:38
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
double gaugeGiB
Definition: quda.h:60
QudaPrecision cuda_prec_precondition
Definition: quda.h:154
cudaGaugeField * gaugePrecondition
void endBlas(void)
Definition: blas_quda.cu:59
void loadTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:131
GaugeFieldParam gParam
__constant__ int Vh_2d_max
if(x2 >=X2) return
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static DeflatedSolver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matCGSloppy, DiracMatrix &matDeflate, TimeProfile &profile)
Definition: solver.cpp:150
void * createGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
virtual void M(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0
cudaCloverField * clover
Definition: dirac_quda.h:33
QudaLinkType link_type
Definition: gauge_field.h:17
void set_kernel_pack_t_(int *pack)
QudaPrecision Precision() const
virtual void prepare(cudaColorSpinorField *&src, cudaColorSpinorField *&sol, cudaColorSpinorField &x, cudaColorSpinorField &b, const QudaSolutionType) const =0
void printPeakMemUsage()
Definition: malloc.cpp:286
double Last(QudaProfileType idx)
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void applyStaggeredPhase()
Definition: gauge_field.cpp:66
#define Vsh_y
Definition: llfat_core.h:2
int mom_ga_pad
Definition: quda.h:59
void freeSloppyGaugeQuda(void)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
int use_resident_gauge
Definition: quda.h:69
#define printfQuda(...)
Definition: util_quda.h:67
void new_quda_gauge_param_(QudaGaugeParam *param)
cudaGaugeField * fatGauge
Definition: dirac_quda.h:31
QudaTwistFlavorType twist_flavor
Definition: quda.h:100
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
cudaGaugeField * gaugeSmeared
#define MAX(a, b)
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
cudaStream_t * streams
int use_resident_mom
Definition: quda.h:70
QudaReconstructType reconstruct
Definition: gauge_field.h:14
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:82
void closeMagma()
QudaFieldCreate create
Definition: gauge_field.h:26
void Start(QudaProfileType idx)
void printLaunchTimer()
Definition: tune.cpp:437
cudaGaugeField *& gaugeFatSloppy
void copy(const GaugeField &)
QudaResidualType residual_type
Definition: quda.h:235
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:38
int num_offset
Definition: quda.h:123
cudaGaugeField * longGauge
Definition: dirac_quda.h:32
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void popVerbosity()
Definition: util_quda.cpp:63
#define Vsh_x
Definition: llfat_core.h:1
enum QudaVerbosity_s QudaVerbosity
double cloverGiB
Definition: quda.h:181
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void createCloverQuda(QudaInvertParam *invertParam)
void end_quda_()
void computeStaggeredOprodQuda(void **oprod, void **fermion, int num_terms, double **coeff, QudaGaugeParam *param)
double epsilon
Definition: quda.h:98
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
cudaCloverField * cloverInvSloppy
#define checkCudaError()
Definition: util_quda.h:110
QudaFieldGeometry geometry
Definition: gauge_field.h:28
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:44
cudaGaugeField * gaugePrecise
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:134
cudaGaugeField *& gaugeFatPrecise
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:177
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void llfat_init_cuda(QudaGaugeParam *param)
void Mdag(cudaColorSpinorField &out, const cudaColorSpinorField &in) const
Definition: dirac.cpp:68
int make_resident_gauge
Definition: quda.h:71
cudaGaugeField * cudaOprod
QudaPrecision prec
Definition: test_util.cpp:1551
void gauge_force_init_cuda(QudaGaugeParam *param, int max_length)
void axCuda(const double &a, cudaColorSpinorField &x)
Definition: blas_quda.cu:171
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:602
QudaDslashType dslash_type_precondition
Definition: quda.h:208
double norm2(const ColorSpinorField &)
QudaPrecision clover_cpu_prec
Definition: quda.h:161
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)
QudaTboundary TBoundary() const
Definition: gauge_field.h:173
enum QudaComputeFatMethod_s QudaComputeFatMethod
void computeCloverDerivativeQuda(void *out, void *gauge, void *oprod, int mu, int nu, double coeff, QudaParity parity, QudaGaugeParam *param, int conjugate)
void destroyDslashEvents()
Definition: dslash_quda.cu:129
double * TrLog() const
Definition: clover_field.h:64
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:144
void initBlas()
Definition: blas_quda.cu:53
int setNumaAffinity(int)
cudaGaugeField * momResident
cudaColorSpinorField & Even() const
double Anisotropy() const
Definition: gauge_field.h:170
static void OpenMagma()
Definition: blas_magma.cpp:39
double kappa5
Definition: dslash_test.cpp:32
void saveTuneCache(QudaVerbosity verbosity)
Definition: tune.cpp:205
QudaPrecision cpu_prec
Definition: quda.h:40
cudaGaugeField * gaugeSloppy
void createStaggeredOprodEvents()
void initQudaDevice(int dev)
void endQuda(void)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam, double *timeinfo)
int overlap
Definition: quda.h:67
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
QudaGaugeParam newQudaGaugeParam(void)
QudaPreserveSource preserve_source
Definition: quda.h:149
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
double clover_coeff
Definition: quda.h:169
void * fatlink[4]
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()
Definition: gauge_field.cpp:79