QUDA  0.9.0
interface_quda.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <string.h>
6 #include <sys/time.h>
7 
8 #include <quda.h>
9 #include <quda_fortran.h>
10 #include <quda_internal.h>
11 #include <comm_quda.h>
12 #include <tune_quda.h>
13 #include <blas_quda.h>
14 #include <gauge_field.h>
15 #include <dirac_quda.h>
16 #include <ritz_quda.h>
17 #include <dslash_quda.h>
18 #include <invert_quda.h>
19 #include <lanczos_quda.h>
20 #include <color_spinor_field.h>
21 #include <eig_variables.h>
22 #include <clover_field.h>
23 #include <llfat_quda.h>
24 #include <unitarization_links.h>
25 #include <algorithm>
26 #include <staggered_oprod.h>
27 #include <ks_improved_force.h>
28 #include <ks_force_quda.h>
29 #include <random_quda.h>
30 
31 #include <multigrid.h>
32 
33 #include <deflation.h>
34 
35 #ifdef NUMA_NVML
36 #include <numa_affinity.h>
37 #endif
38 
39 #include <cuda.h>
40 
41 #ifdef MULTI_GPU
42 extern void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
43  QudaPrecision gPrecision, int optflag, int geom);
44 #endif // MULTI_GPU
45 
46 #include <ks_force_quda.h>
47 
48 #ifdef GPU_GAUGE_FORCE
49 #include <gauge_force_quda.h>
50 #endif
51 #include <gauge_update_quda.h>
52 
53 #define MAX(a,b) ((a)>(b)? (a):(b))
54 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
55 
56 #define spinorSiteSize 24 // real numbers per spinor
57 
58 #define MAX_GPU_NUM_PER_NODE 16
59 
60 // define newQudaGaugeParam() and newQudaInvertParam()
61 #define INIT_PARAM
62 #include "check_params.h"
63 #undef INIT_PARAM
64 
65 // define (static) checkGaugeParam() and checkInvertParam()
66 #define CHECK_PARAM
67 #include "check_params.h"
68 #undef CHECK_PARAM
69 
70 // define printQudaGaugeParam() and printQudaInvertParam()
71 #define PRINT_PARAM
72 #include "check_params.h"
73 #undef PRINT_PARAM
74 
75 #include <gauge_tools.h>
76 #include <contractQuda.h>
77 
78 #include <momentum.h>
79 
80 
81 using namespace quda;
82 
// Halo (border) widths for extended gauge fields; recomputed from the
// process-grid partitioning via updateR() below.
83 static int R[4] = {0, 0, 0, 0};
84 // setting this to false prevents redundant halo exchange but isn't yet compatible with HISQ / ASQTAD kernels
85 static bool redundant_comms = false;
86 
87 //for MAGMA lib:
88 #include <blas_magma.h>
89 
// Tracks whether OpenMagma() has been called (see openMagma()/closeMagma() below).
90 static bool InitMagma = false;
91 
92 void openMagma() {
93 
94  if (!InitMagma) {
95  OpenMagma();
96  InitMagma = true;
97  } else {
98  printfQuda("\nMAGMA library was already initialized..\n");
99  }
100 
101 }
102 
103 void closeMagma(){
104 
105  if (InitMagma) {
106  CloseMagma();
107  InitMagma = false;
108  } else {
109  printfQuda("\nMAGMA library was not initialized..\n");
110  }
111 
112  return;
113 }
114 
119 
120 // It's important that these alias the above so that constants are set correctly in Dirac::Dirac()
125 
126 
131 
133 
137 
140 
// Solver solutions kept resident on the device between interface calls
// (presumably reused by invertQuda/HMC code paths -- the consumers are
// outside this listing, confirm against them).
141 std::vector<cudaColorSpinorField*> solutionResident;
142 
143 // vector of spinors used for forecasting solutions in HMC
144 #define QUDA_MAX_CHRONO 2
145 // each entry is a pair for both p and Ap storage
146 std::vector< std::vector< std::pair<ColorSpinorField*,ColorSpinorField*> > > chronoResident(QUDA_MAX_CHRONO);
147 
148 // Mapped memory buffer used to hold unitarization failures
// (num_failures_h is the host pointer; num_failures_d is the matching device
// alias obtained via cudaHostGetDevicePointer in initQudaMemory)
149 static int *num_failures_h = NULL;
150 static int *num_failures_d = NULL;
151 
// Properties of the CUDA device selected in initQudaDevice().
152 cudaDeviceProp deviceProp;
// Stream pool allocated in initQudaMemory() (Nstream entries).
153 cudaStream_t *streams;
154 #ifdef PTHREADS
155 pthread_mutex_t pthread_mutex;
156 #endif
157 
// Set once by initQudaDevice() to guard against double initialization.
158 static bool initialized = false;
// Per-call time profiles; each label names the interface function it times.
161 static TimeProfile profileInit("initQuda");
162 
164 static TimeProfile profileGauge("loadGaugeQuda");
165 
167 static TimeProfile profileClover("loadCloverQuda");
168 
170 static TimeProfile profileDslash("dslashQuda");
171 
173 static TimeProfile profileInvert("invertQuda");
174 
176 static TimeProfile profileMulti("invertMultiShiftQuda");
177 
179 static TimeProfile profileFatLink("computeKSLinkQuda");
180 
182 static TimeProfile profileGaugeForce("computeGaugeForceQuda");
183 
185 static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda");
186 
188 static TimeProfile profileExtendedGauge("createExtendedGaugeField");
189 
191 static TimeProfile profileCloverForce("computeCloverForceQuda");
192 
194 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
195 
197 static TimeProfile profileHISQForce("computeHISQForceQuda");
198 
200 static TimeProfile profilePlaq("plaqQuda");
201 
203 static TimeProfile profileWuppertal("wuppertalQuda");
204 
206 static TimeProfile profileGauss("gaussQuda");
207 
209 static TimeProfile profileQCharge("qChargeQuda");
210 
212 static TimeProfile profileAPE("APEQuda");
213 
215 static TimeProfile profileSTOUT("STOUTQuda");
216 
218 static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda");
219 
221 static TimeProfile profileProject("projectSU3Quda");
222 
224 static TimeProfile profilePhase("staggeredPhaseQuda");
225 
227 static TimeProfile profileContract("contractQuda");
228 
230 static TimeProfile profileCovDev("covDevQuda");
231 
233 static TimeProfile profileMomAction("momActionQuda");
234 
236 static TimeProfile profileEnd("endQuda");
237 
239 static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda");
240 static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda");
241 
242 
243 
// Aggregate profile spanning the whole initQuda()..endQuda() lifetime.
245 static TimeProfile profileInit2End("initQuda-endQuda",false);
247 namespace quda {
// forward declaration only; the implementation lives elsewhere in the library
248  void printLaunchTimer();
249 }
250 
// Set QUDA's global verbosity, output prefix, and output file.
// NOTE(review): the 'verbosity' argument is not used in the lines visible
// here; the listing skips source line 253, which presumably carried the
// setVerbosity(verbosity) call -- confirm against the upstream source.
251 void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
252 {
254  setOutputPrefix(prefix);
255  setOutputFile(outfile);
256 }
257 
258 
// User-data payload for lex_rank_from_coords(): describes the process grid.
259 typedef struct {
260  int ndim; // number of grid dimensions (always 4 in practice; see the check in initCommsGridQuda)
261  int dims[QUDA_MAX_DIM]; // extent of the process grid in each dimension
262 } LexMapData;
263 
267 static int lex_rank_from_coords(const int *coords, void *fdata)
268 {
269  LexMapData *md = static_cast<LexMapData *>(fdata);
270 
271  int rank = coords[0];
272  for (int i = 1; i < md->ndim; i++) {
273  rank = md->dims[i] * rank + coords[i];
274  }
275  return rank;
276 }
277 
278 #ifdef QMP_COMMS
279 
// QudaCommsMap callback that defers the coordinate-to-rank lookup to QMP's
// declared logical topology; the fdata argument is unused here.
282 static int qmp_rank_from_coords(const int *coords, void *fdata)
283 {
284  return QMP_get_node_number_from(coords);
285 }
286 #endif
287 
288 
// Set once comm_init() has been called (checked by initQudaDevice()).
289 static bool comms_initialized = false;
290 
// Declare the 4-d communications grid. If no rank-mapping function is
// supplied, fall back to QMP's declared topology (when available) or to a
// default lexicographical ordering.
// NOTE(review): the listing skips source line 322, which presumably set
// func = lex_rank_from_coords in the default path -- without it comm_init()
// would receive a null func here; confirm against the upstream source.
291 void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
292 {
293  if (nDim != 4) {
294  errorQuda("Number of communication grid dimensions must be 4");
295  }
296 
297  LexMapData map_data;
298  if (!func) {
299 
300 #if QMP_COMMS
301  if (QMP_logical_topology_is_declared()) {
302  if (QMP_get_logical_number_of_dimensions() != 4) {
303  errorQuda("QMP logical topology must have 4 dimensions");
304  }
305  for (int i=0; i<nDim; i++) {
306  int qdim = QMP_get_logical_dimensions()[i];
307  if(qdim != dims[i]) {
308  errorQuda("QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
309  }
310  }
311  fdata = NULL;
312  func = qmp_rank_from_coords;
313  } else {
314  warningQuda("QMP logical topology is undeclared; using default lexicographical ordering");
315 #endif
316 
317  map_data.ndim = nDim;
318  for (int i=0; i<nDim; i++) {
319  map_data.dims[i] = dims[i];
320  }
321  fdata = (void *) &map_data;
323 
324 #if QMP_COMMS
325  }
326 #endif
327 
328  }
329  comm_init(nDim, dims, func, fdata);
330  comms_initialized = true;
331 }
332 
333 
// Initialize the comms grid when the user never called initCommsGridQuda():
// QMP builds -> use the declared QMP topology (error if undeclared);
// MPI builds -> always an error (explicit init is required);
// single-GPU builds -> trivial 1x1x1x1 grid.
334 static void init_default_comms()
335 {
336 #if defined(QMP_COMMS)
337  if (QMP_logical_topology_is_declared()) {
338  int ndim = QMP_get_logical_number_of_dimensions();
339  const int *dims = QMP_get_logical_dimensions();
340  initCommsGridQuda(ndim, dims, NULL, NULL);
341  } else {
342  errorQuda("initQuda() called without prior call to initCommsGridQuda(),"
343  " and QMP logical topology has not been declared");
344  }
345 #elif defined(MPI_COMMS)
346  errorQuda("When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
347 #else // single-GPU
348  const int dims[4] = {1, 1, 1, 1};
349  initCommsGridQuda(4, dims, NULL, NULL);
350 #endif
351 }
352 
353 
// Two-level stringization helpers (expand the macro argument, then quote it).
354 #define STR_(x) #x
355 #define STR(x) STR_(x)
// NOTE(review): the listing skips source line 356, which presumably used
// STR(...) to build the quda_version string referenced in initQudaDevice()
// -- confirm against the upstream source.
357 #undef STR
358 #undef STR_
359 
// Git revision string, generated at build time (used when GITVERSION is set).
360 extern char* gitversion;
361 
362 /*
363  * Set the device that QUDA uses. Selects the CUDA device (from comms rank
364  * if dev < 0 under MULTI_GPU), validates the device's compute capability
365  * against the capability QUDA was built for, and applies NUMA affinity and
366  * cache-configuration settings. Safe to call only once; later calls return
367  * immediately.
368  */
365 void initQudaDevice(int dev) {
366 
367  //static bool initialized = false;
368  if (initialized) return;
369  initialized = true;
// NOTE(review): the listing skips source lines 370-373 here (presumably
// profiling start and the quda_version declaration) -- confirm upstream.
374 
375  if (getVerbosity() >= QUDA_SUMMARIZE) {
376 #ifdef GITVERSION
377  printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
378 #else
379  printfQuda("QUDA %s\n",quda_version.c_str());
380 #endif
381  }
382 
383 #if defined(MULTI_GPU) && (CUDA_VERSION == 4000)
384  //check if CUDA_NIC_INTEROP is set to 1 in the environment
385  // not needed for CUDA >= 4.1
386  char* cni_str = getenv("CUDA_NIC_INTEROP");
387  if(cni_str == NULL){
388  errorQuda("Environment variable CUDA_NIC_INTEROP is not set");
389  }
390  int cni_int = atoi(cni_str);
391  if (cni_int != 1){
392  errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1");
393  }
394 #endif
395 
396  int deviceCount;
397  cudaGetDeviceCount(&deviceCount);
398  if (deviceCount == 0) {
399  errorQuda("No CUDA devices found");
400  }
401 
// enumerate all visible devices (also leaves deviceProp holding the last one
// until the selected device is queried below)
402  for(int i=0; i<deviceCount; i++) {
403  cudaGetDeviceProperties(&deviceProp, i);
404  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
405  if (getVerbosity() >= QUDA_SUMMARIZE) {
406  printfQuda("Found device %d: %s\n", i, deviceProp.name);
407  }
408  }
409 
410 #ifdef MULTI_GPU
// a negative ordinal means "derive the device from the comms rank"
411  if (dev < 0) {
412  if (!comms_initialized) {
413  errorQuda("initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
414  }
415  dev = comm_gpuid();
416  }
417 #else
418  if (dev < 0 || dev >= 16) errorQuda("Invalid device number %d", dev);
419 #endif
420 
421  cudaGetDeviceProperties(&deviceProp, dev);
422  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
423  if (deviceProp.major < 1) {
424  errorQuda("Device %d does not support CUDA", dev);
425  }
426 
427 
428 // Check GPU and QUDA build compatibility
429 // 4 cases:
430 // a) QUDA and GPU match: great
431 // b) QUDA built for higher compute capability: error
432 // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
433 // d) QUDA built for same major compute capability but lower minor: warn
434 
435  const int my_major = __COMPUTE_CAPABILITY__ / 100;
436  const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
437 // b) QUDA was compiled for a higher compute capability
438  if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
439  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
440 
441 
442 // c) QUDA was compiled for a lower compute capability
443  if (deviceProp.major < my_major) {
444  char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
445  if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
446  if (getVerbosity() > QUDA_SILENT) warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
447  } else {
448  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.", deviceProp.major, deviceProp.minor, my_major, my_minor);
449  }
450  }
451 // d) QUDA built for same major compute capability but lower minor
452  if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
453  warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n", deviceProp.major, deviceProp.minor, my_major, my_minor);
454  }
455 
456  if (getVerbosity() >= QUDA_SUMMARIZE) {
457  printfQuda("Using device %d: %s\n", dev, deviceProp.name);
458  }
459 #ifndef USE_QDPJIT
460  cudaSetDevice(dev);
461  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
462 #endif
463 
464 
465 #if ((CUDA_VERSION >= 6000) && defined NUMA_NVML)
// NUMA affinity is applied unless explicitly disabled via QUDA_ENABLE_NUMA=0
466  char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
467  if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
468  if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
469  }
470  else{
471  setNumaAffinityNVML(dev);
472  }
473 #endif
474 
475 
476 
477  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
478  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
479  // cudaGetDeviceProperties(&deviceProp, dev);
480 
481  { // determine if we will do CPU or GPU data reordering (default is GPU)
482  char *reorder_str = getenv("QUDA_REORDER_LOCATION");
483 
484  if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) {
485  warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
487  } else {
488  warningQuda("Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
490  }
491  }
492 
495 }
496 
497 /*
498  * Any persistent memory allocations that QUDA uses are done here:
499  * the stream pool, the blas and pool allocators, the mapped
 *  unitarization-failure counter, the tune cache, and the extended-field
 *  border widths R. NOTE(review): the function signature (source line 500,
 *  initQudaMemory -- see the call in initQuda() below) is elided in this
 *  listing.
 */
501 {
504 
506 
507  streams = new cudaStream_t[Nstream];
508 
509 #if (CUDA_VERSION >= 5050)
// give all but the last stream the greatest priority
510  int greatestPriority;
511  int leastPriority;
512  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
513  for (int i=0; i<Nstream-1; i++) {
514  cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
515  }
516  cudaStreamCreateWithPriority(&streams[Nstream-1], cudaStreamDefault, leastPriority);
517 #else
518  for (int i=0; i<Nstream; i++) {
519  cudaStreamCreate(&streams[i]);
520  }
521 #endif
522 
523  checkCudaError();
525  blas::init();
526 
527  // initialize the memory pool allocators
528  pool::init();
529 
// mapped (zero-copy) failure counter: host pointer plus its device alias
530  num_failures_h = static_cast<int*>(mapped_malloc(sizeof(int)));
531  cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);
532 
533  loadTuneCache();
534 
// width-2 halo in every partitioned dimension (or all, if redundant_comms)
535  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
536 
539 }
540 
541 void updateR()
542 {
543  for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
544 }
545 
// Top-level initialization entry point: selects the device, performs the
// persistent memory allocations, and (with PTHREADS) creates the global
// recursive mutex.
546 void initQuda(int dev)
547 {
548  // initialize communications topology, if not already done explicitly via initCommsGridQuda()
// NOTE(review): the listing skips source line 549, which presumably carried
// the guarded init_default_comms() call described by the comment above --
// confirm against the upstream source.
550 
551  // set the device that QUDA uses
552  initQudaDevice(dev);
553 
554  // set the persistent memory allocations that QUDA uses (Blas, streams, etc.)
555  initQudaMemory();
556 
557 #ifdef PTHREADS
// recursive mutex so QUDA interface calls may re-enter while holding the lock
558  pthread_mutexattr_t mutex_attr;
559  pthread_mutexattr_init(&mutex_attr);
560  pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
561  pthread_mutex_init(&pthread_mutex, &mutex_attr);
562 #endif
563 }
564 
565 // helper for creating extended gauge fields: allocates a device field
// enlarged by 2*R[d] sites in each direction with extended ghost exchange,
// copies the input field in, and fills the halo regions.
// NOTE(review): the signature (source lines 566-567) is elided in this
// listing; from the call site in loadGaugeQuda below it is
// createExtendedGauge(const cudaGaugeField &in, const int *R,
// TimeProfile &profile, ... recon) -- confirm upstream.
568 {
569  profile.TPSTART(QUDA_PROFILE_INIT);
570  int y[4];
571  for (int dir=0; dir<4; ++dir) y[dir] = in.X()[dir] + 2*R[dir];
572  int pad = 0;
573 
574  GaugeFieldParam gParamEx(y, in.Precision(), recon != QUDA_RECONSTRUCT_INVALID ? recon : in.Reconstruct(), pad,
575  in.Geometry(), QUDA_GHOST_EXCHANGE_EXTENDED);
576  gParamEx.create = QUDA_ZERO_FIELD_CREATE;
577  gParamEx.order = in.Order();
579  gParamEx.t_boundary = in.TBoundary();
580  gParamEx.nFace = 1;
581  gParamEx.tadpole = in.Tadpole();
582  for (int d=0; d<4; d++) gParamEx.r[d] = R[d];
583 
584  cudaGaugeField *out = new cudaGaugeField(gParamEx);
585 
586  // copy input field into the extended device gauge field
// NOTE(review): the copy itself (source line 587) is elided in this listing.
588 
589  profile.TPSTOP(QUDA_PROFILE_INIT);
590 
591  // now fill up the halos
592  out->exchangeExtendedGhost(R,profile,redundant_comms);
593 
594  return out;
595 }
596 
597 // This is a flag used to signal when we have downloaded new gauge
598 // field. Set by loadGaugeQuda and consumed by loadCloverQuda as one
599 // possible flag to indicate we need to recompute the clover field
600 static bool invalidate_clover = true;
601 
// Download a host gauge field and create the resident precise/sloppy/
// preconditioner (and optionally extended) device copies for the given
// link type. NOTE(review): this listing elides a number of source lines
// (e.g. 644-656 in the free switch, 669-676 around the precise-field
// parameter setup) -- the code below is reproduced as extracted; consult
// the upstream source before modifying.
602 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
603 {
605 
606  if (!initialized) errorQuda("QUDA not initialized");
608 
609  checkGaugeParam(param);
610 
612  // Set the specific input parameters and create the cpu gauge field
613  GaugeFieldParam gauge_param(h_gauge, *param);
614 
615  // if we are using half precision then we need to compute the fat
616  // link maximum while still on the cpu
617  // FIXME get a kernel for this
619  gauge_param.compute_fat_link_max = true;
620 
621  if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
623  static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
624  static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
625 
// BQCD-order fields carry a checksum: skip the (re)load entirely when the
// field content is unchanged since the previous call
626  if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
627  static size_t checksum = SIZE_MAX;
628  size_t in_checksum = in->checksum(true);
629  if (in_checksum == checksum) {
630  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
633  delete in;
634  invalidate_clover = false;
635  return;
636  }
637  checksum = in_checksum;
638  invalidate_clover = true;
639  }
640 
641  // free any current gauge field before new allocations to reduce memory overhead
642  switch (param->type) {
643  case QUDA_WILSON_LINKS:
647  break;
652  break;
657  break;
658  case QUDA_SMEARED_LINKS:
659  if (gaugeSmeared) delete gaugeSmeared;
660  break;
661  default:
662  errorQuda("Invalid gauge type %d", param->type);
663  }
664 
665  // if not preserving then copy the gauge field passed in
666  cudaGaugeField *precise = NULL;
667 
668  // switch the parameters for creating the mirror precise cuda gauge field
670  gauge_param.precision = param->cuda_prec;
672  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
673  gauge_param.pad = param->ga_pad;
674  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
677 
678  precise = new cudaGaugeField(gauge_param);
679 
680  if (param->use_resident_gauge) {
681  if(gaugePrecise == NULL) errorQuda("No resident gauge field");
682  // copy rather than point at to ensure that the padded region is filled in
683  precise->copy(*gaugePrecise);
684  precise->exchangeGhost();
685  delete gaugePrecise;
686  gaugePrecise = NULL;
688  } else {
691  precise->copy(*in);
693  }
694 
695  param->gaugeGiB += precise->GBytes();
696 
697  // for gaugeSmeared we are interested only in the precise version
698  if (param->type == QUDA_SMEARED_LINKS) {
700 
702  delete precise;
703  delete in;
705 
707  return;
708  }
709 
710  // creating sloppy fields isn't really compute, but it is work done on the gpu
712 
713  // switch the parameters for creating the mirror sloppy cuda gauge field
714  gauge_param.precision = param->cuda_prec_sloppy;
716  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
719  cudaGaugeField *sloppy = NULL;
722  sloppy = new cudaGaugeField(gauge_param);
723  sloppy->copy(*precise);
724  param->gaugeGiB += sloppy->GBytes();
725  } else {
// sloppy precision matches precise: alias rather than duplicate
726  sloppy = precise;
727  }
728 
729  // switch the parameters for creating the mirror preconditioner cuda gauge field
732  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
735  cudaGaugeField *precondition = NULL;
738  precondition = new cudaGaugeField(gauge_param);
739  precondition->copy(*sloppy);
740  param->gaugeGiB += precondition->GBytes();
741  } else {
// preconditioner precision matches sloppy: alias rather than duplicate
742  precondition = sloppy;
743  }
744 
746 
747  // create an extended preconditioning field
748  cudaGaugeField* extended = nullptr;
749  if (param->overlap){
750  int R[4]; // domain-overlap widths in different directions
751  for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
752  extended = createExtendedGauge(*precondition, R, profileGauge);
753  }
754 
// publish the new fields into the resident globals for the given link type
755  switch (param->type) {
756  case QUDA_WILSON_LINKS:
757  gaugePrecise = precise;
758  gaugeSloppy = sloppy;
759  gaugePrecondition = precondition;
760 
761  if(param->overlap) gaugeExtended = extended;
762  break;
764  gaugeFatPrecise = precise;
765  gaugeFatSloppy = sloppy;
766  gaugeFatPrecondition = precondition;
767 
768  if(param->overlap){
769  if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated");
770  gaugeFatExtended = extended;
771  }
772  break;
774  gaugeLongPrecise = precise;
775  gaugeLongSloppy = sloppy;
776  gaugeLongPrecondition = precondition;
777 
778  if(param->overlap){
779  if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated");
780  gaugeLongExtended = extended;
781  }
782  break;
783  default:
784  errorQuda("Invalid gauge type %d", param->type);
785  }
786 
788  delete in;
790 
791  if (extendedGaugeResident) {
792  // updated the resident gauge field if needed
793  const int *R_ = extendedGaugeResident->R();
794  const int R[] = { R_[0], R_[1], R_[2], R_[3] };
796  delete extendedGaugeResident;
798 
799  }
800 
802 }
803 
// Copy a resident device gauge field back into the host application's
// buffer (cpu output location only). For QUDA_SMEARED_LINKS a temporary
// padded device copy is created and freed after the save.
// NOTE(review): this listing elides several source lines (e.g. 820-826 in
// the type switch and 843-846 around the actual save) -- reproduced as
// extracted; consult the upstream source before modifying.
804 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
805 {
807 
809  errorQuda("Non-cpu output location not yet supported");
810 
811  if (!initialized) errorQuda("QUDA not initialized");
812  checkGaugeParam(param);
813 
814  // Set the specific cpu parameters and create the cpu gauge field
815  GaugeFieldParam gauge_param(h_gauge, *param);
817  cudaGaugeField *cudaGauge = NULL;
818  switch (param->type) {
819  case QUDA_WILSON_LINKS:
821  break;
824  break;
827  break;
828  case QUDA_SMEARED_LINKS:
830  gauge_param.precision = param->cuda_prec;
832  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
833  gauge_param.pad = param->ga_pad;
834  gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION ||
839  break;
840  default:
841  errorQuda("Invalid gauge type");
842  }
843 
847 
848  if(param->type == QUDA_SMEARED_LINKS) {
849  delete cudaGauge;
850  }
851 
853 }
854 
855 
857 void freeSloppyCloverQuda();
858 
// Download (or compute on the device) the clover term and, as needed, its
// inverse; optionally copies the results back to the host when
// inv_param->return_clover / return_clover_inverse are set. With neither
// host pointer supplied, or with compute_clover set, the field is computed
// on the device (requires an isotropic resident gauge field).
// NOTE(review): this listing elides a number of source lines (e.g. 860-866
// in the prologue, 952-953, 960-966 around the copy/compute branches) --
// reproduced as extracted; consult the upstream source before modifying.
859 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
860 {
863  bool device_calc = false; // calculate clover and inverse on the device?
864 
867 
868  if (!initialized) errorQuda("QUDA not initialized");
869 
870  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
871  device_calc = true;
872  if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set");
873  if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
874  }
875 
876  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU");
877  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover");
879  errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
880  }
881 
882  // determines whether operator is preconditioned when calling invertQuda()
883  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
886 
887  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
888  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
890 
891  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
893 
894  // uninverted clover term is required when applying unpreconditioned operator,
895  // but note that dslashQuda() is always preconditioned
896  if (!h_clover && !pc_solve && !pc_solution) {
897  //warningQuda("Uninverted clover term not loaded");
898  }
899 
900  // uninverted clover term is also required for "asymmetric" preconditioning
901  if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
902  warningQuda("Uninverted clover term not loaded");
903  }
904 
905  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
906 #ifdef DYNAMIC_CLOVER
907  bool dynamic_clover = twisted ? true : false; // dynamic clover only supported on twisted clover currently
908 #else
909  bool dynamic_clover = false;
910 #endif
911 
912  CloverFieldParam clover_param;
913  clover_param.nDim = 4;
914  clover_param.csw = inv_param->clover_coeff;
915  clover_param.twisted = twisted;
916  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
917  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
918  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
919  clover_param.pad = inv_param->cl_pad;
920  clover_param.create = QUDA_NULL_FIELD_CREATE;
921  clover_param.norm = nullptr;
922  clover_param.invNorm = nullptr;
923  clover_param.setPrecision(inv_param->clover_cuda_prec);
924  clover_param.direct = h_clover || device_calc ? true : false;
925  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false;
926  CloverField *in = nullptr;
928 
929  // FIXME do we need to make this more robust to changing other meta data (compare cloverPrecise against clover_param)
930  bool clover_update = false;
931  double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
932  if (!cloverPrecise || invalidate_clover || inv_param->clover_coeff != csw_old) clover_update = true;
933 
934  // compute or download clover field only if gauge field has been updated or clover field doesn't exist
935  if (clover_update) {
936  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
938  if (cloverPrecise) delete cloverPrecise;
939 
941  cloverPrecise = new cudaCloverField(clover_param);
942 
943  if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
944  // create a param for the cpu clover field
945  CloverFieldParam inParam(clover_param);
947  inParam.order = inv_param->clover_order;
948  inParam.direct = h_clover ? true : false;
949  inParam.inverse = h_clovinv ? true : false;
950  inParam.clover = h_clover;
951  inParam.cloverInv = h_clovinv;
954  static_cast<CloverField*>(new cpuCloverField(inParam)) :
955  static_cast<CloverField*>(new cudaCloverField(inParam));
956  }
958 
959  if (!device_calc) {
961  cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false);
963  } else {
967  }
968 
969  // inverted clover term is required when applying preconditioned operator
970  if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
972  if (!dynamic_clover) {
975  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
976  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
977  }
978  }
980  }
981  } else {
982  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
983  }
984 
986 
987  clover_param.direct = true;
988  clover_param.inverse = dynamic_clover ? false : true;
989 
991 
993 
994  // if requested, copy back the clover / inverse field
996  if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");
997 
998  // copy the inverted clover term into host application order on the device
999  clover_param.setPrecision(inv_param->clover_cpu_prec);
1000  clover_param.direct = (h_clover && inv_param->return_clover);
1001  clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);
1002 
1003  // this isn't really "epilogue" but this label suffices
1005  cudaCloverField *hack = nullptr;
1006  if (!dynamic_clover) {
1007  clover_param.order = inv_param->clover_order;
1008  hack = new cudaCloverField(clover_param);
1009  hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1010  } else {
// dynamic clover: reconstruct the explicit inverse via a staging field
1011  cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack
1012  hackOfTheHack->copy(*cloverPrecise, false);
1015  inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
1016  inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
1017  }
1018  clover_param.order = inv_param->clover_order;
1019  hack = new cudaCloverField(clover_param);
1020  hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse
1021  delete hackOfTheHack;
1022  }
1024 
1025  // copy the field into the host application's clover field
1027  if (inv_param->return_clover) {
1028  qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
1029  }
1031  qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
1032  }
1033 
1035 
1036  delete hack;
1037  checkCudaError();
1038  }
1039 
1041  if (in) delete in; // delete object referencing input field
1043 
1044  popVerbosity();
1045 
1047 }
1048 
1049 void freeSloppyCloverQuda(void);
1050 
// Create the sloppy and preconditioner mirrors of the resident precise
// clover field, aliasing the next-higher-precision field when the requested
// precision matches it.
// NOTE(review): the function signature (source line 1051) is elided in this
// listing; from the body it takes the sloppy and preconditioner precisions
// (prec_sloppy, prec_precondition) -- confirm against the upstream source.
// The alias assignments in the else branches (lines 1073, 1084) are also
// elided.
1052 {
1054 
1055  if (cloverPrecise) {
1056  // create the mirror sloppy clover field
1057  CloverFieldParam clover_param(*cloverPrecise);
1058 
1059  clover_param.setPrecision(prec_sloppy);
1060 
// direct term is only present when V(false) and V(true) are distinct buffers
1061  if (cloverPrecise->V(false) != cloverPrecise->V(true)) {
1062  clover_param.direct = true;
1063  clover_param.inverse = true;
1064  } else {
1065  clover_param.direct = false;
1066  clover_param.inverse = true;
1067  }
1068 
1069  if (clover_param.precision != cloverPrecise->Precision()) {
1070  cloverSloppy = new cudaCloverField(clover_param);
1071  cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
1072  } else {
1074  }
1075 
1076  // switch the parameters for creating the mirror preconditioner clover field
1077  clover_param.setPrecision(prec_precondition);
1078 
1079  // create the mirror preconditioner clover field
1080  if (clover_param.precision != cloverSloppy->Precision()) {
1081  cloverPrecondition = new cudaCloverField(clover_param);
1082  cloverPrecondition->copy(*cloverSloppy, clover_param.inverse);
1083  } else {
1085  }
1086  }
1087 
1088 }
1089 
// Free every resident device gauge field (Wilson, long, fat, smeared, and
// the extended resident field) and null the corresponding globals. Fields
// that alias each other are guarded against double deletion.
// NOTE(review): this listing elides some deletion lines (e.g. 1093,
// 1103-1106, 1113-1114, apparently the precondition/sloppy/extended deletes
// for the long and fat families) -- confirm against the upstream source.
1090 void freeGaugeQuda(void)
1091 {
1092  if (!initialized) errorQuda("QUDA not initialized");
1094  if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
1095  if (gaugePrecise) delete gaugePrecise;
1096  if (gaugeExtended) delete gaugeExtended;
1097 
1098  gaugePrecondition = NULL;
1099  gaugeSloppy = NULL;
1100  gaugePrecise = NULL;
1101  gaugeExtended = NULL;
1102 
1105  if (gaugeLongPrecise) delete gaugeLongPrecise;
1107 
1108  gaugeLongPrecondition = NULL;
1109  gaugeLongSloppy = NULL;
1110  gaugeLongPrecise = NULL;
1111  gaugeLongExtended = NULL;
1112 
1115  if (gaugeFatPrecise) delete gaugeFatPrecise;
1116 
1117  gaugeFatPrecondition = NULL;
1118  gaugeFatSloppy = NULL;
1119  gaugeFatPrecise = NULL;
1120  gaugeFatExtended = NULL;
1121 
1122  if (gaugeSmeared) delete gaugeSmeared;
1123 
1124  gaugeSmeared = NULL;
1125  // Need to merge extendedGaugeResident and gaugeFatPrecise/gaugePrecise
1126  if (extendedGaugeResident) {
1127  delete extendedGaugeResident;
1128  extendedGaugeResident = NULL;
1129  }
1130 }
1131 
1132 // just free the sloppy fields used in mixed-precision solvers
// NOTE(review): the function signature (source line 1133, presumably
// freeSloppyGaugeQuda) and the long/fat sloppy deletion lines (1142-1143,
// 1148-1149) are elided in this listing -- confirm upstream.
1134 {
1135  if (!initialized) errorQuda("QUDA not initialized");
// delete only when the sloppy field does not alias the precise field
1137  if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
1138 
1139  gaugePrecondition = NULL;
1140  gaugeSloppy = NULL;
1141 
1144 
1145  gaugeLongPrecondition = NULL;
1146  gaugeLongSloppy = NULL;
1147 
1150 
1151  gaugeFatPrecondition = NULL;
1152  gaugeFatSloppy = NULL;
1153 }
1154 
1155 
1157 {
1158  // first do SU3 links (if they exist)
1159  if (gaugePrecise) {
1161  gauge_param.setPrecision(prec_sloppy);
1162  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1163 
1164  if (gaugeSloppy) errorQuda("gaugeSloppy already exists");
1165 
1166  if (gauge_param.precision != gaugePrecise->Precision() ||
1170  } else {
1172  }
1173 
1174  // switch the parameters for creating the mirror preconditioner cuda gauge field
1175  gauge_param.setPrecision(prec_precondition);
1176  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1177 
1178  if (gaugePrecondition) errorQuda("gaugePrecondition already exists");
1179 
1180  if (gauge_param.precision != gaugeSloppy->Precision() ||
1184  } else {
1186  }
1187  }
1188 
1189  // fat links (if they exist)
1190  if (gaugeFatPrecise) {
1192 
1193  if (gaugeFatSloppy != gaugeSloppy) {
1194  gauge_param.setPrecision(prec_sloppy);
1195  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1196 
1197  if (gaugeFatSloppy) errorQuda("gaugeFatSloppy already exists");
1199 
1200  if (gauge_param.precision != gaugeFatPrecise->Precision() ||
1204  } else {
1206  }
1207  }
1208 
1210  // switch the parameters for creating the mirror preconditioner cuda gauge field
1211  gauge_param.setPrecision(prec_precondition);
1212  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1213 
1214  if (gaugeFatPrecondition) errorQuda("gaugeFatPrecondition already exists\n");
1215 
1216  if (gauge_param.precision != gaugeFatSloppy->Precision() ||
1220  } else {
1222  }
1223  }
1224  }
1225 
1226  // long links (if they exist)
1227  if (gaugeLongPrecise) {
1229  gauge_param.setPrecision(prec_sloppy);
1230  //gauge_param.reconstruct = param->reconstruct_sloppy; // FIXME
1231 
1232  if (gaugeLongSloppy) errorQuda("gaugeLongSloppy already exists");
1234 
1235  if (gauge_param.precision != gaugeLongPrecise->Precision() ||
1239  } else {
1241  }
1242 
1243  // switch the parameters for creating the mirror preconditioner cuda gauge field
1244  gauge_param.setPrecision(prec_precondition);
1245  //gauge_param.reconstruct = param->reconstruct_precondition; // FIXME
1246 
1247  if (gaugeLongPrecondition) warningQuda("gaugeLongPrecondition already exists\n");
1248 
1249  if (gauge_param.precision != gaugeLongSloppy->Precision() ||
1253  } else {
1255  }
1256  }
1257 }
1258 
// Free the sloppy/preconditioner clover fields and reset the resident
// pointers. NOTE(review): the signature and the delete statements are elided
// in this listing (lines 1259, 1262-1263) — only the pointer resets survive.
1260 {
1261  if (!initialized) errorQuda("QUDA not initialized");
1264  cloverPrecondition = nullptr;
1265  cloverSloppy = nullptr;
1266 }
1267 
// Free the resident precise clover field (line 1271, elided here, presumably
// frees the sloppy/preconditioner copies first — TODO confirm).
1268 void freeCloverQuda(void)
1269 {
1270  if (!initialized) errorQuda("QUDA not initialized");
1272  if (cloverPrecise) delete cloverPrecise;
// Reset so a stale pointer is never reused by later solves.
1273  cloverPrecise = nullptr;
1274 }
1275 
// Release the chronological-forecasting basis stored at index i, deleting both
// fields of every stored pair and emptying the container. The signature
// (line 1276) is elided in this listing; `i` is the chrono index argument.
1277 {
1278  if (i >= QUDA_MAX_CHRONO)
1279  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
1280 
1281  auto &basis = chronoResident[i];
1282 
// Each basis entry is a pair of owned field pointers; guard against nulls.
1283  for (unsigned int j=0; j<basis.size(); j++) {
1284  if (basis[j].first) delete basis[j].first;
1285  if (basis[j].second) delete basis[j].second;
1286  }
1287  basis.clear();
1288 }
1289 
// Tear down the QUDA library: free all resident fields, shut down the blas
// and communications subsystems, destroy CUDA streams, dump profiling data,
// and optionally reset the CUDA device. Safe to call when not initialized
// (returns immediately). NOTE(review): several lines of this listing are
// elided (non-contiguous numbering), so some cleanup steps are not visible.
1290 void endQuda(void)
1291 {
1292  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
1293 
1294  if (!initialized) return;
1295 
// Release all resident gauge/clover fields and chrono bases.
1296  freeGaugeQuda();
1297  freeCloverQuda();
1298 
1299  for (int i=0; i<QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
1300 
1301  for (auto v : solutionResident) if (v) delete v;
1302  solutionResident.clear();
1303 
1304  if(momResident) delete momResident;
1305 
1308 
1309  blas::end();
1310 
1313 
// Failure counters were (presumably) freed in the elided lines above;
// clear the pointers here.
1315  num_failures_h = NULL;
1316  num_failures_d = NULL;
1317 
1318  if (streams) {
1319  for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
1320  delete []streams;
1321  streams = NULL;
1322  }
1324 
// Persist autotuning results and profile data before shutting down comms.
1325  saveTuneCache();
1326  saveProfile();
1327 
1328  initialized = false;
1329 
1330  comm_finalize();
1331  comms_initialized = false;
1332 
1335 
1336  // print out the profile information of the lifetime of the library
1337  if (getVerbosity() >= QUDA_SUMMARIZE) {
1338  profileInit.Print();
1339  profileGauge.Print();
1340  profileClover.Print();
1341  profileDslash.Print();
1342  profileInvert.Print();
1343  profileMulti.Print();
1352  profileCovDev.Print();
1353  profilePlaq.Print();
1355  profileAPE.Print();
1356  profileSTOUT.Print();
1358  profilePhase.Print();
1360  profileEnd.Print();
1361 
1364 
1365  printLaunchTimer();
1366  printAPIProfile();
1367 
1368  printfQuda("\n");
1370  printfQuda("\n");
1371  }
1372 
// Abort if any device/host allocations tracked by QUDA were leaked.
1373  assertAllMemFree();
1374 
1375  char *device_reset_env = getenv("QUDA_DEVICE_RESET");
1376  if (device_reset_env && strcmp(device_reset_env,"1") == 0) {
1377  // end this CUDA context
1378  cudaDeviceReset();
1379  }
1380 
1381 }
1382 
1383 
1384 namespace quda {
1385 
// Translate a QudaInvertParam into a DiracParam describing the precise
// (full-precision) Dirac operator: pick the operator type from dslash_type
// (with even-odd-preconditioned variants when pc is true), copy the physics
// parameters, and attach the resident precise gauge/clover fields.
// NOTE(review): several case labels and branch bodies are elided in this
// listing (non-contiguous numbering) — e.g. the clover, domain-wall and
// twisted-mass case labels are missing their `case` lines.
1386  void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1387  {
1388  double kappa = inv_param->kappa;
1391  }
1392 
1393  switch (inv_param->dslash_type) {
1394  case QUDA_WILSON_DSLASH:
1395  diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
1396  break;
1398  diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
1399  break;
1402  diracParam.Ls = inv_param->Ls;
1403  break;
// 4D-preconditioned domain wall requires even-odd preconditioning.
1405  if(pc) {
1406  diracParam.type = QUDA_DOMAIN_WALL_4DPC_DIRAC;
1407  diracParam.Ls = inv_param->Ls;
1408  } else errorQuda("For 4D type of DWF dslash, pc must be turned on, %d", inv_param->dslash_type);
1409  break;
1411  if (inv_param->Ls > QUDA_MAX_DWF_LS)
1412  errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
1414  diracParam.Ls = inv_param->Ls;
// Mobius coefficients: copy the per-slice b_5/c_5 arrays.
1415  memcpy(diracParam.b_5, inv_param->b_5, sizeof(double)*inv_param->Ls);
1416  memcpy(diracParam.c_5, inv_param->c_5, sizeof(double)*inv_param->Ls);
1417  break;
1418  case QUDA_STAGGERED_DSLASH:
1419  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1420  break;
1421  case QUDA_ASQTAD_DSLASH:
1422  diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
1423  break;
// Twisted-mass/clover flavors (case labels elided): Ls encodes the flavor
// count — 1 for a single flavor, 2 for the non-degenerate doublet.
1427  diracParam.Ls = 1;
1428  diracParam.epsilon = 0.0;
1429  } else {
1430  diracParam.Ls = 2;
1432  }
1433  break;
1437  diracParam.Ls = 1;
1438  diracParam.epsilon = 0.0;
1439  } else {
1440  diracParam.Ls = 2;
1442  }
1443  break;
1444  case QUDA_LAPLACE_DSLASH:
1446  break;
1447  case QUDA_COVDEV_DSLASH:
1448  diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
1449  break;
1450  default:
1451  errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
1452  }
1453 
// Attach physics parameters and the resident precise fields.
1454  diracParam.matpcType = inv_param->matpc_type;
1455  diracParam.dagger = inv_param->dagger;
1456  diracParam.gauge = gaugePrecise;
1457  diracParam.fatGauge = gaugeFatPrecise;
1458  diracParam.longGauge = gaugeLongPrecise;
1459  diracParam.clover = cloverPrecise;
1460  diracParam.kappa = kappa;
1461  diracParam.mass = inv_param->mass;
1462  diracParam.m5 = inv_param->m5;
1463  diracParam.mu = inv_param->mu;
1464 
1465  for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on
1466  }
1467 
1468 
1469  void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
1470  {
1471  setDiracParam(diracParam, inv_param, pc);
1472 
1473  diracParam.gauge = gaugeSloppy;
1474  diracParam.fatGauge = gaugeFatSloppy;
1475  diracParam.longGauge = gaugeLongSloppy;
1476  diracParam.clover = cloverSloppy;
1477 
1478  for (int i=0; i<4; i++) {
1479  diracParam.commDim[i] = 1; // comms are always on
1480  }
1481 
1482  }
1483 
1484  // The preconditioner currently mimicks the sloppy operator with no comms
// Populate a DiracParam for the preconditioner operator: reuse the precise
// settings, then attach either the extended fields (overlap preconditioning)
// or the dedicated preconditioner fields, and enable/disable communications
// per the `comms` flag. NOTE(review): the condition of the final `if` (lines
// 1505-1506) is elided — it presumably tests for a staggered-type
// preconditioner dslash; TODO confirm against the full source.
1485  void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
1486  {
1487  setDiracParam(diracParam, inv_param, pc);
1488 
1489  if(inv_param->overlap){
1490  diracParam.gauge = gaugeExtended;
1491  diracParam.fatGauge = gaugeFatExtended;
1492  diracParam.longGauge = gaugeLongExtended;
1493  }else{
1494  diracParam.gauge = gaugePrecondition;
1495  diracParam.fatGauge = gaugeFatPrecondition;
1496  diracParam.longGauge = gaugeLongPrecondition;
1497  }
1498  diracParam.clover = cloverPrecondition;
1499 
1500  for (int i=0; i<4; i++) {
1501  diracParam.commDim[i] = comms ? 1 : 0;
1502  }
1503 
1504  // In the preconditioned staggered CG allow a different dslash type in the preconditioning
1507  diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
1508  diracParam.gauge = gaugeFatPrecondition;
1509  }
1510  }
1511 
1512 
1513  void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
1514  {
1515  DiracParam diracParam;
1516  DiracParam diracSloppyParam;
1517  DiracParam diracPreParam;
1518 
1519  setDiracParam(diracParam, &param, pc_solve);
1520  setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
1521  bool comms_flag = (param.inv_type != QUDA_INC_EIGCG_INVERTER) ? false : true ;//inc eigCG needs 2 sloppy precisions.
1522  setDiracPreParam(diracPreParam, &param, pc_solve, comms_flag);
1523 
1524 
1525  d = Dirac::create(diracParam); // create the Dirac operator
1526  dSloppy = Dirac::create(diracSloppyParam);
1527  dPre = Dirac::create(diracPreParam);
1528  }
1529 
// Rescale the source vector b (and any multi-shift offsets) to compensate for
// the normalization convention of the Dirac operator being solved.
// NOTE(review): the function signature and `{` are elided in this listing
// (lines 1530/1532) — presumably `massRescale(cudaColorSpinorField &b,
// QudaInvertParam &param)`; TODO confirm against the full source. Some case
// labels are also elided (e.g. the MATDAG_MAT / MATPCDAG_MATPC cases).
1531 
1533 
// Domain-wall variants use the 5d kappa built from m5; others use param.kappa.
1534  double kappa5 = (0.5/(5.0 + param.m5));
1535  double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
1536  param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
1537  param.dslash_type == QUDA_MOBIUS_DWF_DSLASH) ? kappa5 : param.kappa;
1538 
1540  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1541  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1542  double nin = blas::norm2(b);
1543  printfQuda("Mass rescale: norm of source in = %g\n", nin);
1544  }
1545 
1546  // staggered dslash uses mass normalization internally
1547  if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
1548  switch (param.solution_type) {
1549  case QUDA_MAT_SOLUTION:
1550  case QUDA_MATPC_SOLUTION:
1551  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b);
1552  break;
1555  if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b);
1556  break;
1557  default:
1558  errorQuda("Not implemented");
1559  }
1560  return;
1561  }
1562 
// Remember the unscaled shifts so they can be restored after the solve.
1563  for(int i=0; i<param.num_offset; i++) {
1564  unscaled_shifts[i] = param.offset[i];
1565  }
1566 
1567  // multiply the source to compensate for normalization of the Dirac operator, if necessary
1568  switch (param.solution_type) {
1569  case QUDA_MAT_SOLUTION:
1570  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1571  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1572  blas::ax(2.0*kappa, b);
1573  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1574  }
1575  break;
1577  if (param.mass_normalization == QUDA_MASS_NORMALIZATION ||
1578  param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1579  blas::ax(4.0*kappa*kappa, b);
1580  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1581  }
1582  break;
1583  case QUDA_MATPC_SOLUTION:
1584  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1585  blas::ax(4.0*kappa*kappa, b);
1586  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1587  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1588  blas::ax(2.0*kappa, b);
1589  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 2.0*kappa;
1590  }
1591  break;
1593  if (param.mass_normalization == QUDA_MASS_NORMALIZATION) {
1594  blas::ax(16.0*std::pow(kappa,4), b);
1595  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 16.0*std::pow(kappa,4);
1596  } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
1597  blas::ax(4.0*kappa*kappa, b);
1598  for(int i=0; i<param.num_offset; i++) param.offset[i] *= 4.0*kappa*kappa;
1599  }
1600  break;
1601  default:
1602  errorQuda("Solution type %d not supported", param.solution_type);
1603  }
1604 
1605  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
1606  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1607  printfQuda("Mass rescale: Kappa is: %g\n", kappa);
1608  printfQuda("Mass rescale: mass normalization: %d\n", param.mass_normalization);
1609  double nin = blas::norm2(b);
1610  printfQuda("Mass rescale: norm of source out = %g\n", nin);
1611  }
1612 
1613  }
1614 }
1615 
// Apply a single parity dslash to the host spinor h_in, writing the result to
// h_out. Wraps the host pointers in ColorSpinorFields, transfers to the
// device, applies the operator, and transfers back. NOTE(review): several
// lines are elided (profiling, parameter checks, twisted-clover conditional
// headers), so some guards are only partially visible.
1616 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
1617 {
1619 
1624 
1625  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1627  errorQuda("Clover field not allocated");
1628 
1631 
// Wrap host input, then reuse cpuParam (with v/location retargeted) for output.
1633  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1634  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1635 
1636  cpuParam.v = h_out;
1637  cpuParam.location = inv_param->output_location;
1638  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1639 
1640  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1641  cudaColorSpinorField in(*in_h, cudaParam);
1642  cudaColorSpinorField out(in, cudaParam);
1643 
1644  bool pc = true;
1645  DiracParam diracParam;
1646  setDiracParam(diracParam, inv_param, pc);
1647 
1649 
// download the source field to the device
1651  in = *in_h;
1653 
1655 
1656  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1657  double cpu = blas::norm2(*in_h);
1658  double gpu = blas::norm2(in);
1659  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1660  }
1661 
// Mass-normalization rescaling (the guarding condition is elided here).
1665  blas::ax(1.0/(2.0*inv_param->mass), in);
1666 
1668  if (parity == QUDA_EVEN_PARITY) {
1670  } else {
1672  }
1674  }
1675 
1676  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
// Twisted-clover path (condition elided): apply the inverse clover-twist
// to the opposite parity before the hopping term.
1678  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1679  cudaColorSpinorField tmp1(in, cudaParam);
1680  ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist
1681  dirac->Dslash(out, tmp1, parity); // apply the operator
1682  } else {
1683  dirac->Dslash(out, in, parity); // apply the operator
1684  }
1686 
// upload the result back to the host wrapper
1688  *out_h = out;
1690 
1691  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1692  double cpu = blas::norm2(*out_h);
1693  double gpu = blas::norm2(out);
1694  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1695  }
1696 
1698  delete dirac; // clean up
1699 
1700  delete out_h;
1701  delete in_h;
1703 
1704  popVerbosity();
1706 }
1707 
// Test entry point applying individual pieces of the 4D-preconditioned
// domain-wall operator (Dslash4 / Dslash5 / Dslash5inv) selected by
// test_type. NOTE(review): the signature is elided in this listing —
// presumably it takes (h_out, h_in, inv_param, parity, test_type); the
// leading dslash_type check's `if` line is also elided.
1709 {
1711  setKernelPackT(true);
1712  else
1713  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1714 
1715  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1716 
1719 
1721  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1722 
1723  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1724  cudaColorSpinorField in(*in_h, cudaParam);
1725 
1726  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1727  double cpu = blas::norm2(*in_h);
1728  double gpu = blas::norm2(in);
1729  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1730  }
1731 
1732  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1733  cudaColorSpinorField out(in, cudaParam);
1734 
// Parity remapping block (condition and bodies elided in this listing).
1736  if (parity == QUDA_EVEN_PARITY) {
1738  } else {
1740  }
1742  }
1743  bool pc = true;
1744 
1745  DiracParam diracParam;
1746  setDiracParam(diracParam, inv_param, pc);
1747 
1748  DiracDomainWall4DPC dirac(diracParam); // create the Dirac operator
1749  printfQuda("kappa for QUDA input : %e\n",inv_param->kappa);
// Select which component of the operator to exercise.
1750  switch (test_type) {
1751  case 0:
1752  dirac.Dslash4(out, in, parity);
1753  break;
1754  case 1:
1755  dirac.Dslash5(out, in, parity);
1756  break;
1757  case 2:
1758  dirac.Dslash5inv(out, in, parity, inv_param->kappa);
1759  break;
1760  }
1761 
1762  cpuParam.v = h_out;
1763  cpuParam.location = inv_param->output_location;
1764  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1765  *out_h = out;
1766 
1767  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1768  double cpu = blas::norm2(*out_h);
1769  double gpu = blas::norm2(out);
1770  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1771  }
1772 
1773  delete out_h;
1774  delete in_h;
1775 
1776  popVerbosity();
1777 }
1778 
// Test entry point applying individual pieces of the Mobius domain-wall
// operator (Dslash4 / Dslash5 / Dslash4pre / Dslash5inv) selected by
// test_type. NOTE(review): the signature is elided in this listing —
// presumably (h_out, h_in, inv_param, parity, test_type); the leading
// dslash_type check's `if` line is also elided.
1780 {
1782  setKernelPackT(true);
1783  else
1784  errorQuda("This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1785 
1786  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1787 
1790 
1792  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1793 
1794  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1795  cudaColorSpinorField in(*in_h, cudaParam);
1796 
1797  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1798  double cpu = blas::norm2(*in_h);
1799  double gpu = blas::norm2(in);
1800  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1801  }
1802 
1803  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1804  cudaColorSpinorField out(in, cudaParam);
1805 
// Parity remapping block (condition and bodies elided in this listing).
1807  if (parity == QUDA_EVEN_PARITY) {
1809  } else {
1811  }
1813  }
1814  bool pc = true;
1815 
1816  DiracParam diracParam;
1817  setDiracParam(diracParam, inv_param, pc);
1818 
1819  DiracMobiusPC dirac(diracParam); // create the Dirac operator
// Select which component of the Mobius operator to exercise.
1820  switch (test_type) {
1821  case 0:
1822  dirac.Dslash4(out, in, parity);
1823  break;
1824  case 1:
1825  dirac.Dslash5(out, in, parity);
1826  break;
1827  case 2:
1828  dirac.Dslash4pre(out, in, parity);
1829  break;
1830  case 3:
1831  dirac.Dslash5inv(out, in, parity);
1832  break;
1833  }
1834 
1835  cpuParam.v = h_out;
1836  cpuParam.location = inv_param->output_location;
1837  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1838  *out_h = out;
1839 
1840  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1841  double cpu = blas::norm2(*out_h);
1842  double gpu = blas::norm2(out);
1843  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1844  }
1845 
1846  delete out_h;
1847  delete in_h;
1848 
1849  popVerbosity();
1850 }
1851 
1852 
// Apply the full Dirac matrix M to the host spinor h_in, writing M*h_in to
// h_out, with post-hoc rescaling to match the requested mass normalization.
// NOTE(review): several lines are elided (initialization checks, the second
// half of the `pc` condition, and the normalization conditionals around the
// blas::ax calls), so the rescale branches are only partially visible.
1853 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1854 {
1856 
1860 
1861  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1863  errorQuda("Clover field not allocated");
1865 
// pc is true for preconditioned (checkerboard) solution types.
1866  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1868 
1869  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc, inv_param->input_location);
1870  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1871 
1872  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1873  cudaColorSpinorField in(*in_h, cudaParam);
1874 
1875  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1876  double cpu = blas::norm2(*in_h);
1877  double gpu = blas::norm2(in);
1878  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1879  }
1880 
1881  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1882  cudaColorSpinorField out(in, cudaParam);
1883 
1884  DiracParam diracParam;
1885  setDiracParam(diracParam, inv_param, pc);
1886 
1887  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1888  dirac->M(out, in); // apply the operator
1889  delete dirac; // clean up
1890 
// Undo the kappa normalization of the operator (conditions elided).
1891  double kappa = inv_param->kappa;
1892  if (pc) {
1894  blas::ax(0.25/(kappa*kappa), out);
1896  blas::ax(0.5/kappa, out);
1897  }
1898  } else {
1901  blas::ax(0.5/kappa, out);
1902  }
1903  }
1904 
1905  cpuParam.v = h_out;
1906  cpuParam.location = inv_param->output_location;
1907  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1908  *out_h = out;
1909 
1910  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
1911  double cpu = blas::norm2(*out_h);
1912  double gpu = blas::norm2(out);
1913  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1914  }
1915 
1916  delete out_h;
1917  delete in_h;
1918 
1919  popVerbosity();
1920 }
1921 
1922 
// Apply the normal operator M^dag M to the host spinor h_in, writing the
// result to h_out, with rescaling for the requested mass normalization.
// NOTE(review): the second half of the `pc` condition and the normalization
// conditionals around the blas::ax calls are elided in this listing.
1923 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
1924 {
1926 
1930 
1931  if (!initialized) errorQuda("QUDA not initialized");
1932  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
1934  errorQuda("Clover field not allocated");
1936 
1937  bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
1939 
1940  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc, inv_param->input_location);
1941  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
1942 
1943  ColorSpinorParam cudaParam(cpuParam, *inv_param);
1944  cudaColorSpinorField in(*in_h, cudaParam);
1945 
1946  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
1947  double cpu = blas::norm2(*in_h);
1948  double gpu = blas::norm2(in);
1949  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
1950  }
1951 
1952  cudaParam.create = QUDA_NULL_FIELD_CREATE;
1953  cudaColorSpinorField out(in, cudaParam);
1954 
1955  // double kappa = inv_param->kappa;
1956  // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
1957 
1958  DiracParam diracParam;
1959  setDiracParam(diracParam, inv_param, pc);
1960 
1961  Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
1962  dirac->MdagM(out, in); // apply the operator
1963  delete dirac; // clean up
1964 
// Undo the kappa normalization; M^dag M needs kappa powers squared relative
// to the single-application case in MatQuda (conditions elided).
1965  double kappa = inv_param->kappa;
1966  if (pc) {
1968  blas::ax(1.0/std::pow(2.0*kappa,4), out);
1970  blas::ax(0.25/(kappa*kappa), out);
1971  }
1972  } else {
1975  blas::ax(0.25/(kappa*kappa), out);
1976  }
1977  }
1978 
1979  cpuParam.v = h_out;
1980  cpuParam.location = inv_param->output_location;
1981  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
1982  *out_h = out;
1983 
1984  if (getVerbosity() >= QUDA_DEBUG_VERBOSE){
1985  double cpu = blas::norm2(*out_h);
1986  double gpu = blas::norm2(out);
1987  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
1988  }
1989 
1990  delete out_h;
1991  delete in_h;
1992 
1993  popVerbosity();
1994 }
1995 
// Helper reporting whether the resident precise gauge field exists and
// matches the requested solve precision. NOTE(review): the function signature
// (line 1997) is elided in this listing — only the return expression is
// visible; `param` is presumably a QudaInvertParam*.
1996 namespace quda{
1998  return (gaugePrecise != NULL) and param->cuda_prec == gaugePrecise->Precision();
1999 }
2000 }
2001 
// Validate that the resident clover fields required by a clover-type dslash
// exist and match the requested precision; no-op for non-clover operators.
// NOTE(review): the signature (line 2002) and an intermediate block
// (lines 2012-2015) are elided in this listing. Also note the error messages
// below say "gauge field" although they test clover fields — likely a
// copy/paste slip worth fixing upstream.
2003 
2004  if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
2005  return;
2006  }
2007 
2008  if (param->cuda_prec != cloverPrecise->Precision()) {
2009  errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision());
2010  }
2011 
2016  }
2017 
2018  if (cloverPrecise == nullptr) errorQuda("Precise gauge field doesn't exist");
2019  if (cloverSloppy == nullptr) errorQuda("Sloppy gauge field doesn't exist");
2020  if (cloverPrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist");
2021 }
2022 
2024 
2025  if (param->cuda_prec != gaugePrecise->Precision()) {
2026  errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision());
2027  }
2028 
2030  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
2035  }
2036 
2037  if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");
2038  if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");
2039  if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");
2040  if (param->overlap) {
2041  if (gaugeExtended == NULL) errorQuda("Extended gauge field doesn't exist");
2042  }
2044  } else {
2051  }
2052 
2053  if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");
2054  if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");
2055  if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");
2056  if (param->overlap) {
2057  if(gaugeFatExtended == NULL) errorQuda("Extended gauge fat field doesn't exist");
2058  }
2059 
2060  if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");
2061  if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");
2062  if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");
2063  if (param->overlap) {
2064  if(gaugeLongExtended == NULL) errorQuda("Extended gauge long field doesn't exist");
2065  }
2067  }
2068 
2069  checkClover(param);
2070 
2071  return cudaGauge;
2072 }
2073 
// Apply the clover term (or its inverse, when `inverse` is non-zero) to the
// single-parity host spinor h_in, writing the result to h_out.
// NOTE(review): some lines are elided (the dslash_type guard's `if`, the
// input-location conditional header, the parity remap block), so those
// guards are only partially visible.
2074 void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
2075 {
2077 
2078  if (!initialized) errorQuda("QUDA not initialized");
2079  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
2080  if (cloverPrecise == NULL) errorQuda("Clover field not allocated");
2081 
2083 
2085  errorQuda("Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2086 
2087  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
2088 
// Wrap the host pointer as a cpu or cuda field depending on input location
// (the selecting condition is elided in this listing).
2090  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2091  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2092 
2093  ColorSpinorParam cudaParam(cpuParam, *inv_param);
2094  cudaColorSpinorField in(*in_h, cudaParam);
2095 
2096  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2097  double cpu = blas::norm2(*in_h);
2098  double gpu = blas::norm2(in);
2099  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
2100  }
2101 
2102  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2103  cudaColorSpinorField out(in, cudaParam);
2104 
// Parity remapping block (condition and bodies elided in this listing).
2106  if (parity == QUDA_EVEN_PARITY) {
2108  } else {
2110  }
2112  }
2113  bool pc = true;
2114 
2115  DiracParam diracParam;
2116  setDiracParam(diracParam, inv_param, pc);
2117  //FIXME: Do we need this for twisted clover???
2118  DiracCloverPC dirac(diracParam); // create the Dirac operator
2119  if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator
2120  else dirac.CloverInv(out, in, parity);
2121 
2122  cpuParam.v = h_out;
2123  cpuParam.location = inv_param->output_location;
2124  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
2125  *out_h = out;
2126 
2127  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
2128  double cpu = blas::norm2(*out_h);
2129  double gpu = blas::norm2(out);
2130  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
2131  }
2132 
2133  /*for (int i=0; i<in_h->Volume(); i++) {
2134  ((cpuColorSpinorField*)out_h)->PrintVector(i);
2135  }*/
2136 
2137  delete out_h;
2138  delete in_h;
2139 
2140  popVerbosity();
2141 }
2142 
2143 
// Run a Lanczos eigensolve: wraps the host-side residual (hp_r), operator
// image (hp_Apsi) and m Ritz vectors (hp_V) as spinor fields, downloads them
// to the device, builds the requested Ritz matrix from the Dirac operator,
// runs the eigensolver writing alpha/beta coefficients to hp_alpha/hp_beta,
// and uploads the results back to the host buffers.
//   k0 - starting Lanczos index; m - total basis size.
// NOTE(review): some lines are elided (profiling calls, a checkGauge-style
// call around line 2166, and an `else if` header at 2254), so control flow
// around those points is only partially visible.
2144 void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V,
2145  void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
2146 {
2148  param = eig_param->invert_param;
2149 
// Domain-wall variants require T-dimension kernel packing.
2150  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2151  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2152  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2153  if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
2154 
2156 
2157  if (!initialized) errorQuda("QUDA not initialized");
2158 
2159  pushVerbosity(param->verbosity);
2161 
2162  checkInvertParam(param);
2163 
2164  // check the gauge fields have been created
2166 
2167  checkEigParam(eig_param);
2168 
2169  bool pc_solution = (param->solution_type == QUDA_MATPC_DAG_SOLUTION) ||
2170  (param->solution_type == QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION);
2171 
2172  // create the dirac operator
2173  DiracParam diracParam;
2174  setDiracParam(diracParam, param, pc_solution);
2175  Dirac *d = Dirac::create(diracParam); // create the Dirac operator
2176 
2177  Dirac &dirac = *d;
2178 
2180 
2181  cudaColorSpinorField *r = NULL;
2182  cudaColorSpinorField *Apsi = NULL;
2183  const int *X = cudaGauge->X();
2184 
2185  // wrap CPU host side pointers
2186  ColorSpinorParam cpuParam(hp_r, *param, X, pc_solution);
2187  ColorSpinorField *h_r = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?
2188  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2189  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2190 
2191  cpuParam.v = hp_Apsi;
2192  ColorSpinorField *h_Apsi = (param->input_location == QUDA_CPU_FIELD_LOCATION) ?
2193  static_cast<ColorSpinorField*>(new cpuColorSpinorField(cpuParam)) :
2194  static_cast<ColorSpinorField*>(new cudaColorSpinorField(cpuParam));
2195 
2196  //Make Eigen vector data set
2197  cpuColorSpinorField **h_Eig_Vec;
2198  h_Eig_Vec =(cpuColorSpinorField **)safe_malloc( m*sizeof(cpuColorSpinorField*));
2199  for( int k = 0 ; k < m ; k++)
2200  {
2201  cpuParam.v = ((double**)hp_V)[k];
2202  h_Eig_Vec[k] = new cpuColorSpinorField(cpuParam);
2203  }
2204 
2205  // download source
2206  ColorSpinorParam cudaParam(cpuParam, *param);
2207  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2208  r = new cudaColorSpinorField(*h_r, cudaParam);
2209  Apsi = new cudaColorSpinorField(*h_Apsi, cudaParam);
2210 
2211  double cpu;
2212  double gpu;
2213 
2214  if (getVerbosity() >= QUDA_VERBOSE) {
2215  cpu = blas::norm2(*h_r);
2216  gpu = blas::norm2(*r);
2217  printfQuda("r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2218  cpu = blas::norm2(*h_Apsi);
2219  gpu = blas::norm2(*Apsi);
2220  printfQuda("Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2221  }
2222 
2223  // download Eigen vector set
2224  cudaColorSpinorField **Eig_Vec;
2225  Eig_Vec = (cudaColorSpinorField **)safe_malloc( m*sizeof(cudaColorSpinorField*));
2226 
2227  for( int k = 0 ; k < m ; k++)
2228  {
2229  Eig_Vec[k] = new cudaColorSpinorField(*h_Eig_Vec[k], cudaParam);
2230  if (getVerbosity() >= QUDA_VERBOSE) {
2231  cpu = blas::norm2(*h_Eig_Vec[k]);
2232  gpu = blas::norm2(*Eig_Vec[k]);
2233  printfQuda("Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);
2234  }
2235  }
2237 
// Choose the Ritz matrix wrapping of the Dirac operator and run the solver.
2238  if(eig_param->RitzMat_lanczos == QUDA_MATPC_DAG_SOLUTION)
2239  {
2240  DiracMdag mat(dirac);
2241  RitzMat ritz_mat(mat,*eig_param);
2242  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2243  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2244  delete eig_solve;
2245  }
2246  else if(eig_param->RitzMat_lanczos == QUDA_MATPCDAG_MATPC_SOLUTION)
2247  {
2248  DiracMdagM mat(dirac);
2249  RitzMat ritz_mat(mat,*eig_param);
2250  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2251  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2252  delete eig_solve;
2253  }
// Third branch's `else if` header (line 2254) is elided in this listing.
2255  {
2256  DiracMdagM mat(dirac);
2257  RitzMat ritz_mat(mat,*eig_param);
2258  Eig_Solver *eig_solve = Eig_Solver::create(*eig_param, ritz_mat, profileInvert);
2259  (*eig_solve)((double*)hp_alpha, (double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2260  delete eig_solve;
2261  }
2262  else
2263  {
2264  errorQuda("invalid ritz matrix type\n");
2265  exit(0);
2266  }
2267 
2268  //Write back calculated eigen vector
2270  for( int k = 0 ; k < m ; k++)
2271  {
2272  *h_Eig_Vec[k] = *Eig_Vec[k];
2273  }
2274  *h_r = *r;
2275  *h_Apsi = *Apsi;
2277 
2278 
// Release all temporaries: host wrappers, device copies, and the operator.
2279  delete h_r;
2280  delete h_Apsi;
2281  for( int k = 0 ; k < m ; k++)
2282  {
2283  delete Eig_Vec[k];
2284  delete h_Eig_Vec[k];
2285  }
2286  host_free(Eig_Vec);
2287  host_free(h_Eig_Vec);
2288 
2289  delete d;
2290 
2291  popVerbosity();
2292 
2293  saveTuneCache();
2295 }
2296 
// multigrid_solver constructor (signature at line 2297 elided in this
// listing): validates the multigrid parameters, builds the three fine-grid
// Dirac operators (residual, smoother, sloppy smoother), allocates the
// null-space vector set B, and constructs the MG preconditioner hierarchy.
2298  : profile(profile) {
2299  profile.TPSTART(QUDA_PROFILE_INIT);
2300  QudaInvertParam *param = mg_param.invert_param;
2301 
2303  checkMultigridParam(&mg_param);
2304 
2305  // check MG params (needs to go somewhere else)
2306  if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
2307  errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL);
// The per-level smoother-solve-type check's condition (line 2309) is elided.
2308  for (int i=0; i<mg_param.n_level; i++) {
2310  errorQuda("Unsupported smoother solve type %d on level %d", mg_param.smoother_solve_type[i], i);
2311  }
2312  if (param->solve_type != QUDA_DIRECT_SOLVE)
2313  errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2314 
2315  pushVerbosity(param->verbosity);
2317  mg_param.secs = 0;
2318  mg_param.gflops = 0;
2319 
2320  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2321  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2322 
2323  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2324  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2325 
2326  // create the dirac operators for the fine grid
2327 
2328  // this is the Dirac operator we use for inter-grid residual computation
2329  DiracParam diracParam;
2330  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2331  d = Dirac::create(diracParam);
2332  m = new DiracM(*d);
2333 
2334  // this is the Dirac operator we use for smoothing
2335  DiracParam diracSmoothParam;
2336  bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2337  (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2338  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2339  dSmooth = Dirac::create(diracSmoothParam);
2340  mSmooth = new DiracM(*dSmooth);
2341 
2342  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2343  DiracParam diracSmoothSloppyParam;
2344  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
2345  dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);;
2347 
2348  printfQuda("Creating vector of null space fields of length %d\n", mg_param.n_vec[0]);
2349 
// Allocate the zero-initialized null-space vectors at sloppy precision.
2350  ColorSpinorParam cpuParam(0, *param, cudaGauge->X(), pc_solution, QUDA_CPU_FIELD_LOCATION);
2351  cpuParam.create = QUDA_ZERO_FIELD_CREATE;
2352  cpuParam.precision = param->cuda_prec_sloppy;
2353  B.resize(mg_param.n_vec[0]);
2354  for (int i=0; i<mg_param.n_vec[0]; i++) B[i] = new cpuColorSpinorField(cpuParam);
2355 
2356  // fill out the MG parameters for the fine level
2357  mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy);
2358 
2359  mg = new MG(*mgParam, profile);
2361  profile.TPSTOP(QUDA_PROFILE_INIT);
2362 }
2363 
2366 
 // Allocate a new multigrid solver from the supplied parameter set and hand
 // it back as an opaque handle.  (NOTE(review): the function signature line
 // is elided from this listing; from the body this is newMultigridQuda,
 // returning the solver cast to void*.)
2367  multigrid_solver *mg = new multigrid_solver(*mg_param, profileInvert);
2368 
2370 
 // persist profiling data and the kernel tune cache so the (potentially
 // expensive) multigrid setup survives an interrupted job
2371  saveProfile(__func__);
2372  flushProfile();
2373  saveTuneCache();
2374  return static_cast<void*>(mg);
2375 }
2376 
2377 void destroyMultigridQuda(void *mg) {
2378  delete static_cast<multigrid_solver*>(mg);
2379 }
2380 
 // Rebuild the fine-grid Dirac operators inside an existing multigrid
 // solver (e.g. after the underlying gauge/clover fields have changed) and
 // then recreate the fine-level smoothers.  The coarse-level setup
 // (null-space vectors, coarse operators) is not regenerated here.
 //
 // @param mg_      opaque handle returned by newMultigridQuda()
 // @param mg_param multigrid parameter set whose invert_param describes the
 //                 outer solver
2381 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) {
2382  multigrid_solver *mg = static_cast<multigrid_solver*>(mg_);
2383 
2384  QudaInvertParam *param = mg_param->invert_param;
 // sanity-check the gauge field and the multigrid parameters before rebuilding
2385  checkGauge(param);
2386  checkMultigridParam(mg_param);
2387 
 // is the outer solve even/odd preconditioned?
2388  bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2389  (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2390 
2391  // free the previous dirac operators (matrix wrappers first, then the Dirac instances)
2392  if (mg->m) delete mg->m;
2393  if (mg->mSmooth) delete mg->mSmooth;
2394  if (mg->mSmoothSloppy) delete mg->mSmoothSloppy;
2395 
2396  if (mg->d) delete mg->d;
2397  if (mg->dSmooth) delete mg->dSmooth;
 // dSmoothSloppy may alias dSmooth, so guard against a double delete
2398  if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth) delete mg->dSmoothSloppy;
2399 
2400  // create new fine dirac operators
2401 
2402  // this is the Dirac operator we use for inter-grid residual computation
2403  DiracParam diracParam;
2404  setDiracSloppyParam(diracParam, param, outer_pc_solve);
2405  mg->d = Dirac::create(diracParam);
2406  mg->m = new DiracM(*(mg->d));
2407 
2408  // this is the Dirac operator we use for smoothing
2409  DiracParam diracSmoothParam;
2410  bool fine_grid_pc_solve = (mg_param->smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) ||
2411  (mg_param->smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE);
2412  setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve);
2413  mg->dSmooth = Dirac::create(diracSmoothParam);
2414  mg->mSmooth = new DiracM(*(mg->dSmooth));
2415 
2416  // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this)
2417  DiracParam diracSmoothSloppyParam;
2418  setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, true);
2419  mg->dSmoothSloppy = Dirac::create(diracSmoothSloppyParam);;
2420  mg->mSmoothSloppy = new DiracM(*(mg->dSmoothSloppy));
2421 
 // point the stored MG parameters at the freshly created operators
 // (NOTE(review): a matSmoothSloppy assignment appears to be elided from
 // this listing — confirm against the full source)
2422  mg->mgParam->matResidual = mg->m;
2423  mg->mgParam->matSmooth = mg->mSmooth;
2425 
2426  // recreate the smoothers on the fine level
2427  mg->mg->destroySmoother();
2428  mg->mg->createSmoother();
2429 
2430  //mgParam = new MGParam(mg_param, B, *m, *mSmooth, *mSmoothSloppy);
2431  //mg = new MG(*mgParam, profile);
2433 }
2434 
 // deflated_solver constructor body: builds the Dirac operator, the Ritz
 // vector set and the Deflation object used by the (incremental) eigCG
 // solvers.  (NOTE(review): the constructor signature line is elided from
 // this listing.)
2436  : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2437 
2438  QudaInvertParam *param = eig_param.invert_param;
2439 
 // deflation is only constructed for the (incremental) eigCG inverters;
 // for anything else all members stay nullptr
2440  if(param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
2441 
2442  profile.TPSTART(QUDA_PROFILE_INIT);
2443 
2445  eig_param.secs = 0;
2446  eig_param.gflops = 0;
2447 
 // use the full-precision Dirac fields when the Ritz precision matches the
 // solver precision, otherwise fall back to the sloppy fields
2448  DiracParam diracParam;
2449  if(eig_param.cuda_prec_ritz == param->cuda_prec)
2450  {
2451  setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2452  } else {
2453  setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE));
2454  }
2455 
2456  const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE);
2457 
 // wrap the normal operator M^dag M for normal-equation PC solves,
 // otherwise wrap M itself
2458  d = Dirac::create(diracParam);
2459  m = pc_solve ? static_cast<DiracMatrix*>( new DiracMdagM(*d) ) : static_cast<DiracMatrix*>( new DiracM(*d));
2460 
 // parameter template for the composite field holding all Ritz vectors
2461  ColorSpinorParam ritzParam(0, *param, cudaGauge->X(), pc_solve, eig_param.location);
2462 
2463  ritzParam.create = QUDA_ZERO_FIELD_CREATE;
2464  ritzParam.is_composite = true;
2465  ritzParam.is_component = false;
2466  ritzParam.composite_dim = param->nev*param->deflation_grid;
2467  ritzParam.setPrecision(param->cuda_prec_ritz);
2468 
2469  if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
2470  ritzParam.fieldOrder = (param->cuda_prec_ritz == QUDA_DOUBLE_PRECISION ) ? QUDA_FLOAT2_FIELD_ORDER : QUDA_FLOAT4_FIELD_ORDER;
2471  if(ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
2472 
2473  //select memory location here, by default ritz vectors will be allocated on the device
2474  //but if not sufficient device memory, then the user may choose mapped type of memory
2475  ritzParam.mem_type = eig_param.mem_type_ritz;
2476  } else { //host location
2477  ritzParam.mem_type = QUDA_MEMORY_PINNED;
2478  }
2479 
 // total number of lattice sites per Ritz vector, used only for the debug
 // size estimate below
2480  int ritzVolume = 1;
 // NOTE(review): loop index 'd' shadows the Dirac pointer 'd' assigned
 // above — harmless here, but easy to misread
2481  for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d];
2482 
2483  if( getVerbosity() == QUDA_DEBUG_VERBOSE ) {
2484 
2485  size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.precision);
 // NOTE(review): this format string has no trailing '\n'
2486  printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)" , byte_estimate, ritzVolume, ritzParam.precision);
2487  if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n");
2488  else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED) printfQuda("Using mapped memory type.\n");
2489  }
2490 
2491  RV = ColorSpinorField::Create(ritzParam);
2492 
2493  deflParam = new DeflationParam(eig_param, RV, *m);
2494 
2495  defl = new Deflation(*deflParam, profile);
2496 
2497  profile.TPSTOP(QUDA_PROFILE_INIT);
2498 }
2499 
 // Create a deflated (eigCG / incremental eigCG) solver instance and return
 // it as an opaque handle, to be released later with destroyDeflationQuda().
2500 void* newDeflationQuda(QudaEigParam *eig_param) {
2502 #ifdef MAGMA_LIB
 // initialize MAGMA when QUDA was built against it
2503  openMagma();
2504 #endif
2505  deflated_solver *defl = new deflated_solver(*eig_param, profileInvert);
2508 
 // persist profiling data accumulated during setup
2509  saveProfile(__func__);
2510  flushProfile();
2511  return static_cast<void*>(defl);
2512 }
2513 
2514 void destroyDeflationQuda(void *df) {
2515 #ifdef MAGMA_LIB
2516  closeMagma();
2517 #endif
2518  delete static_cast<deflated_solver*>(df);
2519 }
2520 
 // Solve the linear system described by param for the host-resident source
 // hp_b, writing the solution to hp_x.  The routine wraps the host
 // pointers, transfers the source to the device, runs the requested solve
 // (direct, normal-operator, normal-error, or the two-pass MATDAG_MAT
 // variants), reconstructs the full solution and copies it back to the host
 // unless the solution is kept resident on the device.
 //
 // NOTE(review): this listing elides several original source lines; in
 // particular the definitions of cudaGauge, tmp and tmp2 used below are not
 // visible here — confirm against the full source.
2521 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
2522 {
 // enable kernel packing in the T dimension for the 5-d (domain-wall type)
 // operators
2523  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2524  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2525  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2526 
2528 
2529  if (!initialized) errorQuda("QUDA not initialized");
2530 
2531  pushVerbosity(param->verbosity);
2533 
2534  checkInvertParam(param);
2535 
2536  // check the gauge fields have been created
2538 
2539  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2540  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2541  // for now, though, so here we factorize everything for convenience.
2542 
2543  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2544  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2545  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2546  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2547  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2548  (param->solution_type == QUDA_MATPC_SOLUTION);
2549  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2550  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2551  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2552  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2553 
 // rough estimate (in GiB) of the spinor memory footprint, reported back
 // to the caller via param->spinorGiB
2554  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2555  if (!pc_solve) param->spinorGiB *= 2;
2556  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2557  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2558  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2559  } else {
2560  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2561  }
2562 
 // reset the timing / flop / iteration counters reported to the caller
2563  param->secs = 0;
2564  param->gflops = 0;
2565  param->iter = 0;
2566 
2567  Dirac *d = NULL;
2568  Dirac *dSloppy = NULL;
2569  Dirac *dPre = NULL;
2570 
2571  // create the dirac operator
2572  createDirac(d, dSloppy, dPre, *param, pc_solve);
2573 
2574  Dirac &dirac = *d;
2575  Dirac &diracSloppy = *dSloppy;
2576  Dirac &diracPre = *dPre;
2577 
2579 
2580  ColorSpinorField *b = NULL;
2581  ColorSpinorField *x = NULL;
2582  ColorSpinorField *in = NULL;
2583  ColorSpinorField *out = NULL;
2584 
2585  const int *X = cudaGauge->X();
2586 
2587  // wrap CPU host side pointers
2588  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
2589  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
2590 
2591  cpuParam.v = hp_x;
2592  cpuParam.location = param->output_location;
2593  ColorSpinorField *h_x = ColorSpinorField::Create(cpuParam);
2594 
2595  // download source
2596  ColorSpinorParam cudaParam(cpuParam, *param);
2597  cudaParam.create = QUDA_COPY_FIELD_CREATE;
2598  b = new cudaColorSpinorField(*h_b, cudaParam);
2599 
2600  // now check if we need to invalidate the solutionResident vectors
 // (they must match the precision of the current solve to be reusable)
2601  bool invalidate = false;
2602  for (auto v : solutionResident)
2603  if (cudaParam.precision != v->Precision()) { invalidate = true; break; }
2604 
2605  if (invalidate) {
2606  for (auto v : solutionResident) if (v) delete v;
2607  solutionResident.clear();
2608  }
2609 
2610  if (!solutionResident.size()) {
2611  cudaParam.create = QUDA_NULL_FIELD_CREATE;
2612  solutionResident.push_back(new cudaColorSpinorField(cudaParam)); // solution
2613  }
 // x aliases the resident solution vector; it is freed (if at all) via
 // solutionResident below, never directly
2614  x = solutionResident[0];
2615 
2616  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2617  // initial guess only supported for single-pass solvers
2618  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2619  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2620  errorQuda("Initial guess not supported for two-pass solver");
2621  }
2622 
2623  *x = *h_x; // solution
2624  } else { // zero initial guess
2625  blas::zero(*x);
2626  }
2627 
2629 
2630  double nb = blas::norm2(*b);
2631  if (nb==0.0) errorQuda("Source has zero norm");
2632 
2633  if (getVerbosity() >= QUDA_VERBOSE) {
2634  double nh_b = blas::norm2(*h_b);
2635  double nh_x = blas::norm2(*h_x);
2636  double nx = blas::norm2(*x);
2637  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2638  printfQuda("Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2639  }
2640 
2641  // rescale the source and solution vectors to help prevent the onset of underflow
2642  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2643  blas::ax(1.0/sqrt(nb), *b);
2644  blas::ax(1.0/sqrt(nb), *x);
2645  }
2646 
2647  massRescale(*static_cast<cudaColorSpinorField*>(b), *param);
2648 
 // let the Dirac operator select the actual in/out fields for the solve
 // (handles the even/odd preconditioned source/solution bookkeeping)
2649  dirac.prepare(in, out, *x, *b, param->solution_type);
2650 
2651  if (getVerbosity() >= QUDA_VERBOSE) {
2652  double nin = blas::norm2(*in);
2653  double nout = blas::norm2(*out);
2654  printfQuda("Prepared source = %g\n", nin);
2655  printfQuda("Prepared solution = %g\n", nout);
2656  }
2657 
2658  if (getVerbosity() >= QUDA_VERBOSE) {
2659  double nin = blas::norm2(*in);
2660  printfQuda("Prepared source post mass rescale = %g\n", nin);
2661  }
2662 
2663  // solution_type specifies *what* system is to be solved.
2664  // solve_type specifies *how* the system is to be solved.
2665  //
2666  // We have the following four cases (plus preconditioned variants):
2667  //
2668  // solution_type solve_type Effect
2669  // ------------- ---------- ------
2670  // MAT DIRECT Solve Ax=b
2671  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
2672  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
2673  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
2674  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
2675  //
2676  // We generally require that the solution_type and solve_type
2677  // preconditioning match. As an exception, the unpreconditioned MAT
2678  // solution_type may be used with any solve_type, including
2679  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
2680  // preconditioned source and reconstruction of the full solution are
2681  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
2682  // respectively.
2683 
2684  if (pc_solution && !pc_solve) {
2685  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
2686  }
2687 
2688  if (!mat_solution && !pc_solution && pc_solve) {
2689  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2690  }
2691 
2692  if (!mat_solution && norm_error_solve) {
2693  errorQuda("Normal-error solve requires Mat solution");
2694  }
2695 
2696  if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) {
2697  errorQuda("Multigrid preconditioning only supported for direct solves");
2698  }
2699 
2700  if (param->use_resident_chrono && (direct_solve || norm_error_solve) ){
2701  errorQuda("Chronological forcasting only presently supported for M^dagger M solver");
2702  }
2703 
2704  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
2706  dirac.Mdag(*in, tmp);
2707  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
2708  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2709  SolverParam solverParam(*param);
2710  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2711  (*solve)(*out, *in);
 // the intermediate solution y becomes the source for the second solve
2712  blas::copy(*in, *out);
2713  solverParam.updateInvertParam(*param);
2714  delete solve;
2715  }
2716 
2717  if (direct_solve) {
2718  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2719  SolverParam solverParam(*param);
2720  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2721  (*solve)(*out, *in);
2722  solverParam.updateInvertParam(*param);
2723  delete solve;
2724  } else if (!norm_error_solve) {
2725  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2726  SolverParam solverParam(*param);
2727 
2728  // chronological forecasting
 // (use past solutions as an extrapolation basis for the initial guess)
2729  if (param->use_resident_chrono && chronoResident[param->chrono_index].size() > 0) {
2730  auto &basis = chronoResident[param->chrono_index];
2731 
2733 
2734  for (unsigned int j=0; j<basis.size(); j++) m(*basis[j].second, *basis[j].first, tmp, tmp2);
2735 
2736  bool orthogonal = true;
2737  bool apply_mat = false;
2738  MinResExt mre(m, orthogonal, apply_mat, profileInvert);
2739  blas::copy(tmp, *in);
2740 
2741  mre(*out, tmp, basis);
2742  }
2743 
2744  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2745  (*solve)(*out, *in);
2746  solverParam.updateInvertParam(*param);
2747  delete solve;
2748  } else { // norm_error_solve
2749  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2751  SolverParam solverParam(*param);
2752  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
2753  (*solve)(tmp, *in); // y = (M M^\dag) b
2754  dirac.Mdag(*out, tmp); // x = M^dag y
2755  solverParam.updateInvertParam(*param);
2756  delete solve;
2757  }
2758 
2759  if (getVerbosity() >= QUDA_VERBOSE){
2760  double nx = blas::norm2(*x);
2761  printfQuda("Solution = %g\n",nx);
2762  }
2763 
 // reconstruct the full solution from the (possibly preconditioned) solve
2765  dirac.reconstruct(*x, *b, param->solution_type);
2766 
2767  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
2768  // rescale the solution
2769  blas::ax(sqrt(nb), *x);
2770  }
2772 
 // copy the solution back to the host unless it is being kept resident
2773  if (!param->make_resident_solution) {
2775  *h_x = *x;
2777  }
2778 
2780 
 // update the chronological basis with the new solution, rotating the
 // oldest entry to the front once the basis is full
2781  if (param->make_resident_chrono) {
2782  int i = param->chrono_index;
2783  if (i >= QUDA_MAX_CHRONO)
2784  errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO);
2785 
2786  auto &basis = chronoResident[i];
2787 
2788  // if we have not yet filled the space, just augment
2789  if ((int)basis.size() < param->max_chrono_dim) {
2790  ColorSpinorParam cs_param(*x);
2791  basis.push_back(std::pair<ColorSpinorField*,ColorSpinorField*>(ColorSpinorField::Create(cs_param),ColorSpinorField::Create(cs_param)));
2792  }
2793 
2794  // shuffle every entry down one and bring the last to the front
2795  ColorSpinorField *tmp = basis[basis.size()-1].first;
2796  for (unsigned int j=basis.size()-1; j>0; j--) basis[j].first = basis[j-1].first;
2797  basis[0].first = tmp;
2798  *(basis[0]).first = *x; // set first entry to new solution
2799  }
2800 
 // optionally report the fermionic action <b, x> back to the caller
2801  if (param->compute_action) {
2802  Complex action = blas::cDotProduct(*b, *x);
2803  param->action[0] = action.real();
2804  param->action[1] = action.imag();
2805  }
2806 
2807  if (getVerbosity() >= QUDA_VERBOSE){
2808  double nx = blas::norm2(*x);
2809  double nh_x = blas::norm2(*h_x);
2810  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2811  }
2813 
2815 
2816  delete h_b;
2817  delete h_x;
2818  delete b;
2819 
 // when the solution is not kept resident, release the cached vectors
 // (this also frees x, which aliases solutionResident[0])
2820  if (!param->make_resident_solution) {
2821  for (auto v: solutionResident) if (v) delete v;
2822  solutionResident.clear();
2823  }
2824 
2825  delete d;
2826  delete dSloppy;
2827  delete dPre;
2828 
2830 
2831  popVerbosity();
2832 
2833  // cache is written out even if a long benchmarking job gets interrupted
2834  saveTuneCache();
2835 
2837 }
2838 
2839 
 // Experimental multi-source variant of invertQuda(): intended to solve the
 // same operator against param->num_src right-hand sides held in composite
 // fields.  Per the comment just below, this is an unfinished adaptation of
 // invertQuda and is not expected to work yet.
 //
 // NOTE(review): this listing elides several original lines; in particular
 // the declarations of the composite fields 'b' and 'x' and the creation of
 // 'b' are not visible here — confirm against the full source.
2848 void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
2849 {
2850 
2851  // currently that code is just a copy of invertQuda and cannot work
2852 
 // enable kernel packing in the T dimension for the 5-d (domain-wall type)
 // operators
2853  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
2854  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
2855  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
2856 
2858 
2859  if (!initialized) errorQuda("QUDA not initialized");
2860 
2861  pushVerbosity(param->verbosity);
2863 
2864  checkInvertParam(param);
2865 
2866  // check the gauge fields have been created
2868 
2869  // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
2870  // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it
2871  // for now, though, so here we factorize everything for convenience.
2872 
2873  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
2874  (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
2875  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
2876  (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2877  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) ||
2878  (param->solution_type == QUDA_MATPC_SOLUTION);
2879  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) ||
2880  (param->solve_type == QUDA_DIRECT_PC_SOLVE);
2881  bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
2882  (param->solve_type == QUDA_NORMERR_PC_SOLVE);
2883 
 // rough per-solve spinor memory estimate (GiB), reported to the caller
2884  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
2885  if (!pc_solve) param->spinorGiB *= 2;
2886  param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
2887  if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
2888  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
2889  } else {
2890  param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
2891  }
2892 
2893  param->secs = 0;
2894  param->gflops = 0;
2895  param->iter = 0;
2896 
2897  Dirac *d = NULL;
2898  Dirac *dSloppy = NULL;
2899  Dirac *dPre = NULL;
2900 
2901  // create the dirac operator
2902  createDirac(d, dSloppy, dPre, *param, pc_solve);
2903 
2904  Dirac &dirac = *d;
2905  Dirac &diracSloppy = *dSloppy;
2906  Dirac &diracPre = *dPre;
2907 
2909 
2910  // std::vector<ColorSpinorField*> b; // Cuda Solutions
2911  // b.resize(param->num_src);
2912  // std::vector<ColorSpinorField*> x; // Cuda Solutions
2913  // x.resize(param->num_src);
2914  ColorSpinorField* in; // = NULL;
2915  //in.resize(param->num_src);
2916  ColorSpinorField* out; // = NULL;
2917  //out.resize(param->num_src);
2918 
2919  // for(int i=0;i < param->num_src;i++){
2920  // in[i] = NULL;
2921  // out[i] = NULL;
2922  // }
2923 
2924  const int *X = cudaGauge->X();
2925 
2926 
2927  // Host pointers for x, take a copy of the input host pointers
2928  void** hp_x;
2929  hp_x = new void* [ param->num_src ];
2930 
2931  void** hp_b;
2932  hp_b = new void* [param->num_src];
2933 
2934  for(int i=0;i < param->num_src;i++){
2935  hp_x[i] = _hp_x[i];
2936  hp_b[i] = _hp_b[i];
2937  }
2938 
2939  // wrap CPU host side pointers
2940  ColorSpinorParam cpuParam(hp_b[0], *param, X, pc_solution, param->input_location);
2941  std::vector<ColorSpinorField*> h_b;
2942  h_b.resize(param->num_src);
2943  for(int i=0; i < param->num_src; i++) {
2944  cpuParam.v = hp_b[i]; //MW: seems weird to set this in the loop
2945  h_b[i] = ColorSpinorField::Create(cpuParam);
2946  }
2947 
2948  // cpuParam.v = hp_x;
2949  cpuParam.location = param->output_location;
2950  std::vector<ColorSpinorField*> h_x;
2951  h_x.resize(param->num_src);
2952 //
2953  for(int i=0; i < param->num_src; i++) {
2954  cpuParam.v = hp_x[i]; //MW: seems weird to set this in the loop
2955  h_x[i] = ColorSpinorField::Create(cpuParam);
2956  }
2957 
2958 
2959  // MW currently checked until here
2960 
2961  // download source
2962  printfQuda("Setup b\n");
2963  ColorSpinorParam cudaParam(cpuParam, *param);
2964  cudaParam.create = QUDA_NULL_FIELD_CREATE;
 // one composite device field holds all num_src sources
2965  cudaParam.is_composite = true;
2966  cudaParam.composite_dim = param->num_src;
2967 
2968  printfQuda("Create b \n");
2970 
2971 
2972 
2973 
2974  for(int i=0; i < param->num_src; i++) {
2975  b->Component(i) = *h_b[i];
2976  }
2977  printfQuda("Done b \n");
2978 
2980  if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
2981  // initial guess only supported for single-pass solvers
2982  if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) &&
2983  (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) {
2984  errorQuda("Initial guess not supported for two-pass solver");
2985  }
2986  cudaParam.is_composite = true;
2987  cudaParam.is_component = false;
2988  cudaParam.composite_dim = param->num_src;
2989 
2990  x = ColorSpinorField::Create(cudaParam);
2991  for(int i=0; i < param->num_src; i++) {
2992  x->Component(i) = *h_x[i];
2993  }
2994 
2995  } else { // zero initial guess
2996  // Create the solution fields filled with zero
2997  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
2998  printfQuda("Create x \n");
2999  x = ColorSpinorField::Create(cudaParam);
3000  printfQuda("Done x \n");
3001  // solution
3002  }
3003 
3005 
 // NOTE(review): nb is never freed in the visible code — likely a leak
3006  double * nb = new double[param->num_src];
3007  for(int i=0; i < param->num_src; i++) {
3008  nb[i] = blas::norm2(b->Component(i));
3009  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nb[i], nb[i]);
3010  if (nb[i]==0.0) errorQuda("Source has zero norm");
3011 
3012  if (getVerbosity() >= QUDA_VERBOSE) {
3013  double nh_b = blas::norm2(*h_b[i]);
3014  double nh_x = blas::norm2(*h_x[i]);
3015  double nx = blas::norm2(x->Component(i));
3016  printfQuda("Source %i: CPU = %g, CUDA copy = %g\n", i, nh_b, nb[i]);
3017  printfQuda("Solution %i: CPU = %g, CUDA copy = %g\n", i, nh_x, nx);
3018  }
3019  }
3020 
3021  // MW checked until here so far
3022 
3023  // rescale the source and solution vectors to help prevent the onset of underflow
3024  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3025  for(int i=0; i < param->num_src; i++) {
3026  blas::ax(1.0/sqrt(nb[i]), b->Component(i));
3027  blas::ax(1.0/sqrt(nb[i]), x->Component(i));
3028  }
3029  }
3030 
3031  for(int i=0; i < param->num_src; i++) {
3032  massRescale(dynamic_cast<cudaColorSpinorField&>( b->Component(i) ), *param);
3033  }
3034 
3035  // MW: need to check what dirac.prepare does
3036  // for now let's just try looping of num_rhs already here???
3037  // for(int i=0; i < param->num_src; i++) {
3038  dirac.prepare(in, out, *x, *b, param->solution_type);
3039 for(int i=0; i < param->num_src; i++) {
3040  if (getVerbosity() >= QUDA_VERBOSE) {
3041  double nin = blas::norm2((in->Component(i)));
3042  double nout = blas::norm2((out->Component(i)));
3043  printfQuda("Prepared source %i = %g\n", i, nin);
3044  printfQuda("Prepared solution %i = %g\n", i, nout);
3045  }
3046 
3047  if (getVerbosity() >= QUDA_VERBOSE) {
3048  double nin = blas::norm2(in->Component(i));
3049  printfQuda("Prepared source %i post mass rescale = %g\n", i, nin);
3050  }
3051  }
3052 
3053  // solution_type specifies *what* system is to be solved.
3054  // solve_type specifies *how* the system is to be solved.
3055  //
3056  // We have the following four cases (plus preconditioned variants):
3057  //
3058  // solution_type solve_type Effect
3059  // ------------- ---------- ------
3060  // MAT DIRECT Solve Ax=b
3061  // MATDAG_MAT DIRECT Solve A^dag y = b, followed by Ax=y
3062  // MAT NORMOP Solve (A^dag A) x = (A^dag b)
3063  // MATDAG_MAT NORMOP Solve (A^dag A) x = b
3064  // MAT NORMERR Solve (A A^dag) y = b, then x = A^dag y
3065  //
3066  // We generally require that the solution_type and solve_type
3067  // preconditioning match. As an exception, the unpreconditioned MAT
3068  // solution_type may be used with any solve_type, including
3069  // DIRECT_PC and NORMOP_PC. In these cases, preparation of the
3070  // preconditioned source and reconstruction of the full solution are
3071  // taken care of by Dirac::prepare() and Dirac::reconstruct(),
3072  // respectively.
3073 
3074  if (pc_solution && !pc_solve) {
3075  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3076  }
3077 
3078  if (!mat_solution && !pc_solution && pc_solve) {
3079  errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3080  }
3081 
3082  if (!mat_solution && norm_error_solve) {
3083  errorQuda("Normal-error solve requires Mat solution");
3084  }
3085 
3086  if (param->inv_type_precondition == QUDA_MG_INVERTER && (pc_solve || pc_solution || !direct_solve || !mat_solution))
3087  errorQuda("Multigrid preconditioning only supported for direct non-red-black solve");
3088 
 // NOTE(review): 'tmp' used below is declared on a line elided from this
 // listing
3089  if (mat_solution && !direct_solve && !norm_error_solve) { // prepare source: b' = A^dag b
3090  for(int i=0; i < param->num_src; i++) {
3092  dirac.Mdag(in->Component(i), tmp);
3093  }
3094  } else if (!mat_solution && direct_solve) { // perform the first of two solves: A^dag y = b
3095  DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3096  SolverParam solverParam(*param);
3097  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3098  solve->solve(*out,*in);
3099  for(int i=0; i < param->num_src; i++) {
3101  }
3102  solverParam.updateInvertParam(*param);
3103  delete solve;
3104  }
3105 
3106  if (direct_solve) {
3107  DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3108  SolverParam solverParam(*param);
3109  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3110  solve->solve(*out,*in);
3111  solverParam.updateInvertParam(*param);
3112  delete solve;
3113  } else if (!norm_error_solve) {
3114  DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3115  SolverParam solverParam(*param);
3116  Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3117  solve->solve(*out,*in);
3118  solverParam.updateInvertParam(*param);
3119  delete solve;
3120  } else { // norm_error_solve
3121  DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3122  errorQuda("norm_error_solve not supported in multi source solve");
3123  //cudaColorSpinorField tmp(*out);
3124  // SolverParam solverParam(*param);
3125  //Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, profileInvert);
3126  //(*solve)(tmp, *in); // y = (M M^\dag) b
3127  //dirac.Mdag(*out, tmp); // x = M^dag y
3128  //solverParam.updateInvertParam(*param,i,i);
3129  // delete solve;
3130  }
3131 
3132  if (getVerbosity() >= QUDA_VERBOSE){
3133  for(int i=0; i < param->num_src; i++) {
3134  double nx = blas::norm2(x->Component(i));
3135  printfQuda("Solution %i = %g\n",i, nx);
3136  }
3137  }
3138 
3139 
 // reconstruct the full solution for every source component
3141  for(int i=0; i< param->num_src; i++){
3142  dirac.reconstruct(x->Component(i), b->Component(i), param->solution_type);
3143  }
3145 
3146  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3147  for(int i=0; i< param->num_src; i++){
3148  // rescale the solution
3149  blas::ax(sqrt(nb[i]), x->Component(i));
3150  }
3151  }
3152 
3153  // MW -- not sure how to handle that here
3154  if (!param->make_resident_solution) {
3156  for(int i=0; i< param->num_src; i++){
3157  *h_x[i] = x->Component(i);
3158  }
3160  }
3161 
3162  if (getVerbosity() >= QUDA_VERBOSE){
3163  for(int i=0; i< param->num_src; i++){
3164  double nx = blas::norm2(x->Component(i));
3165  double nh_x = blas::norm2(*h_x[i]);
3166  printfQuda("Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3167  }
3168  }
3169 
3170  //FIX need to make sure all deletes are correct again
3171  for(int i=0; i < param->num_src; i++){
3172  delete h_x[i];
3173  // delete x[i];
3174  delete h_b[i];
3175  // delete b[i];
3176  }
3177  delete [] hp_b;
3178  delete [] hp_x;
3179 // delete [] b;
3180 // if (!param->make_resident_solution) delete x; // FIXME make this cleaner
3181 
3182  delete d;
3183  delete dSloppy;
3184  delete dPre;
3185  delete x;
3186  delete b;
3187 
3188  popVerbosity();
3189 
3190  // FIXME: added temporarily so that the cache is written out even if a long benchmarking job gets interrupted
3191  saveTuneCache();
3192 
3194 }
3195 
3196 
3197 
// Multi-shift CG solver entry point: solves (M^dag M + offset[i]) x[i] = b for a set of
// ordered shifts in a single Krylov sequence, with optional per-shift sequential-CG
// refinement to the requested tolerances.
// NOTE(review): this doxygen listing omits several original source lines (the embedded
// line numbering jumps, e.g. 3209-3210, 3221, 3329, ...), so profiling start/stop calls,
// the gauge-field check, and some declarations (e.g. cudaGauge, unscaled_shifts) are not
// visible here even though the code below uses them.
3206 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
3207 {
3208 
3211 
 // Domain-wall-type operators require kernel packing in T for halo exchange.
3212  if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
3213  param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
3214  param->dslash_type == QUDA_MOBIUS_DWF_DSLASH) setKernelPackT(true);
3215 
3216  if (!initialized) errorQuda("QUDA not initialized");
3217 
3218  checkInvertParam(param);
3219 
3220  // check the gauge fields have been created
3222 
 // Shift count is bounded by the fixed-size arrays in QudaInvertParam.
3223  if (param->num_offset > QUDA_MAX_MULTI_SHIFT)
3224  errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
3225  param->num_offset, QUDA_MAX_MULTI_SHIFT);
3226 
3227  pushVerbosity(param->verbosity);
3228 
 // Decode the requested solution/solve types; multi-shift only supports
 // normal-equation (MATDAG_MAT / MATPCDAG_MATPC) solves.
3229  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
3230  bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
3231  bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION);
3232  bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
3233 
3234  if (mat_solution) {
3235  errorQuda("Multi-shift solver does not support MAT or MATPC solution types");
3236  }
3237  if (direct_solve) {
3238  errorQuda("Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3239  }
 // NOTE(review): bitwise '&' on bools is well-defined here, but '&&' would be the
 // idiomatic (and short-circuiting) choice.
3240  if (pc_solution & !pc_solve) {
3241  errorQuda("Preconditioned (PC) solution_type requires a PC solve_type");
3242  }
3243  if (!pc_solution & pc_solve) {
3244  errorQuda("In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3245  }
3246 
3247  // No of GiB in a checkerboard of a spinor
 // NOTE(review): cudaGauge is declared in a line omitted from this listing.
3248  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
3249  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
3250 
3251  // **** WARNING *** this may not match implementation...
3252  if( param->inv_type == QUDA_CG_INVERTER ) {
3253  // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
3254  param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
3255  } else {
3256  errorQuda("QUDA only currently supports multi-shift CG");
3257  // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
3258  // (Jegerlehner hep-lat/9612014 eq (3.13)
3259  param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
3260  }
3261 
3262  // Timing and FLOP counters
3263  param->secs = 0;
3264  param->gflops = 0;
3265  param->iter = 0;
3266 
 // Shifted CG requires the offsets in ascending order; validate all pairs.
3267  for (int i=0; i<param->num_offset-1; i++) {
3268  for (int j=i+1; j<param->num_offset; j++) {
3269  if (param->offset[i] > param->offset[j])
3270  errorQuda("Offsets must be ordered from smallest to largest");
3271  }
3272  }
3273 
3274  // Host pointers for x, take a copy of the input host pointers
3275  void** hp_x;
3276  hp_x = new void* [ param->num_offset ];
3277 
3278  void* hp_b = _hp_b;
3279  for(int i=0;i < param->num_offset;i++){
3280  hp_x[i] = _hp_x[i];
3281  }
3282 
3283  // Create the matrix.
3284  // The way this works is that createDirac will create 'd' and 'dSloppy'
3285  // which are global. We then grab these with references...
3286  //
3287  // Balint: Isn't there a nice construction pattern we could use here? This is
3288  // expedient but yucky.
3289  // DiracParam diracParam;
 // For staggered-type operators the smallest shift is folded into the mass term
 // (offset = 4*m^2 for the staggered normal operator).
3290  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3291  param->dslash_type == QUDA_STAGGERED_DSLASH){
3292  param->mass = sqrt(param->offset[0]/4);
3293  }
3294 
3295  Dirac *d = NULL;
3296  Dirac *dSloppy = NULL;
3297  Dirac *dPre = NULL;
3298 
3299  // create the dirac operator
3300  createDirac(d, dSloppy, dPre, *param, pc_solve);
3301  Dirac &dirac = *d;
3302  Dirac &diracSloppy = *dSloppy;
3303 
3304  cudaColorSpinorField *b = NULL; // Cuda RHS
3305  std::vector<ColorSpinorField*> x; // Cuda Solutions
3306  x.resize(param->num_offset);
3307 
3308  // Grab the dimension array of the input gauge field.
3309  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ?
3310  gaugeFatPrecise->X() : gaugePrecise->X();
3311 
3312  // This creates a ColorSpinorParam struct, from the host data
3313  // pointer, the definitions in param, the dimensions X, and whether
3314  // the solution is on a checkerboard instruction or not. These can
3315  // then be used as 'instructions' to create the actual
3316  // ColorSpinorField
3317  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution, param->input_location);
3318  ColorSpinorField *h_b = ColorSpinorField::Create(cpuParam);
3319 
3320  std::vector<ColorSpinorField*> h_x;
3321  h_x.resize(param->num_offset);
3322 
 // Wrap each host solution pointer in a (CPU-side) ColorSpinorField.
3323  cpuParam.location = param->output_location;
3324  for(int i=0; i < param->num_offset; i++) {
3325  cpuParam.v = hp_x[i];
3326  h_x[i] = ColorSpinorField::Create(cpuParam);
3327  }
3328 
3330  profileMulti.TPSTART(QUDA_PROFILE_H2D);
3331  // Now I need a colorSpinorParam for the device
3332  ColorSpinorParam cudaParam(cpuParam, *param);
3333  // This setting will download a host vector
3334  cudaParam.create = QUDA_COPY_FIELD_CREATE;
3335  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
3337 
3339  // Create the solution fields filled with zero
3340  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3341 
3342  // now check if we need to invalidate the solutionResident vectors
 // (precision mismatch means the cached fields cannot be reused)
3343  bool invalidate = false;
3344  for (auto v : solutionResident)
3345  if (cudaParam.precision != v->Precision()) { invalidate = true; break; }
3346 
3347  if (invalidate) {
3348  for (auto v : solutionResident) delete v;
3349  solutionResident.clear();
3350  }
3351 
3352  // grow resident solutions to be big enough
3353  for (int i=solutionResident.size(); i < param->num_offset; i++) {
3354  solutionResident.push_back(new cudaColorSpinorField(cudaParam));
3355  }
3356  for (int i=0; i < param->num_offset; i++) x[i] = solutionResident[i];
3357 
3359 
3360 
3362 
3363  // Check source norms
3364  double nb = blas::norm2(*b);
3365  if (nb==0.0) errorQuda("Source has zero norm");
3366 
3367  if(getVerbosity() >= QUDA_VERBOSE ) {
3368  double nh_b = blas::norm2(*h_b);
3369  printfQuda("Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3370  }
3371 
3372  // rescale the source vector to help prevent the onset of underflow
3373  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) {
3374  blas::ax(1.0/sqrt(nb), *b);
3375  }
3376 
 // Apply mass/normalization rescaling conventions to the source.
3377  massRescale(*b, *param);
3379 
3380  // use multi-shift CG
3381  {
3382  DiracMdagM m(dirac), mSloppy(diracSloppy);
3383  SolverParam solverParam(*param);
3384  MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
3385  cg_m(x, *b);
3386  solverParam.updateInvertParam(*param);
3387  }
3388 
3389  if (param->compute_true_res) {
3390  // check each shift has the desired tolerance and use sequential CG to refine
3392  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
3393  cudaColorSpinorField r(*b, cudaParam);
3395 
 // Refinement order: increasing mass (small->large shift) when defined.
3396 #define REFINE_INCREASING_MASS
3397 #ifdef REFINE_INCREASING_MASS
3398  for(int i=0; i < param->num_offset; i++) {
3399 #else
3400  for(int i=param->num_offset-1; i >= 0; i--) {
3401 #endif
3402  double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3403  param->true_res_hq_offset[i] : 0;
3404  double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ?
3405  param->tol_hq_offset[i] : 0;
3406 
3407  /*
3408  In the case where the shifted systems have zero tolerance
3409  specified, we refine these systems until either the limit of
3410  precision is reached (prec_tol) or until the tolerance reaches
3411  the iterated residual tolerance of the previous multi-shift
3412  solver (iter_res_offset[i]), which ever is greater.
3413  */
3414  const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+2));
3415  const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1));
3416  const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
3417  // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
3418  if ((param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq)) {
3419  if (getVerbosity() >= QUDA_SUMMARIZE)
3420  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3421  i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
3422 
3423  // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
3424  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3425  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3426  dirac.setMass(sqrt(param->offset[i]/4));
3427  diracSloppy.setMass(sqrt(param->offset[i]/4));
3428  }
3429 
3430  DiracMdagM m(dirac), mSloppy(diracSloppy);
3431 
3432  // need to curry in the shift if we are not doing staggered
3433  if (param->dslash_type != QUDA_ASQTAD_DSLASH &&
3434  param->dslash_type != QUDA_STAGGERED_DSLASH) {
3435  m.shift = param->offset[i];
3436  mSloppy.shift = param->offset[i];
3437  }
3438 
 // Disabled experimental code path (kept for reference).
3439  if (0) { // experimenting with Minimum residual extrapolation
3440  // only perform MRE using current and previously refined solutions
3441 #ifdef REFINE_INCREASING_MASS
3442  const int nRefine = i+1;
3443 #else
3444  const int nRefine = param->num_offset - i + 1;
3445 #endif
3446 
3447  std::vector<ColorSpinorField*> q;
3448  q.resize(nRefine);
3449  std::vector<ColorSpinorField*> z;
3450  z.resize(nRefine);
3451  cudaParam.create = QUDA_NULL_FIELD_CREATE;
3452  cudaColorSpinorField tmp(cudaParam);
3453 
3454  for(int j=0; j < nRefine; j++) {
3455  q[j] = new cudaColorSpinorField(cudaParam);
3456  z[j] = new cudaColorSpinorField(cudaParam);
3457  }
3458 
3459  *z[0] = *x[0]; // zero solution already solved
3460 #ifdef REFINE_INCREASING_MASS
3461  for (int j=1; j<nRefine; j++) *z[j] = *x[j];
3462 #else
3463  for (int j=1; j<nRefine; j++) *z[j] = *x[param->num_offset-j];
3464 #endif
3465 
3466  bool orthogonal = true;
3467  bool apply_mat = true;
3468  MinResExt mre(m, orthogonal, apply_mat, profileMulti);
3469  blas::copy(tmp, *b);
3470  mre(*x[i], tmp, z, q);
3471 
3472  for(int j=0; j < nRefine; j++) {
3473  delete q[j];
3474  delete z[j];
3475  }
3476  }
3477 
 // Sequential CG refinement of this single shift.
3478  SolverParam solverParam(*param);
3479  solverParam.iter = 0;
3481  solverParam.tol = (param->tol_offset[i] > 0.0 ? param->tol_offset[i] : iter_tol); // set L2 tolerance
3482  solverParam.tol_hq = param->tol_hq_offset[i]; // set heavy quark tolerance
3483 
3484  CG cg(m, mSloppy, solverParam, profileMulti);
3485  cg(*x[i], *b);
3486 
3487  solverParam.true_res_offset[i] = solverParam.true_res;
3488  solverParam.true_res_hq_offset[i] = solverParam.true_res_hq;
3489  solverParam.updateInvertParam(*param,i);
3490 
3491  if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
3492  param->dslash_type == QUDA_STAGGERED_DSLASH) {
3493  dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case
3494  diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
3495  }
3496 
3497  }
3498  }
3499  }
3500 
3501  // restore shifts -- avoid side effects
 // NOTE(review): unscaled_shifts is captured from param->offset in a line omitted
 // from this listing (before massRescale modifies the shifts).
3502  for(int i=0; i < param->num_offset; i++) {
3503  param->offset[i] = unscaled_shifts[i];
3504  }
3505 
3506  profileMulti.TPSTART(QUDA_PROFILE_D2H);
3507 
 // Optionally compute the fermionic action contribution sum_i residue[i] <b, x[i]>.
3508  if (param->compute_action) {
3509  Complex action(0);
3510  for (int i=0; i<param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(*b, *x[i]);
3511  param->action[0] = action.real();
3512  param->action[1] = action.imag();
3513  }
3514 
3515  for(int i=0; i < param->num_offset; i++) {
3516  if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution
3517  blas::ax(sqrt(nb), *x[i]);
3518  }
3519 
3520  if (getVerbosity() >= QUDA_VERBOSE){
3521  double nx = blas::norm2(*x[i]);
3522  printfQuda("Solution %d = %g\n", i, nx);
3523  }
3524 
 // Upload to host unless the solutions are kept resident on the device.
3525  if (!param->make_resident_solution) *h_x[i] = *x[i];
3526  }
3528 
3530 
3531  if (!param->make_resident_solution) {
3532  for (auto v: solutionResident) if (v) delete v;
3533  solutionResident.clear();
3534  }
3535 
3537 
 // Clean up host wrappers and device fields.
3539  for(int i=0; i < param->num_offset; i++){
3540  delete h_x[i];
3541  //if (!param->make_resident_solution) delete x[i];
3542  }
3543 
3544  delete h_b;
3545  delete b;
3546 
3547  delete [] hp_x;
3548 
3549  delete d;
3550  delete dSloppy;
3551  delete dPre;
3553 
3554  popVerbosity();
3555 
3556  // cache is written out even if a long benchmarking job gets interrupted
3557  saveTuneCache();
3558 
3560 }
3561 
// Compute HISQ/asqtad fat links (and optionally long links and unitarized links)
// from the input site links on the GPU, returning the results to host memory.
// NOTE(review): this listing omits several original lines (gParam construction,
// the setUnitarizeLinksConstants(...) call, device fat-link creation, profiling
// calls), which the code below relies on.
3562 void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, double *path_coeff, QudaGaugeParam *param) {
3563 
3564 #ifdef GPU_FATLINK
3567 
3568  checkGaugeParam(param);
3569 
 // Unitarization tolerances (consumed by an omitted setUnitarizeLinksConstants call).
3570  if (ulink) {
3571  const double unitarize_eps = 1e-14;
3572  const double max_error = 1e-10;
3573  const int reunit_allow_svd = 1;
3574  const int reunit_svd_only = 0;
3575  const double svd_rel_error = 1e-6;
3576  const double svd_abs_error = 1e-6;
3579  }
3580 
 // Wrap the host-side output/input buffers as cpuGaugeFields.
3582  cpuGaugeField cpuFatLink(gParam); // create the host fatlink
3583  gParam.gauge = longlink;
3584  cpuGaugeField cpuLongLink(gParam); // create the host longlink
3585  gParam.gauge = ulink;
3586  cpuGaugeField cpuUnitarizedLink(gParam);
3588  gParam.gauge = inlink;
3589  cpuGaugeField cpuInLink(gParam); // create the host sitelink
3590 
3591  // create the device fields
3595  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
3596 
3598 
3600  cudaInLink->loadCPUField(cpuInLink);
3602 
 // Extend the input links with a halo region R for multi-GPU fattening.
3603  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
3604 
3606  delete cudaInLink;
3608 
 // Only allocate the optional outputs that were actually requested.
3615  cudaGaugeField *cudaUnitarizedLink = ulink ? new cudaGaugeField(gParam) : nullptr;
3616  cudaGaugeField *cudaLongLink = longlink ? new cudaGaugeField(gParam) : nullptr;
3617 
3619  fatLongKSLink(cudaFatLink, cudaLongLink, *cudaInLinkEx, path_coeff);
3621 
3622  if (ulink) {
 // num_failures_h/_d are host/device counters for links that failed to unitarize.
3624  *num_failures_h = 0;
3625  quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
3626  if (*num_failures_h>0) errorQuda("Error in unitarization component of the hisq fattening: %d failures\n", *num_failures_h);
3628  }
3629 
 // Copy results back to the host buffers.
3631  if (ulink) cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink);
3633  if (longlink) cudaLongLink->saveCPUField(cpuLongLink);
3635 
3637  delete cudaFatLink;
3638  if (longlink) delete cudaLongLink;
3639  if (ulink) delete cudaUnitarizedLink;
3640  delete cudaInLinkEx;
3642 
3644 #else
3645  errorQuda("Fat-link has not been built");
3646 #endif // GPU_FATLINK
3647 
3648  return;
3649 }
3650 
 // NOTE(review): the enclosing function signature (original line 3651) is omitted
 // from this listing; from the body it computes the gauge-field pad as the largest
 // checkerboard face size of the local lattice (0 for single-GPU builds).
3652  int pad = 0;
3653 #ifdef MULTI_GPU
3654  int volume = param.x[0]*param.x[1]*param.x[2]*param.x[3];
3655  int face_size[4];
 // Face size in direction dir is (volume / extent in dir) / 2 (checkerboarded).
3656  for(int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.x[dir])/2;
3657  pad = *std::max_element(face_size, face_size+4);
3658 #endif
3659 
3660  return pad;
3661 }
3662 
// Compute the gauge force contribution to the momentum field from a set of Wilson
// loops (input_path_buf) with coefficients loop_coeff, scaled by eb3.
// NOTE(review): this listing omits several original lines, including the opening
// `if (use_resident_gauge/mom)` conditions whose closing braces are visible below,
// the gParam/gParamMom declarations, and profiling calls.
3663 int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length,
3664  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
3665 {
3666 #ifdef GPU_GAUGE_FORCE
3669 
3670  checkGaugeParam(qudaGaugeParam);
3671 
 // Host site-link wrapper is only needed when not using the resident gauge field.
3675  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : NULL;
3676 
3677  cudaGaugeField* cudaSiteLink = NULL;
3678 
 // (omitted `if (use_resident_gauge) {` opener) -- reuse the resident device gauge
3680  if (!gaugePrecise) errorQuda("No resident gauge field to use");
3681  cudaSiteLink = gaugePrecise;
3683  } else {
3689 
3690  cudaSiteLink = new cudaGaugeField(gParam);
3692 
3694  cudaSiteLink->loadCPUField(*cpuSiteLink);
3696 
3698  }
3699 
3701  // FIXME - test program always uses MILC for mom but can use QDP for gauge
3702  if (gParamMom.order == QUDA_QDP_GAUGE_ORDER) gParamMom.order = QUDA_MILC_GAUGE_ORDER;
 // TIFR orders store the full momentum matrix; others use 10-real compression.
3703  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
3704  else gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3705 
3706  gParamMom.site_offset = qudaGaugeParam->mom_offset;
3707  gParamMom.site_size = qudaGaugeParam->site_size;
3708  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : NULL;
3709 
3710  cudaGaugeField* cudaMom = NULL;
 // (omitted `if (use_resident_mom) {` opener) -- reuse the resident momentum
3712  if (!momResident) errorQuda("No resident momentum field to use");
3713  cudaMom = momResident;
3716  } else {
3718  gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
3719  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
3720  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
3721  gParamMom.precision = qudaGaugeParam->cuda_prec;
3722  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
3723  cudaMom = new cudaGaugeField(gParamMom);
 // When accumulating (not overwriting), the existing host momentum is loaded
 // in lines omitted from this listing.
3725  if (!qudaGaugeParam->overwrite_mom) {
3729  }
3730  }
3731 
3733 
3734  // actually do the computation
3736  gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
3738 
3743  }
3744 
 // (omitted `if (make_resident_gauge) {` opener) -- hand ownership to gaugePrecise
3747  if (gaugePrecise && gaugePrecise != cudaSiteLink) delete gaugePrecise;
3748  gaugePrecise = cudaSiteLink;
3749  } else {
3750  delete cudaSiteLink;
3751  }
3752 
 // (omitted `if (make_resident_mom) {` opener) -- hand ownership to momResident
3754  if (momResident && momResident != cudaMom) delete momResident;
3755  momResident = cudaMom;
3756  } else {
3757  delete cudaMom;
3758  }
3759 
3760  if (cpuSiteLink) delete cpuSiteLink;
3761  if (cpuMom) delete cpuMom;
3762 
3766  } else {
3767  delete cudaGauge;
3768  }
3770 
3772 
3773  checkCudaError();
3774 #else
3775  errorQuda("Gauge force has not been built");
3776 #endif // GPU_GAUGE_FORCE
3777  return 0;
3778 }
3779 
// NOTE(review): the function signature (original line 3780) is omitted from this
// listing; the body computes the field-strength tensor Fmunu from an extended
// gauge field (presumably as part of clover-term construction -- confirm against
// the full source).
3781 {
3783  if (!cloverPrecise) errorQuda("Clover field not allocated");
3784 
3786  // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
3787  int R[4];
 // Halo depth: 2 in x, 1 in y/z/t, and only where communication is required.
3788  for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
3790 
3792  // create the Fmunu field
 // NOTE(review): tensorParam construction is in an omitted line.
3794  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
3795  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
3796  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
3797  cudaGaugeField Fmunu(tensorParam);
3799 
3801  computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION);
3804 
3806 
3807  // FIXME always preserve the extended gauge
3808  extendedGaugeResident = gauge;
3809 
3810  return;
3811 }
3812 
// Allocate a device gauge field of the given geometry and, if a host pointer is
// supplied, initialize it from the host data; returns an opaque device-field handle
// (to be released via destroyGaugeFieldQuda).
// NOTE(review): the gParam declaration and the device-field creation/load lines
// are omitted from this listing.
3813 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
3814 {
3816  gParam.geometry = static_cast<QudaFieldGeometry>(geometry);
3817  if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
3818  errorQuda("Only scalar and vector geometries are supported\n");
3819 
 // Wrap the host data only when a host pointer was provided.
3820  cpuGaugeField *cpuGauge = nullptr;
3821  if (gauge) cpuGauge = new cpuGaugeField(gParam);
3822 
3826 
3827  if (gauge) {
 // (device field load from cpuGauge happens in an omitted line)
3829  delete cpuGauge;
3830  }
3831 
3832  return cudaGauge;
3833 }
3834 
3835 
// Copy a device gauge field (handle created by createGaugeFieldQuda) back into a
// host buffer. NOTE(review): the cpuGaugeField wrapper construction and the
// saveCPUField call are in lines omitted from this listing.
3836 void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
3837 
3838  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
3839 
3842 
3845 
3846 }
3847 
3848 
3849 void destroyGaugeFieldQuda(void* gauge){
3850  cudaGaugeField* g = reinterpret_cast<cudaGaugeField*>(gauge);
3851  delete g;
3852 }
3853 
3854 
// Compute the staggered fermion force contribution and accumulate it into the
// momentum field: mom += delta * [U * force]TA, using the resident solutions as
// the quark fields. NOTE(review): the trailing parameters of the signature and
// many body lines (gParam/cudaMom declarations, profiling, momentum download,
// the resident-solution check opener) are omitted from this listing.
3855 void computeStaggeredForceQuda(void* h_mom, double dt, double delta, void *h_force, void **x,
3857 {
3860 
3862 
3863  // create the host momentum field
3867 
3868  // create the host momentum field
 // NOTE(review): comment above appears stale in the original -- this wraps the
 // host *force* buffer, not the momentum.
3870  gParam.gauge = h_force;
3872 
3873  // create the device momentum field
3879 
3880  // create temporary field for quark-field outer product
3885  GaugeField *cudaForce_[2] = {&cudaForce};
3886 
 // Describe the device quark fields (staggered: 1 spin component, 3 colors).
3887  ColorSpinorParam qParam;
3889  qParam.nColor = 3;
3890  qParam.nSpin = 1;
3893  qParam.nDim = 5; // 5 since staggered mrhs
3894  qParam.precision = gParam.precision;
3895  qParam.pad = 0;
3896  for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir];
3897  qParam.x[4] = 1;
3898  qParam.create = QUDA_NULL_FIELD_CREATE;
3901 
3904 
 // (omitted `if (use_resident_mom) {` opener)
3906  if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
3907  cudaMom = momResident;
3908  } else {
3909  // download the initial momentum (FIXME make an option just to return?)
3911  }
3912 
3913  // resident gauge field is required
3915  errorQuda("Resident gauge field is required");
3916 
3919 
 // One quark field per shift/offset of the preceding multi-shift solve.
3920  const int nvector = inv_param->num_offset;
3921  std::vector<ColorSpinorField*> X(nvector);
3922  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
3923 
3925  if (solutionResident.size() < (unsigned int)nvector)
3926  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
3927  solutionResident.size(), nvector);
3928  }
3929 
3930  // create the staggered operator
3931  DiracParam diracParam;
3933  Dirac *dirac = Dirac::create(diracParam);
3934 
3937 
 // Reconstruct the full solution: even parity from the resident solve, odd
 // parity via one application of Dslash.
3938  for (int i=0; i<nvector; i++) {
3939  ColorSpinorField &x = *(X[i]);
3940 
3941  if (inv_param->use_resident_solution) x.Even() = *(solutionResident[i]);
3942  else errorQuda("%s requires resident solution", __func__);
3943 
3944  // set the odd solution component
3945  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
3946  }
3947 
3950 
3951 #if 0
3953  for (auto v : solutionResident) if (v) delete solutionResident[i];
3954  solutionResident.clear();
3955  }
3956 #endif
3957  delete dirac;
3958 
3961 
3962  // compute quark-field outer product
3963  for (int i=0; i<nvector; i++) {
3964  ColorSpinorField &x = *(X[i]);
3965  // second component is zero since we have no three hop term
3966  double coeff[2] = {dt * inv_param->residue[i], 0.0};
3967 
3968  // Operate on even-parity sites
3969  computeStaggeredOprod(cudaForce_, x, coeff, 1);
3970  }
3971 
3972  // mom += delta * [U * force]TA
3976 
3979 
3981  // copy the momentum field back to the host
3983  }
3984 
3986  // make the momentum field resident
3987  momResident = cudaMom;
3988  } else {
3989  delete cudaMom;
3990  }
3991 
3994 
3995  for (int i=0; i<nvector; i++) delete X[i];
3996 
3999 
4000  checkCudaError();
4001  return;
4002 }
4003 
// Compute the full HISQ fermion force: staggered outer products from the quark
// fields, level-2 staple force on the w-links, unitarization derivative through
// the v-links, fat7 staple force on the u-links, and final momentum completion.
// NOTE(review): this listing omits many original lines (profiling calls, several
// field constructions/loads, the momentum accumulation into momResident), which
// the surviving code depends on.
4004 void computeHISQForceQuda(void* const milc_momentum,
4005  long long *flops,
4006  const double level2_coeff[6],
4007  const double fat7_coeff[6],
4008  const void* const w_link,
4009  const void* const v_link,
4010  const void* const u_link,
4011  void **fermion,
4012  int num_terms,
4013  int num_naik_terms,
4014  double **coeff,
4016 {
4017 #ifdef GPU_STAGGERED_OPROD
4018  using namespace quda;
4019  using namespace quda::fermion_force;
4021  if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
4022 
4023  checkGaugeParam(gParam);
4024 
4026 
4027  // create the device outer-product field
4029  oParam.nFace = 0;
4030  oParam.create = QUDA_ZERO_FIELD_CREATE;
4031  oParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4032  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
4033  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
4034  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
4035 
4036  {
4037  // default settings for the unitarization
4038  const double unitarize_eps = 1e-14;
4039  const double hisq_force_filter = 5e-5;
4040  const double max_det_error = 1e-10;
4041  const bool allow_svd = true;
4042  const bool svd_only = false;
4043  const double svd_rel_err = 1e-8;
4044  const double svd_abs_err = 1e-8;
4045 
4046  setUnitarizeForceConstants(unitarize_eps, hisq_force_filter, max_det_error, allow_svd, svd_only, svd_rel_err, svd_abs_err);
4047  }
4048 
 // One-link coefficient is pre-absorbed; positions 0/1 are fixed to 0/1.
4049  double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4050  // You have to look at the MILC routine to understand the following
4051  // Basically, I have already absorbed the one-link coefficient
4052 
4054  //param.nFace = 0;
4055  param.order = QUDA_MILC_GAUGE_ORDER;
4057  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4058  cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : NULL;
4059 
 // Wrap the three host link fields (w/v/u levels of the HISQ smearing).
4060  param.link_type = QUDA_GENERAL_LINKS;
4062  param.gauge = (void*)w_link;
4063  cpuGaugeField cpuWLink(param);
4064  param.gauge = (void*)v_link;
4065  cpuGaugeField cpuVLink(param);
4066  param.gauge = (void*)u_link;
4068 
4069  param.create = QUDA_ZERO_FIELD_CREATE;
4071  param.link_type = QUDA_ASQTAD_MOM_LINKS;
4073  GaugeFieldParam momParam(param);
4074 
4075  param.create = QUDA_ZERO_FIELD_CREATE;
4076  param.link_type = QUDA_GENERAL_LINKS;
4077  param.precision = gParam->cpu_prec;
4079 
 // Extend the local volume by the halo R in every direction.
4081  for (int dir=0; dir<4; ++dir) {
4082  param.x[dir] += 2*R[dir];
4083  param.r[dir] = R[dir];
4084  }
4085 
4087  param.create = QUDA_ZERO_FIELD_CREATE;
4088  param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
4089 
4091 
4092  { // do outer-product computation
4093  ColorSpinorParam qParam;
4094  qParam.nColor = 3;
4095  qParam.nSpin = 1;
4098  qParam.nDim = 4;
4099  qParam.precision = oParam.precision;
4100  qParam.pad = 0;
4101  for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir];
4102 
4103  // create the device quark field
4104  qParam.create = QUDA_NULL_FIELD_CREATE;
4106  cudaColorSpinorField cudaQuark(qParam);
4107 
4108  // create the host quark field
4111  qParam.v = fermion[0];
4112 
4113  { // regular terms
4114  GaugeField *oprod[2] = {stapleOprod, naikOprod};
4115 
4116  // loop over different quark fields
4117  for(int i=0; i<num_terms; ++i){
4118 
4119  // Wrap the MILC quark field
4121  qParam.v = fermion[i];
4122  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4124 
4126  cudaQuark = cpuQuark;
4128 
 // Accumulate 1-link and 3-link (naik) outer products for this term.
4130  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4132  }
4133  }
4134 
4135  { // naik terms
 // The one-link oprod starts as a scaled copy of the staple oprod.
4136  oneLinkOprod->copy(*stapleOprod);
4137  ax(level2_coeff[0], *oneLinkOprod);
4138  GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4139 
4140  // loop over different quark fields
4141  for(int i=0; i<num_naik_terms; ++i){
4142 
4143  // Wrap the MILC quark field
 // Naik terms use the trailing subset of the fermion array.
4145  qParam.v = fermion[i + num_terms - num_naik_terms];
4146  cpuColorSpinorField cpuQuark(qParam); // create host quark field
4148 
4150  cudaQuark = cpuQuark;
4152 
4154  computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
4156  }
4157  }
4158  }
4159 
 // Move outer products into extended fields; originals freed immediately.
4161  cudaGaugeField* cudaInForce = new cudaGaugeField(param);
4162  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
4163  delete stapleOprod;
4164 
4165  cudaGaugeField* cudaOutForce = new cudaGaugeField(param);
4166  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
4167  delete oneLinkOprod;
4168 
4171 
4173 
4174  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4176  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4177 
 // Level-2 staple force with respect to the w-links.
4179  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff, flops);
4181 
4182  // Load naik outer product
4183  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
4184  cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true);
4185  delete naikOprod;
4186 
4187  // Compute Naik three-link term
4189  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff[1], flops);
4191 
4192  cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true);
4193 
4194  // load v-link
4197 
 // Differentiate through the link unitarization (w = unitarize(v)).
4199  *num_failures_h = 0;
4200  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d, flops);
4202 
4203  if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4204 
 // NOTE(review): the (void**) cast is spurious -- cudaMemset takes void*.
4205  cudaMemset((void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4206 
4207  // read in u-link
4210 
4211  // Compute Fat7-staple term
4213  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, fat7_coeff, flops);
4215 
4216  delete cudaInForce;
4217  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
4218 
 // Close the force paths and project to the anti-hermitian momentum.
4220  hisqCompleteForce(*cudaMom, *cudaOutForce, *cudaGauge, flops);
4222 
4223  if (gParam->use_resident_mom) {
4224  if (!momResident) errorQuda("No resident momentum field to use");
 // (momentum accumulation into momResident is in an omitted line)
4226  }
4227 
 // NOTE(review): the nested `if (gParam->return_result_mom)` duplicates the
 // enclosing condition and is redundant.
4228  if (gParam->return_result_mom) {
4229  // Close the paths, make anti-hermitian, and store in compressed format
4230  if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
4231  }
4232 
4234 
4235  if (cpuMom) delete cpuMom;
4236 
 // NOTE(review): `delete momResident` is unconditional within this branch;
 // deleting a null pointer is safe, but resetting to nullptr afterwards is
 // what keeps the global handle coherent.
4237  if (!gParam->make_resident_mom) {
4238  delete momResident;
4239  momResident = nullptr;
4240  }
4241  if (cudaMom) delete cudaMom;
4242  delete cudaOutForce;
4243  delete cudaGauge;
4245 
4247 
4248  return;
4249 #else
4250  errorQuda("HISQ force has not been built");
4251 #endif
4252 }
4253 
// Compute the clover (Wilson-clover) fermion force: outer products of the quark
// solutions and their Mdag images, the clover sigma trace/oprod terms, and the
// clover derivative, accumulated into the host momentum field.
// NOTE(review): this listing omits many original lines (the trailing signature
// parameters, fParam declaration, gaugeEx/oprodEx construction, momentum
// save/update, profiling), which the surviving code depends on.
4254 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p,
4255  double *coeff, double kappa2, double ck,
4256  int nvector, double multiplicity, void *gauge,
4258 
4259 
4260  using namespace quda;
4263 
4264  checkGaugeParam(gauge_param);
4265  if (!gaugePrecise) errorQuda("No resident gauge field");
4266 
4268  // create the host momentum field
4270  fParam.order = gauge_param->gauge_order;
4271  cpuGaugeField cpuMom(fParam);
4272 
4273  // create the device momentum field
4274  fParam.create = QUDA_ZERO_FIELD_CREATE;
4275  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4276  cudaGaugeField cudaMom(fParam);
4277 
4278  // create the device force field
4279  fParam.link_type = QUDA_GENERAL_LINKS;
4280  fParam.create = QUDA_ZERO_FIELD_CREATE;
4281  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
4283  cudaGaugeField cudaForce(fParam);
4284 
 // Device quark fields: 4-spin, 3-color Wilson-type spinors.
4285  ColorSpinorParam qParam;
4287  qParam.nColor = 3;
4288  qParam.nSpin = 4;
4291  qParam.nDim = 4;
4292  qParam.precision = fParam.precision;
4293  qParam.pad = 0;
4294  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];
4295 
4296  // create the device quark field
4297  qParam.create = QUDA_NULL_FIELD_CREATE;
4300 
 // quarkX holds the solutions x_i, quarkP the corresponding p_i = M x_i images.
4301  std::vector<ColorSpinorField*> quarkX, quarkP;
4302  for (int i=0; i<nvector; i++) {
4303  quarkX.push_back(ColorSpinorField::Create(qParam));
4304  quarkP.push_back(ColorSpinorField::Create(qParam));
4305  }
4306 
 // Single-parity temporary for the Dirac operator application.
4308  qParam.x[0] /= 2;
4309  cudaColorSpinorField tmp(qParam);
4310 
4311  // create the host quark field
4314  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface
4315 
4316  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
4318  DiracParam diracParam;
4319  setDiracParam(diracParam, inv_param, pc_solve);
4320  diracParam.tmp1 = &tmp; // use as temporary for dirac->M
4321  Dirac *dirac = Dirac::create(diracParam);
4322 
4324  if (solutionResident.size() < (unsigned int)nvector)
4325  errorQuda("solutionResident.size() %lu does not match number of shifts %d",
4326  solutionResident.size(), nvector);
4327  }
4328 
4330 
4331  // create oprod and trace fields
4332  fParam.geometry = QUDA_TENSOR_GEOMETRY;
4333  cudaGaugeField oprod(fParam);
4334 
4337 
4338  std::vector<double> force_coeff(nvector);
4339  // loop over different quark fields
4340  for(int i=0; i<nvector; i++){
4341  ColorSpinorField &x = *(quarkX[i]);
4342  ColorSpinorField &p = *(quarkP[i]);
4343 
4345  // for downloading x_e
4347  qParam.x[0] /= 2;
4348 
4349  // Wrap the even-parity MILC quark field
 // (the branch on use_resident_solution opens in an omitted line)
4352  qParam.v = h_x[i];
4353  cpuColorSpinorField cpuQuarkX(qParam); // create host quark field
4355 
4357  x.Even() = cpuQuarkX;
4359 
4361  gamma5(x.Even(), x.Even());
4362  } else {
4363  x.Even() = *(solutionResident[i]);
4364  }
4365 
 // Reconstruct full-parity x and p = M x (plus odd-parity Dslash images).
4366  dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
4367  dirac->M(p.Even(), x.Even());
4369  dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);
4371 
4372  gamma5(x, x);
4373  gamma5(p, p);
4374 
4375  force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
4376  }
4377 
4378  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
4379 
4380  // In double precision the clover derivative is faster with no reconstruct
4381  cudaGaugeField *u = &gaugeEx;
4382  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
4383  GaugeFieldParam param(gaugeEx);
4385  u = new cudaGaugeField(param);
4386  u -> copy(gaugeEx);
4387  }
4388 
4389  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);
4390 
4391  /* Now the U dA/dU terms */
4392  std::vector< std::vector<double> > ferm_epsilon(nvector);
4393  for (int shift = 0; shift < nvector; shift++) {
 // NOTE(review): BUG -- reserve(2) does not change size(), so the two
 // operator[] writes below index past the end (undefined behavior).
 // This should be resize(2) (or push_back); flagging rather than fixing
 // since this listing is incomplete.
4394  ferm_epsilon[shift].reserve(2);
4395  ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
4396  ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
4397  }
4398 
4399  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
4400 
4402 
4404 
 // Clover derivative applied separately on the two parities.
4405  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
4406  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
4407 
4408  if (u != &gaugeEx) delete u;
4409 
4412 
4413  // copy the outer product field back to the host
4417 
4419 
4420  for (int i=0; i<nvector; i++) {
4421  delete quarkX[i];
4422  delete quarkP[i];
4423  }
4424 
4425 #if 0
4427  for (auto v : solutionResident) if (v) delete v;
4428  solutionResident.clear();
4429  }
4430 #endif
4431  delete dirac;
4433 
4434  checkCudaError();
4436  return;
4437 }
4438 
4439 
4440 
// Evolve the gauge field by the momentum: U <- exp(i dt mom) U (optionally with
// conjugated momentum and/or exact exponential), honoring the resident-field
// options in the param struct. NOTE(review): several original lines are omitted
// from this listing (trailing signature parameter, gParam/cudaMom declarations,
// momentum load, profiling calls).
4441 void updateGaugeFieldQuda(void* gauge,
4442  void* momentum,
4443  double dt,
4444  int conj_mom,
4445  int exact,
4447 {
4449 
4450  checkGaugeParam(param);
4451 
4453 
4454  // create the host fields
 // Host wrappers are only needed when we must read from or write back to host.
4458  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4459  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4460 
4461  GaugeFieldParam gParamMom(momentum, *param);
 // TIFR orders store the full matrix; others use the 10-real compression.
4462  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
4464  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
4465  gParamMom.site_offset = param->mom_offset;
4466  gParamMom.site_size = param->site_size;
4467  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : NULL;
4468 
4469  // create the device fields
4475  gParam.pad = 0;
4477 
4480  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : NULL;
4481  cudaGaugeField *cudaOutGauge = new cudaGaugeField(gParam);
4482 
4484 
4486 
4487  if (!param->use_resident_gauge) { // load fields onto the device
4488  cudaInGauge->loadCPUField(*cpuGauge);
4489  } else { // or use resident fields already present
4490  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
 // Take ownership of the resident gauge; global handle is re-established below.
4491  cudaInGauge = gaugePrecise;
4492  gaugePrecise = NULL;
4493  }
4494 
4495  if (!param->use_resident_mom) {
 // (momentum download from cpuMom happens in an omitted line)
4497  } else {
4498  if (!momResident) errorQuda("No resident mom field allocated");
4499  cudaMom = momResident;
4500  momResident = NULL;
4501  }
4502 
4504 
4505  // perform the update
4507  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
4508  (bool)conj_mom, (bool)exact);
4510 
4511  if (param->return_result_gauge) {
4512  // copy the gauge field back to the host
4514  cudaOutGauge->saveCPUField(*cpuGauge);
4516  }
4517 
 // Hand the updated gauge / the momentum back to the resident globals, or free.
4519  if (param->make_resident_gauge) {
4520  if (gaugePrecise != NULL) delete gaugePrecise;
4521  gaugePrecise = cudaOutGauge;
4522  } else {
4523  delete cudaOutGauge;
4524  }
4525 
4526  if (param->make_resident_mom) {
4527  if (momResident != NULL && momResident != cudaMom) delete momResident;
4528  momResident = cudaMom;
4529  } else {
4530  delete cudaMom;
4531  }
4532 
4533  delete cudaInGauge;
4534  if (cpuMom) delete cpuMom;
4535  if (cpuGauge) delete cpuGauge;
4537 
4538  checkCudaError();
4539 
4541  return;
4542 }
4543 
// Re-unitarize the (host or resident) gauge field, projecting each link back
// onto SU(3) to within tolerance tol; aborts via errorQuda if any site fails.
// NOTE(review): extraction dropped the gParam/cudaGauge construction and the
// projectSU3() call lines; surviving tokens kept verbatim.
4544  void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
4546 
4548  checkGaugeParam(param);
4549 
4550  // create the gauge field
4554  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4555  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4556 
4557  // create the device fields
4563 
4564  if (param->use_resident_gauge) {
4565  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4567  } else {
4571  }
4572 
// num_failures_h is a host-visible failure counter written by the projection kernel
4574  *num_failures_h = 0;
4575 
4576  // project onto SU(3)
4578 
4580 
4581  if(*num_failures_h>0)
4582  errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
4583 
4587 
4588  if (param->make_resident_gauge) {
4589  if (gaugePrecise != NULL && cudaGauge != gaugePrecise) delete gaugePrecise;
4591  } else {
4592  delete cudaGauge;
4593  }
4594 
4596  if (cpuGauge) delete cpuGauge;
4598 
4600  }
4601 
// Apply or remove the staggered phase factors on the (host or resident)
// gauge field.  NOTE(review): extraction dropped the gParam construction,
// the applyStaggeredPhase/removeStaggeredPhase dispatch and several profile
// lines; surviving tokens kept verbatim.
4602  void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
4604 
4606  checkGaugeParam(param);
4607 
4608  // create the gauge field
4610  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4611  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
4612 
4613  // create the device fields
4619 
4620  if (param->use_resident_gauge) {
4621  if (!gaugePrecise) errorQuda("No resident gauge field to use");
4623  } else {
4624  profilePhase.TPSTART(QUDA_PROFILE_H2D);
4627  }
4628 
4630  *num_failures_h = 0;
4631 
4632  // apply / remove phase as appropriate
4635 
4637 
4638  profilePhase.TPSTART(QUDA_PROFILE_D2H);
4641 
4642  if (param->make_resident_gauge) {
4643  if (gaugePrecise != NULL && cudaGauge != gaugePrecise) delete gaugePrecise;
4645  } else {
4646  delete cudaGauge;
4647  }
4648 
4650  if (cpuGauge) delete cpuGauge;
4652 
4654  }
4655 
4656 // evaluate the momentum action
// Evaluate the momentum (kinetic) contribution to the HMC action from the
// supplied (or resident) momentum field; returns the summed action value.
// NOTE(review): extraction dropped the cpuMom/cudaMom field construction and
// the H2D upload branch; surviving tokens kept verbatim.
4657 double momActionQuda(void* momentum, QudaGaugeParam* param)
4658 {
4660 
4662  checkGaugeParam(param);
4663 
4664  // create the momentum fields
4668 
4670 
4671  // create the device fields
4675 
4677 
4679 
4681  if (!param->use_resident_mom) {
4683  } else {
4684  if (!momResident) errorQuda("No resident mom field allocated");
4685  cudaMom = momResident;
4686  }
4688 
4689  // perform the update
4691  double action = computeMomAction(*cudaMom);
4693 
// keep or drop the device momentum depending on make_resident_mom
4695  if (param->make_resident_mom) {
4696  if (momResident != NULL && momResident != cudaMom) delete momResident;
4697  momResident = cudaMom;
4698  } else {
4699  delete cudaMom;
4700  momResident = NULL;
4701  }
4702  if (cpuMom) {
4703  delete cpuMom;
4704  }
4705 
4707 
4708  checkCudaError();
4709 
4711  return action;
4712 }
4713 
4714 /*
4715  The following functions are for the Fortran interface.
4716 */
4717 
// Fortran interface shims: each trailing-underscore function simply
// dereferences its pointer arguments (Fortran passes by reference) and
// delegates to the corresponding C interface entry point.
4718 void init_quda_(int *dev) { initQuda(*dev); }
4719 void init_quda_device_(int *dev) { initQudaDevice(*dev); }
4721 void end_quda_() { endQuda(); }
4722 void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param) { loadGaugeQuda(h_gauge, param); }
4725 void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
4726 { loadCloverQuda(h_clover, h_clovinv, inv_param); }
4728 void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
4729  QudaParity *parity) { dslashQuda(h_out, h_in, inv_param, *parity); }
4730 void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param,
4731  QudaParity *parity, int *inverse) { cloverQuda(h_out, h_in, inv_param, *parity, *inverse); }
4732 void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
4733 { MatQuda(h_out, h_in, inv_param); }
4734 void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
4735 { MatDagMatQuda(h_out, h_in, inv_param); }
4736 void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param) {
4737  fflush(stdout);
4738  // ensure that fifth dimension is set to 1
4739  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
4740  invertQuda(hp_x, hp_b, param);
4741  fflush(stdout);
4742 }
4743 
4744 void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param) {
4745  // ensure that fifth dimension is set to 1
4746  if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) param->Ls = 1;
4747 
4748  if (!gaugePrecise) errorQuda("Resident gauge field not allocated");
4749 
4750  // get data into array of pointers
4751  int nSpin = (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH) ? 1 : 4;
4752 
4753  // compute offset assuming TIFR padded ordering (FIXME)
4754  if (param->dirac_order != QUDA_TIFR_PADDED_DIRAC_ORDER)
4755  errorQuda("Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");
4756 
4757  const int *X = gaugePrecise->X();
4758  size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->Ncolor() * nSpin * 2 * param->cpu_prec;
4759  void *hp_x[QUDA_MAX_MULTI_SHIFT];
4760  for (int i=0; i<param->num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
4761 
4762  invertMultiShiftQuda(hp_x, hp_b, param);
4763 }
4764 
4766 
// Fortran shim: page-lock (pin) an existing host allocation of *bytes bytes
// so subsequent device transfers from it can run asynchronously/faster.
4767 void register_pinned_quda_(void *ptr, size_t *bytes) {
4768  cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
4769  checkCudaError();
4770 }
4771 
4773  cudaHostUnregister(ptr);
4774  checkCudaError();
4775 }
4776 
4778  *param = newQudaGaugeParam();
4779 }
4782 }
4783 
4784 void update_gauge_field_quda_(void *gauge, void *momentum, double *dt,
4785  bool *conj_mom, bool *exact,
4786  QudaGaugeParam *param) {
4787  updateGaugeFieldQuda(gauge, momentum, *dt, (int)*conj_mom, (int)*exact, param);
4788 }
4789 
// Map a direction index to its "opposite" (backward) link encoding.
static inline int opp(int dir) { return 7-dir; }

// Fill paths[] with the staple paths entering the gauge-force computation for
// link direction dir: 6 length-3 plaquette staples, then (num_loop_types>=2)
// 18 length-5 rectangle staples, then (num_loop_types>=3) 24 length-5 bent
// staples.  Each paths[k] must already point at storage of the path's length.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types){

  int index = 0;
  // emit a length-3 path
  auto put3 = [&](int a, int b, int c) {
    paths[index][0] = a; paths[index][1] = b; paths[index][2] = c;
    ++index;
  };
  // emit a length-5 path
  auto put5 = [&](int a, int b, int c, int d, int e) {
    paths[index][0] = a; paths[index][1] = b; paths[index][2] = c;
    paths[index][3] = d; paths[index][4] = e;
    ++index;
  };

  // Plaquette paths
  if (num_loop_types >= 1) {
    for (int i = 0; i < 4; ++i) {
      if (i == dir) continue;
      put3(i, opp(dir), opp(i));
      put3(opp(i), opp(dir), i);
    }
  }

  // Rectangle paths
  if (num_loop_types >= 2) {
    for (int i = 0; i < 4; ++i) {
      if (i == dir) continue;
      put5(i, i, opp(dir), opp(i), opp(i));
      put5(opp(i), opp(i), opp(dir), i, i);
      put5(dir, i, opp(dir), opp(dir), opp(i));
      put5(dir, opp(i), opp(dir), opp(dir), i);
      put5(i, opp(dir), opp(dir), opp(i), dir);
      put5(opp(i), opp(dir), opp(dir), i, dir);
    }
  }

  // Staple (bent rectangle) paths
  if (num_loop_types >= 3) {
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        if (i == dir || j == dir || i == j) continue;
        put5(i, j, opp(dir), opp(i), opp(j));
        put5(i, opp(j), opp(dir), opp(i), j);
        put5(opp(i), j, opp(dir), i, opp(j));
        put5(opp(i), opp(j), opp(dir), i, j);
      }
    }
  }
}
4839 
4840 void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt,
4841  QudaGaugeParam *param) {
4842 
4843  int numPaths = 0;
4844  switch (*num_loop_types) {
4845  case 1:
4846  numPaths = 6;
4847  break;
4848  case 2:
4849  numPaths = 24;
4850  break;
4851  case 3:
4852  numPaths = 48;
4853  break;
4854  default:
4855  errorQuda("Invalid num_loop_types = %d\n", *num_loop_types);
4856  }
4857 
4858  double *loop_coeff = static_cast<double*>(safe_malloc(numPaths*sizeof(double)));
4859  int *path_length = static_cast<int*>(safe_malloc(numPaths*sizeof(int)));
4860 
4861  if (*num_loop_types >= 1) for(int i= 0; i< 6; ++i) {
4862  loop_coeff[i] = coeff[0];
4863  path_length[i] = 3;
4864  }
4865  if (*num_loop_types >= 2) for(int i= 6; i<24; ++i) {
4866  loop_coeff[i] = coeff[1];
4867  path_length[i] = 5;
4868  }
4869  if (*num_loop_types >= 3) for(int i=24; i<48; ++i) {
4870  loop_coeff[i] = coeff[2];
4871  path_length[i] = 5;
4872  }
4873 
4874  int** input_path_buf[4];
4875  for(int dir=0; dir<4; ++dir){
4876  input_path_buf[dir] = static_cast<int**>(safe_malloc(numPaths*sizeof(int*)));
4877  for(int i=0; i<numPaths; ++i){
4878  input_path_buf[dir][i] = static_cast<int*>(safe_malloc(path_length[i]*sizeof(int)));
4879  }
4880  createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
4881  }
4882 
4883  int max_length = 6;
4884 
4885  computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);
4886 
4887  for(int dir=0; dir<4; ++dir){
4888  for(int i=0; i<numPaths; ++i) host_free(input_path_buf[dir][i]);
4889  host_free(input_path_buf[dir]);
4890  }
4891 
4892  host_free(path_length);
4893  host_free(loop_coeff);
4894 }
4895 
// Fortran shim for the staggered fermion force.
// NOTE(review): x is received as void* but forwarded as void** -- presumably
// the Fortran side passes an array of field pointers; confirm against callers.
4896 void compute_staggered_force_quda_(void* h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) {
4897  computeStaggeredForceQuda(h_mom, *dt, *delta, gauge, (void**)x, gauge_param, inv_param);
4898 }
4899 
4900 // apply the staggered phases
4902  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("applying staggered phase\n");
4903  if (gaugePrecise) {
4905  } else {
4906  errorQuda("No persistent gauge field");
4907  }
4908 }
4909 
4910 // remove the staggered phases
4912  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("removing staggered phase\n");
4913  if (gaugePrecise) {
4915  } else {
4916  errorQuda("No persistent gauge field");
4917  }
4919 }
4920 
4921 // evaluate the kinetic term
// Fortran shim: evaluate the momentum (kinetic) action and return it via *kin.
4922 void kinetic_quda_(double *kin, void* momentum, QudaGaugeParam* param) {
4923  *kin = momActionQuda(momentum, param);
4924 }
4925 
4926 
4930 #ifdef MULTI_GPU
// Rank mapping used by the BQCD Fortran code: lexicographic rank with the
// t coordinate (coords[3]) slowest-varying and x (coords[0]) fastest.
// fdata points at the 4-element grid-dimension array.
static int bqcd_rank_from_coords(const int *coords, void *fdata)
{
  int *dims = static_cast<int *>(fdata);

  // Horner-style accumulation over the three fastest dimensions
  int rank = coords[3];
  rank = dims[2] * rank + coords[2];
  rank = dims[1] * rank + coords[1];
  rank = dims[0] * rank + coords[0];
  return rank;
}
4941 #endif
4942 
// Fortran shim: establish the 4-d process grid.  grid doubles as both the
// dimension array and the callback payload for bqcd_rank_from_coords.
// No-op in single-GPU builds.
4943 void comm_set_gridsize_(int *grid)
4944 {
4945 #ifdef MULTI_GPU
4946  initCommsGridQuda(4, grid, bqcd_rank_from_coords, static_cast<void *>(grid));
4947 #endif
4948 }
4949 
4954 {
4955  bool pack_ = *pack ? true : false;
4956  setKernelPackT(pack_);
4957 }
4958 
4959 
4960 
// Overwrite the resident gauge field with Gaussian-distributed random links
// seeded by `seed`.  Requires a GPU_GAUGE_TOOLS build and a resident field.
// NOTE(review): extraction dropped several lines (profile markers and the
// refresh of extendedGaugeResident); surviving tokens kept verbatim.  The
// error string "not build" looks like a typo for "not built" -- fixing it
// would change a runtime string, so it is only flagged here.
4961 void gaussGaugeQuda(long seed)
4962 {
4963 #ifdef GPU_GAUGE_TOOLS
4965 
4967  if (!gaugePrecise)
4968  errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
4969 
4970  cudaGaugeField *data = NULL;
4971  data = gaugePrecise;
4972 
4974 
// per-site RNG states sized by the field volume
4976  RNG* randstates = new RNG(data->Volume(), seed, data->X());
4977  randstates->Init();
4978  quda::gaugeGauss(*data, *randstates);
4979  randstates->Release();
4980  delete randstates;
4982 
4984 
4985  if (extendedGaugeResident) {
4988  }
4989 #else
4990  errorQuda("Gauge tools are not build");
4991 #endif
4992 }
4993 
4994 
4995 /*
4996  * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration.
4997  */
// Fortran shim: compute total/spatial/temporal plaquette averages into plaq[3].
4998 void plaq_quda_(double plaq[3]) {
4999  plaqQuda(plaq);
5000 }
5001 
5002 
// Compute the plaquette averages of the resident gauge field:
// plq[0] = total, plq[1]/plq[2] = the two components of quda::plaquette's
// double3 result (spatial/temporal split).  NOTE(review): extraction dropped
// the line constructing `data` (presumably an extended copy of gaugePrecise
// -- confirm against the full source); surviving tokens kept verbatim.
5003 void plaqQuda (double plq[3])
5004 {
5006 
5007  if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
5009 
// cache the extended field for reuse by later calls
5010  extendedGaugeResident = data;
5011 
5013  double3 plaq = quda::plaquette(*data, QUDA_CUDA_FIELD_LOCATION);
5014  plq[0] = plaq.x;
5015  plq[1] = plaq.y;
5016  plq[2] = plaq.z;
5018 
5020  return;
5021 }
5022 
// Apply nSteps of Wuppertal smearing with weight alpha to the host spinor
// h_in, writing the result to h_out.  Uses gaugeSmeared (copied and
// ghost-exchanged) if present, otherwise gaugePrecise.
// NOTE(review): extraction dropped a few lines (profile markers, the gParam
// setup for the smeared copy); surviving tokens kept verbatim.
5023 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
5024  unsigned int nSteps, double alpha)
5025 {
5027 
5028  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5029 
5032 
5033  cudaGaugeField *precise = NULL;
5034 
5035  if (gaugeSmeared != NULL) {
5036  if (getVerbosity() >= QUDA_VERBOSE)
5037  printfQuda("Wuppertal smearing done with gaugeSmeared\n");
5040  precise = new cudaGaugeField(gParam);
5042  precise->exchangeGhost();
5043  } else {
5044  if (getVerbosity() >= QUDA_VERBOSE)
5045  printfQuda("Wuppertal smearing done with gaugePrecise\n");
5046  precise = gaugePrecise;
5047  }
5048 
// wrap the host input and mirror it on the device
5049  ColorSpinorParam cpuParam(h_in, *inv_param, precise->X(), 0, inv_param->input_location);
5050  ColorSpinorField *in_h = ColorSpinorField::Create(cpuParam);
5051 
5052  ColorSpinorParam cudaParam(cpuParam, *inv_param);
5053  cudaColorSpinorField in(*in_h, cudaParam);
5054 
5055  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5056  double cpu = blas::norm2(*in_h);
5057  double gpu = blas::norm2(in);
5058  printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
5059  }
5060 
5061  cudaParam.create = QUDA_NULL_FIELD_CREATE;
5062  cudaColorSpinorField out(in, cudaParam);
5063  int parity = 0;
5064 
// iterate: each step smears `in` into `out`, then feeds `out` back in
5065  for (unsigned int i=0; i<nSteps; i++) {
5066  if(i) in = out;
5067  wuppertalStep(out, in, parity, *precise, alpha);
5068  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5069  double norm = blas::norm2(out);
5070  printfQuda("Step %d, vector norm %e\n", i, norm);
5071  }
5072  }
5073 
// reuse cpuParam to wrap the output host buffer
5074  cpuParam.v = h_out;
5075  cpuParam.location = inv_param->output_location;
5076  ColorSpinorField *out_h = ColorSpinorField::Create(cpuParam);
5077  *out_h = out;
5078 
5079  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
5080  double cpu = blas::norm2(*out_h);
5081  double gpu = blas::norm2(out);
5082  printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
5083  }
5084 
// only delete `precise` when we own the copy made from gaugeSmeared
5085  if (gaugeSmeared != NULL)
5086  delete precise;
5087 
5088  delete out_h;
5089  delete in_h;
5090 
5091  popVerbosity();
5092 
5094 }
5095 
// Apply nSteps of APE smearing with parameter alpha to the resident gauge
// field, storing the result in gaugeSmeared (any previous smeared field is
// discarded).  NOTE(review): extraction dropped the lines creating
// gaugeSmeared / computing the quoted plaquettes; surviving tokens verbatim.
5096 void performAPEnStep(unsigned int nSteps, double alpha)
5097 {
5098  profileAPE.TPSTART(QUDA_PROFILE_TOTAL);
5099 
5100  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5101 
5102  if (gaugeSmeared != NULL) delete gaugeSmeared;
5104 
5106  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5107 
5108  if (getVerbosity() == QUDA_VERBOSE) {
5110  printfQuda("Plaquette after 0 APE steps: %le %le %le\n", plq.x, plq.y, plq.z);
5111  }
5112 
// ping-pong: copy current smeared field to temp, smear temp back into gaugeSmeared
5113  for (unsigned int i=0; i<nSteps; i++) {
5114  cudaGaugeTemp->copy(*gaugeSmeared);
5116  APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
5117  }
5118 
5119  delete cudaGaugeTemp;
5120 
5122 
5123  if (getVerbosity() == QUDA_VERBOSE) {
5125  printfQuda("Plaquette after %d APE steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5126  }
5127 
5129 }
5130 
// Apply nSteps of stout smearing with parameter rho to the resident gauge
// field, storing the result in gaugeSmeared.  NOTE(review): extraction
// dropped the gaugeSmeared creation and plaquette computation lines;
// surviving tokens kept verbatim.
5131 void performSTOUTnStep(unsigned int nSteps, double rho)
5132 {
5134 
5135  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5136 
5137  if (gaugeSmeared != NULL) delete gaugeSmeared;
5139 
5141  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5142 
5143  if (getVerbosity() == QUDA_VERBOSE) {
5145  printfQuda("Plaquette after 0 STOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5146  }
5147 
5148  for (unsigned int i=0; i<nSteps; i++) {
5149  cudaGaugeTemp->copy(*gaugeSmeared);
5151  STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
5152  }
5153 
5154  delete cudaGaugeTemp;
5155 
5157 
5158  if (getVerbosity() == QUDA_VERBOSE) {
5160  printfQuda("Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5161  }
5162 
5164 }
5165 
// Apply nSteps of over-improved stout smearing (parameters rho, epsilon) to
// the resident gauge field, storing the result in gaugeSmeared.
// NOTE(review): extraction dropped the gaugeSmeared creation and plaquette
// computation lines; surviving tokens kept verbatim.
5166  void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
5167 {
5169 
5170  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded");
5171 
5172  if (gaugeSmeared != NULL) delete gaugeSmeared;
5174 
5176  cudaGaugeField *cudaGaugeTemp = new cudaGaugeField(gParam);
5177 
5178  if (getVerbosity() == QUDA_VERBOSE) {
5180  printfQuda("Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5181  }
5182 
5183  for (unsigned int i=0; i<nSteps; i++) {
5184  cudaGaugeTemp->copy(*gaugeSmeared);
5186  OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho, epsilon);
5187  }
5188 
5189  delete cudaGaugeTemp;
5190 
5192 
5193  if (getVerbosity() == QUDA_VERBOSE) {
5195  printfQuda("Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5196  }
5197 
5199 }
5200 
5201 
// Fix the gauge of the host field via overrelaxation; single-rank runs fix
// the field directly, multi-rank runs go through an extended (halo'd) copy.
// Optionally reports H2D/compute/D2H timings via timeinfo[0..2].
// Always returns 0.  NOTE(review): extraction dropped the cpuGauge
// construction and several profile lines; surviving tokens kept verbatim.
5202 int computeGaugeFixingOVRQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5203  const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, \
5204  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5205 {
5206 
5208 
5209  checkGaugeParam(param);
5210 
5212  GaugeFieldParam gParam(gauge, *param);
5214 
5215  //gParam.pad = getFatLinkPadding(param->X);
5221  cudaGaugeField *cudaInGauge = new cudaGaugeField(gParam);
5222 
5224 
5226 
5227 
5229  cudaInGauge->loadCPUField(*cpuGauge);
5230  /* } else { // or use resident fields already present
5231  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5232  cudaInGauge = gaugePrecise;
5233  gaugePrecise = NULL;
5234  } */
5235 
5237 
5238  checkCudaError();
5239 
5240  if (comm_size() == 1) {
5241  // perform the update
5243  gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5244  reunit_interval, stopWtheta);
5246  } else {
// multi-GPU path: fix on an extended field, then copy the interior back
5247  cudaGaugeField *cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);
5248 
5249  // perform the update
5251  gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5252  reunit_interval, stopWtheta);
5254 
5255  //HOW TO COPY BACK TO CPU: cudaInGaugeEx->cpuGauge
5256  copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
5257  }
5258 
5259  checkCudaError();
5260  // copy the gauge field back to the host
5262  cudaInGauge->saveCPUField(*cpuGauge);
5264 
5266 
5267  if (param->make_resident_gauge) {
5268  if (gaugePrecise != NULL) delete gaugePrecise;
5269  gaugePrecise = cudaInGauge;
5270  } else {
5271  delete cudaInGauge;
5272  }
5273 
5274  if(timeinfo){
5275  timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
5276  timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
5277  timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
5278  }
5279 
5280  checkCudaError();
5281  return 0;
5282 }
5283 
5284 
5285 
5286 
// Fix the gauge of the host field via the FFT-accelerated steepest-descent
// method.  Optionally reports H2D/compute/D2H timings via timeinfo[0..2].
// Always returns 0.  NOTE(review): extraction dropped the cpuGauge
// construction and several profile lines; surviving tokens kept verbatim.
5287 int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \
5288  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
5289  const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo)
5290 {
5291 
5293 
5294  checkGaugeParam(param);
5295 
5297 
5298  GaugeFieldParam gParam(gauge, *param);
5300 
5301  //gParam.pad = getFatLinkPadding(param->X);
5307 
5308  cudaGaugeField *cudaInGauge = new cudaGaugeField(gParam);
5309 
5310 
5312 
5314 
5315  //if (!param->use_resident_gauge) { // load fields onto the device
5316  cudaInGauge->loadCPUField(*cpuGauge);
5317  /*} else { // or use resident fields already present
5318  if (!gaugePrecise) errorQuda("No resident gauge field allocated");
5319  cudaInGauge = gaugePrecise;
5320  gaugePrecise = NULL;
5321  } */
5322 
5323 
5325 
5326  // perform the update
5328  checkCudaError();
5329 
5330  gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5331 
5333 
5334  checkCudaError();
5335  // copy the gauge field back to the host
5337  checkCudaError();
5338  cudaInGauge->saveCPUField(*cpuGauge);
5340  checkCudaError();
5341 
5343 
5344  if (param->make_resident_gauge) {
5345  if (gaugePrecise != NULL) delete gaugePrecise;
5346  gaugePrecise = cudaInGauge;
5347  } else {
5348  delete cudaInGauge;
5349  }
5350 
5351  if(timeinfo){
5352  timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
5353  timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
5354  timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
5355  }
5356 
5357  checkCudaError();
5358  return 0;
5359 }
5360 
5369 void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
5370 {
5371  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
5372  contractCuda(x.Even(), y.Even(), ((double2*)ctrn), cType, QUDA_EVEN_PARITY, profileContract);
5373  contractCuda(x.Odd(), y.Odd(), ((double2*)ctrn), cType, QUDA_ODD_PARITY, profileContract);
5374  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
5375  contractCuda(x.Even(), y.Even(), ((float2*) ctrn), cType, QUDA_EVEN_PARITY, profileContract);
5376  contractCuda(x.Odd(), y.Odd(), ((float2*) ctrn), cType, QUDA_ODD_PARITY, profileContract);
5377  } else {
5378  errorQuda("Precision not supported for contractions\n");
5379  }
5380 }
5381 
5382 void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType, const int tC)
5383 {
5384  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
5385  contractCuda(x.Even(), y.Even(), ((double2*)ctrn), cType, tC, QUDA_EVEN_PARITY, profileContract);
5386  contractCuda(x.Odd(), y.Odd(), ((double2*)ctrn), cType, tC, QUDA_ODD_PARITY, profileContract);
5387  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
5388  contractCuda(x.Even(), y.Even(), ((float2*) ctrn), cType, tC, QUDA_EVEN_PARITY, profileContract);
5389  contractCuda(x.Odd(), y.Odd(), ((float2*) ctrn), cType, tC, QUDA_ODD_PARITY, profileContract);
5390  } else {
5391  errorQuda("Precision not supported for contractions\n");
5392  }
5393 }
5394 
// Compute the topological charge of the resident (smeared if available)
// gauge field via the field-strength tensor Fmunu.
// NOTE(review): extraction dropped the lines constructing tensorParam and the
// branch filling extendedGaugeResident; surviving tokens kept verbatim.
5395 double qChargeCuda ()
5396 {
5398 
// prefer the smeared field when one exists
5399  cudaGaugeField *gauge = nullptr;
5400  if (!gaugeSmeared) {
5402  gauge = extendedGaugeResident;
5403  } else {
5404  gauge = gaugeSmeared;
5405  }
5406  // Do we keep the smeared extended field on memory, or the unsmeared one?
5407 
5409  // create the Fmunu field
5410 
5412  tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
5413  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
5414  tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
5415  cudaGaugeField Fmunu(tensorParam);
5416 
5419 
5420  computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION);
5421  double charge = quda::computeQCharge(Fmunu, QUDA_CUDA_FIELD_LOCATION);
5422 
5425 
5426  return charge;
5427 }
void new_quda_invert_param_(QudaInvertParam *param)
QudaCloverFieldOrder order
Definition: clover_field.h:21
static QudaGaugeParam qudaGaugeParam
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
QudaTboundary t_boundary
Definition: gauge_field.h:18
QudaDiracFieldOrder dirac_order
Definition: quda.h:195
QudaMassNormalization mass_normalization
Definition: quda.h:185
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
QudaReconstructType reconstruct_sloppy
Definition: quda.h:46
void comm_finalize(void)
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:28
void Init()
Initialize CURAND RNG states.
Definition: random.cu:146
DiracMatrix * matSmoothSloppy
Definition: multigrid.h:78
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
Definition: llfat_quda.cu:524
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaGhostExchange ghostExchange
Definition: lattice_field.h:60
void freeCloverQuda(void)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda")
Profiler for computeHISQForceQuda.
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
double b_5[QUDA_MAX_DWF_LS]
Definition: quda.h:102
int commDimPartitioned(int dir)
double * TrLog() const
Definition: clover_field.h:87
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
double3 plaquette(const GaugeField &U, QudaFieldLocation location)
Definition: gauge_plaq.cu:138
QudaFieldLocation clover_location
Definition: quda.h:199
QudaSolveType solve_type
Definition: quda.h:182
QudaVerbosity verbosity
enum QudaPrecision_s QudaPrecision
double tol_hq
Definition: test_util.cpp:1648
void freeGaugeQuda(void)
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo...
static TimeProfile profileFatLink("computeKSLinkQuda")
Profiler for computeGaugeForceQuda.
int ga_pad
Definition: quda.h:53
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
int make_resident_mom
Definition: quda.h:74
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *h_force, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
double momActionQuda(void *momentum, QudaGaugeParam *param)
size_t gauge_offset
Definition: quda.h:78
double mu
Definition: quda.h:105
void * V(bool inverse=false)
Definition: clover_field.h:73
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda")
Profiler for createExtendedGaugeField.
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
Definition: check_params.h:40
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * func
QudaLinkType type
Definition: quda.h:35
int fflush(FILE *)
double kappa
Definition: quda.h:97
void end(void)
Definition: blas_quda.cu:70
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
static TimeProfile profileQCharge("qChargeQuda")
Profiler for APEQuda.
void createSmoother()
Create the smoothers.
Definition: multigrid.cpp:213
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
void init()
Definition: blas_quda.cu:64
QudaDslashType dslash_type
Definition: quda.h:93
void setPrecision(QudaPrecision precision)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
Definition: gauge_field.h:113
QudaReconstructType reconstruct_precondition
Definition: quda.h:49
QudaFieldCreate create
Definition: clover_field.h:22
QudaInverterType inv_type
Definition: quda.h:94
Fortran interface functions.
double c_5[QUDA_MAX_DWF_LS]
Definition: quda.h:103
int return_clover_inverse
Definition: quda.h:217
#define host_free(ptr)
Definition: malloc_quda.h:59
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location)
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
Definition: quda.h:426
void destroySmoother()
Free the smoothers.
Definition: multigrid.cpp:254
void performSTOUTnStep(unsigned int nSteps, double rho)
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
cudaGaugeField *& gaugeFatExtended
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
Definition: gauge_stout.cu:300
cudaGaugeField * cudaMom
std::complex< double > Complex
Definition: eig_variables.h:13
void setOutputPrefix(const char *prefix)
Definition: util_quda.cpp:68
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaussGaugeQuda(long seed)
void plaq_quda_(double plaq[3])
static TimeProfile profileMulti("invertMultiShiftQuda")
Profiler for computeFatLinkQuda.
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda")
Profiler for projectSU3Quda.
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
cudaGaugeField * gaugeLongPrecise
static int rank
Definition: comm_mpi.cpp:42
static TimeProfile profileGaugeForce("computeGaugeForceQuda")
Profiler for updateGaugeFieldQuda.
static ColorSpinorField * Create(const ColorSpinorParam &param)
static TimeProfile profileAPE("APEQuda")
Profiler for STOUTQuda.
const int Nstream
#define QUDA_VERSION_MINOR
Definition: quda_constants.h:2
double trlogA[2]
Definition: quda.h:212
void assertAllMemFree()
Definition: malloc.cpp:379
__host__ __device__ void copy(T1 &a, const T2 &b)
QudaPrecision precision
Definition: lattice_field.h:54
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
static int R[4]
QudaDagType dagger
Definition: quda.h:184
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:263
void ax(const double &a, ColorSpinorField &x)
Definition: blas_quda.cu:209
cpuGaugeField * cpuMom
static TimeProfile profilePlaq("plaqQuda")
Profiler for wuppertalQuda.
void free_clover_quda_(void)
QudaGaugeParam gauge_param
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
static cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
double pow(double, double)
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void massRescale(cudaColorSpinorField &b, QudaInvertParam &param)
ColorSpinorField & Component(const int idx) const
std::vector< std::vector< std::pair< ColorSpinorField *, ColorSpinorField * > > > chronoResident(QUDA_MAX_CHRONO)
void loadSloppyGaugeQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
void destroyDeflationQuda(void *df)
QudaGaugeFieldOrder gauge_order
Definition: quda.h:36
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
size_t mom_offset
Definition: quda.h:79
static TimeProfile profileInit2End("initQuda-endQuda", false)
int comm_gpuid(void)
Definition: comm_mpi.cpp:132
int return_clover
Definition: quda.h:216
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
Definition: gauge_force.cu:339
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
#define spinorSiteSize
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
int overwrite_mom
Definition: quda.h:69
static TimeProfile profileMomAction("momActionQuda")
Profiler for endQuda.
static bool InitMagma
int compute_clover_trlog
Definition: quda.h:211
QudaSiteSubset siteSubset
Definition: lattice_field.h:55
char * gitversion
Definition: version.cpp:4
void exit(int) __attribute__((noreturn))
QudaPrecision clover_cuda_prec_sloppy
Definition: quda.h:202
int * greatestPriority
cudaGaugeField * gaugeLongExtended
QudaFieldLocation input_location
Definition: quda.h:90
void solve(Complex *psi, std::vector< ColorSpinorField *> &p, std::vector< ColorSpinorField *> &q, ColorSpinorField &b)
Solve the equation A p_k psi_k = b by minimizing the residual and using Gaussian elimination.
Definition: inv_mre.cpp:64
void destroyGaugeFieldQuda(void *gauge)
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action...
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
cudaGaugeField * gauge
Definition: dirac_quda.h:31
size_t site_size
Definition: quda.h:80
void CloseMagma()
Definition: blas_magma.cu:326
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
cudaCloverField * cloverPrecondition
DeflationParam * deflParam
Definition: deflation.h:187
int n_vec[QUDA_MAX_MG_LEVEL]
Definition: quda.h:407
QudaPrecision prec_sloppy
Definition: test_util.cpp:1616
void init_quda_(int *dev)
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:533
int getGaugePadding(GaugeFieldParam &param)
void gaugeGauss(GaugeField &dataDs, RNG &rngstate)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
Definition: gauge_ape.cu:240
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
void openMagma()
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
Definition: momentum.cu:113
#define b
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Definition: comm_mpi.cpp:61
static int ndim
Definition: layout_hyper.c:53
QudaSolutionType solution_type
Definition: quda.h:181
QudaMemoryType mem_type_ritz
Definition: quda.h:367
int Ncolor() const
Definition: gauge_field.h:202
int strcmp(const char *__s1, const char *__s2)
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:50
QudaPrecision clover_cuda_prec
Definition: quda.h:201
const int * R() const
std::vector< cudaColorSpinorField * > solutionResident
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void * longlink[4]
bool is_composite
for deflation solvers:
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
double Last(QudaProfileType idx)
void initQuda(int dev)
int comm_size(void)
Definition: comm_mpi.cpp:126
QudaInvertParam * invert_param
Definition: quda.h:346
cudaDeviceProp deviceProp
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
double secs
Definition: quda.h:385
double tol
Definition: test_util.cpp:1647
static unsigned int delta
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:424
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff, long long *flops=nullptr)
Compute the long-link contribution to the fermion force.
QudaFieldLocation output_location
Definition: quda.h:91
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
QudaPrecision clover_cuda_prec_precondition
Definition: quda.h:203
QudaFieldLocation location
QudaInvertParam inv_param
Definition: covdev_test.cpp:37
int setNumaAffinityNVML(int deviceid)
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
TimeProfile & profile
Definition: deflation.h:190
VOLATILE spinorFloat kappa
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6], long long *flops=nullptr)
Compute the fat-link contribution to the fermion force.
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:151
void updateInvertParam(QudaInvertParam &param, int offset=-1)
Definition: invert_quda.h:296
double m5
Definition: quda.h:99
size_t Bytes() const
Definition: gauge_field.h:242
cpuGaugeField * cpuFatLink
bool StaggeredPhaseApplied() const
Definition: gauge_field.h:214
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
bool Csw() const
Definition: clover_field.h:107
void flushProfile()
Flush profile contents, setting all counts to zero.
Definition: tune.cpp:462
DiracMatrix * m
Definition: deflation.h:183
static bool initialized
Profiler for initQuda.
void Release()
Release Device memory for CURAND RNG states.
Definition: random.cu:168
cudaCloverField * cloverSloppy
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
QudaVerbosity verbosity
Definition: quda.h:219
#define tmp2
Definition: tmc_core.h:16
void free_gauge_quda_()
int commDim[QUDA_MAX_DIM]
Definition: dirac_quda.h:43
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileSTOUT("STOUTQuda")
Profiler for OvrImpSTOUTQuda.
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
QudaDagType dagger
Definition: dirac_quda.h:30
cpuColorSpinorField * in
cudaGaugeField *& gaugeFatPrecondition
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
static const std::string quda_version
Definition: tune.cpp:96
TimeProfile & profile
Definition: multigrid.h:396
void setPrecision(QudaPrecision precision)
Definition: clover_field.h:23
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:545
void Dagger(QudaDagType dag) const
Definition: dirac_quda.h:153
static Solver * create(SolverParam &param, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
Definition: solver.cpp:13
for(int s=0;s< param.dc.Ls;s++)
double gflops
Definition: quda.h:382
void Print()
Definition: timer.cpp:6
QudaPrecision cuda_prec_precondition
Definition: quda.h:48
void free_sloppy_gauge_quda_()
QudaCloverFieldOrder clover_order
Definition: quda.h:205
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
static __inline__ size_t p
void createDslashEvents()
Definition: dslash_quda.cu:86
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
double Anisotropy() const
Definition: gauge_field.h:205
DiracMatrix * matSmooth
Definition: multigrid.h:75
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
static int lex_rank_from_coords(const int *coords, void *fdata)
void freeSloppyCloverQuda()
QudaGaugeFieldOrder order
Definition: gauge_field.h:15
void exchangeExtendedGhost(cudaColorSpinorField *spinor, int R[], int parity, cudaStream_t *stream_p)
#define warningQuda(...)
Definition: util_quda.h:101
void performAPEnStep(unsigned int nSteps, double alpha)
static TimeProfile profileClover("loadCloverQuda")
Profiler for dslashQuda.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
static unsigned int unsigned int shift
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
Definition: dirac_quda.h:27
#define QUDA_VERSION_SUBMINOR
Definition: quda_constants.h:3
QudaDiracType type
Definition: dirac_quda.h:22
#define QUDA_MAX_CHRONO
void OpenMagma()
Definition: blas_magma.cu:310
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
QudaPrecision cuda_prec_sloppy
Definition: quda.h:45
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
static bool invalidate_clover
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
int Volume() const
#define tmp1
Definition: tmc_core.h:15
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
Definition: dirac_quda.h:29
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
Definition: invert_quda.h:145
void comm_set_gridsize_(int *grid)
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
int(* QudaCommsMap)(const int *coords, void *fdata)
Definition: comm_quda.h:12
void initQudaMemory()
static TimeProfile profileDslash("dslashQuda")
Profiler for invertQuda.
void saveProfile(const std::string label="")
Save profile to disk.
Definition: tune.cpp:472
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
QudaSolutionType RitzMat_lanczos
Definition: quda.h:348
void init_quda_memory_()
static TimeProfile profileGauge("loadGaugeQuda")
Profile for loadCloverQuda.
cudaCloverField * cloverPrecise
enum QudaParity_s QudaParity
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
QudaReconstructType reconstruct
Definition: quda.h:43
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
long unsigned int size_t
QudaPrecision cuda_prec
Definition: quda.h:42
void applyU(GaugeField &force, GaugeField &U)
Definition: momentum.cu:340
static bool comms_initialized
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
Definition: gauge_stout.cu:801
static int * num_failures_h
double mass
Definition: quda.h:96
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
static Eig_Solver * create(QudaEigParam &param, RitzMat &ritz_mat, TimeProfile &profile)
Definition: eig_solver.cpp:12
QudaFieldLocation location
Definition: quda.h:370
Deflation * defl
Definition: deflation.h:189
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static TimeProfile profileGauss("gaussQuda")
Profiler for plaqQuda.
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileProject("projectSU3Quda")
Profiler for staggeredPhaseQuda.
void * memcpy(void *__dst, const void *__src, size_t __n)
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed, long long *flops=NULL)
Unitarize the fermion force.
const void * ptr
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
Definition: malloc_quda.h:54
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:45
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
int dims[QUDA_MAX_DIM]
double shift
Shift term added onto operator (M^dag M + shift)
Definition: dirac_quda.h:1058
static void init_default_comms()
void setMass(double mass)
Definition: dirac_quda.h:140
void pushVerbosity(QudaVerbosity verbosity)
Definition: util_quda.cpp:82
double qChargeCuda()
static int * num_failures_d
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
int compute_clover_inverse
Definition: quda.h:215
QudaPrecision prec_precondition
Definition: test_util.cpp:1617
void loadSloppyCloverQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
#define checkCudaErrorNoSync()
Definition: util_quda.h:113
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: dirac.cpp:73
void plaqQuda(double plq[3])
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
Definition: check_params.h:191
#define STR(x)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
void hisqCompleteForce(GaugeField &momentum, const GaugeField &oprod, const GaugeField &link, long long *flops=nullptr)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
QudaFieldLocation location
Definition: quda.h:27
double clover_rho
Definition: quda.h:209
GaugeCovDev * dirac
Definition: covdev_test.cpp:75
static TimeProfile profileInvert("invertQuda")
Profiler for invertMultiShiftQuda.
cpuColorSpinorField * out
double gaugeGiB
Definition: quda.h:60
static TimeProfile profilePhase("staggeredPhaseQuda")
Profiler for contractions.
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda")
Profiler for toal time spend between init and end.
cudaGaugeField * gaugePrecondition
bool twisted
Clover coefficient.
Definition: clover_field.h:17
static TimeProfile profileCovDev("covDevQuda")
Profiler for contractions.
GaugeFieldParam gParam
void * fatlink[4]
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
int r[QUDA_MAX_DIM]
static TimeProfile profileInit("initQuda")
Profile for loadGaugeQuda / saveGaugeQuda.
cudaCloverField * clover
Definition: dirac_quda.h:34
static bool redundant_comms
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
QudaLinkType link_type
Definition: gauge_field.h:17
std::vector< ColorSpinorField * > B
Definition: multigrid.h:391
void set_kernel_pack_t_(int *pack)
fTemporary function exposed for TIFR benchmarking
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
void printPeakMemUsage()
Definition: malloc.cpp:371
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void applyStaggeredPhase()
void loadTuneCache()
Definition: tune.cpp:302
void * newDeflationQuda(QudaEigParam *eig_param)
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void printQudaMultigridParam(QudaMultigridParam *param)
Definition: check_params.h:504
void freeSloppyGaugeQuda(void)
double computeQCharge(GaugeField &Fmunu, QudaFieldLocation location)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
if(err !=cudaSuccess)
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
int use_resident_gauge
Definition: quda.h:71
#define printfQuda(...)
Definition: util_quda.h:84
void new_quda_gauge_param_(QudaGaugeParam *param)
void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity, TimeProfile &profile)
Definition: contract.cu:202
cudaGaugeField * fatGauge
Definition: dirac_quda.h:32
QudaTwistFlavorType twist_flavor
Definition: quda.h:108
int atoi(const char *)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
int VolumeCB() const
int return_result_gauge
Definition: quda.h:75
cudaGaugeField * gaugeSmeared
static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda")
double residue[QUDA_MAX_MULTI_SHIFT]
Definition: quda.h:171
unsigned long long flops
Definition: blas_quda.cu:42
cudaGaugeField * cudaGauge
static TimeProfile profileExtendedGauge("createExtendedGaugeField")
Profiler for computeCloverForceQuda.
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
Definition: momentum.cu:224
cudaStream_t * streams
int use_resident_mom
Definition: quda.h:72
QudaReconstructType reconstruct
Definition: gauge_field.h:14
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
void closeMagma()
QudaFieldCreate create
Definition: gauge_field.h:25
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void printLaunchTimer()
Definition: tune.cpp:797
cudaGaugeField *& gaugeFatSloppy
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:427
enum QudaContractType_s QudaContractType
static TimeProfile profileEnd("endQuda")
Profiler for GaugeFixing.
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
const int * X() const
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
Definition: invert_quda.h:50
int num_offset
Definition: quda.h:146
void flushChronoQuda(int i)
Flush the chronological history for the given index.
cudaGaugeField * longGauge
Definition: dirac_quda.h:33
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void popVerbosity()
Definition: util_quda.cpp:93
enum QudaVerbosity_s QudaVerbosity
double cloverGiB
Definition: quda.h:226
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void createCloverQuda(QudaInvertParam *invertParam)
int return_result_mom
Definition: quda.h:76
int test_type
Definition: test_util.cpp:1634
void end_quda_()
int compute_clover
Definition: quda.h:214
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field&#39;s one and (for HISQ and ASQTAD) thr...
double epsilon
Definition: quda.h:106
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
cpuGaugeField * cpuGauge
#define checkCudaError()
Definition: util_quda.h:129
QudaFieldGeometry geometry
Definition: gauge_field.h:27
static void PrintGlobal()
Definition: timer.cpp:55
void setOutputFile(FILE *outfile)
Definition: util_quda.cpp:74
cudaGaugeField * gaugePrecise
#define mapped_malloc(size)
Definition: malloc_quda.h:56
int use_resident_solution
Definition: quda.h:316
static Dirac * create(const DiracParam &param)
Definition: dirac.cpp:142
cudaGaugeField *& gaugeFatPrecise
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT]
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the mometum field.
void updateR()
update the radius for halos.
static TimeProfile profileWuppertal("wuppertalQuda")
Profiler for gaussQuda.
cudaGaugeField * cudaForce
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
static TimeProfile profileContract("contractQuda")
Profiler for contractions.
int make_resident_gauge
Definition: quda.h:73
void copy(const GaugeField &src)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:204
void saveTuneCache()
Definition: tune.cpp:388
QudaDslashType dslash_type_precondition
Definition: quda.h:259
QudaPrecision clover_cpu_prec
Definition: quda.h:200
QudaPrecision cuda_prec_ritz
Definition: quda.h:364
QudaParity parity
Definition: covdev_test.cpp:53
static int opp(int dir)
static TimeProfile profileHISQForce("computeHISQForceQuda")
Profiler for plaqQuda.
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reorderd on the CPU or GPU. This can set at QUDA initialization using the environ...
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void destroyDslashEvents()
Definition: dslash_quda.cu:118
char * getenv(const char *)
ColorSpinorField * RV
Definition: deflation.h:185
void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
#define QUDA_VERSION_MAJOR
Definition: quda_constants.h:1
void setVerbosity(const QudaVerbosity verbosity)
Definition: util_quda.cpp:24
QudaMatPCType matpc_type
Definition: quda.h:183
cudaGaugeField * momResident
ColorSpinorField * tmp1
Definition: dirac_quda.h:40
DiracMatrix * matResidual
Definition: multigrid.h:72
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
cpuGaugeField * cpuForce
static TimeProfile profileCloverForce("computeCloverForceQuda")
Profiler for computeStaggeredForceQuda.
double kappa5
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types)
void * newMultigridQuda(QudaMultigridParam *mg_param)
unsigned long long bytes
Definition: blas_quda.cu:43
QudaPrecision cpu_prec
Definition: quda.h:40
cudaGaugeField * gaugeSloppy
int comm_dim_partitioned(int dim)
void initQudaDevice(int dev)
void endQuda(void)
int overlap
Definition: quda.h:67
QudaGaugeParam newQudaGaugeParam(void)
size_t GBytes() const
const int * X() const
QudaInvertParam * invert_param
Definition: quda.h:395
void checkClover(QudaInvertParam *param)
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
double clover_coeff
Definition: quda.h:208
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()