47 #ifdef GPU_GAUGE_FORCE 52 #define MAX(a,b) ((a)>(b)? (a):(b)) 53 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 55 #define spinorSiteSize 24 // real numbers per spinor 57 #define MAX_GPU_NUM_PER_NODE 16 80 #include <cuda_profiler_api.h> 84 static int R[4] = {0, 0, 0, 0};
101 printfQuda(
"\nMAGMA library was already initialized..\n");
112 printfQuda(
"\nMAGMA library was not initialized..\n");
148 #define QUDA_MAX_CHRONO 12 256 static std::vector<int> target_list;
257 static bool enable =
false;
258 static bool init =
false;
260 char *profile_target_env = getenv(
"QUDA_ENABLE_TARGET_PROFILE");
262 if ( profile_target_env ) {
263 std::stringstream target_stream(profile_target_env);
266 while(target_stream >> target) {
267 target_list.push_back(target);
268 if (target_stream.peek() ==
',') target_stream.ignore();
271 if (target_list.size() > 0) {
272 std::sort(target_list.begin(), target_list.end());
273 target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
274 warningQuda(
"Targeted profiling enabled for %lu functions\n", target_list.size());
280 char* donotprofile_env = getenv(
"QUDA_DO_NOT_PROFILE");
281 if (donotprofile_env && (!(strcmp(donotprofile_env,
"0") == 0))) {
288 static int target_count = 0;
289 static unsigned int i = 0;
295 if (i < target_list.size() && target_count++ == target_list[i]) {
343 int rank = coords[0];
344 for (
int i = 1; i < md->ndim; i++) {
345 rank = md->dims[i] * rank + coords[i];
354 static int qmp_rank_from_coords(
const int *coords,
void *fdata)
356 return QMP_get_node_number_from(coords);
363 #if defined(QMP_COMMS) || defined(MPI_COMMS) 364 MPI_Comm MPI_COMM_HANDLE;
365 static int user_set_comm_handle = 0;
370 #if defined(QMP_COMMS) || defined(MPI_COMMS) 371 MPI_COMM_HANDLE = *((MPI_Comm *)mycomm);
372 user_set_comm_handle = 1;
377 static void initQMPComms(
void)
381 if (!user_set_comm_handle) {
383 QMP_get_mpi_comm(QMP_comm_get_default(), &mycomm);
387 #elif defined(MPI_COMMS) 388 static void initMPIComms(
void)
391 if (!user_set_comm_handle) {
392 static MPI_Comm mycomm;
393 MPI_Comm_dup(MPI_COMM_WORLD, &mycomm);
407 #elif defined(MPI_COMMS) 412 errorQuda(
"Number of communication grid dimensions must be 4");
419 if (QMP_logical_topology_is_declared()) {
420 if (QMP_get_logical_number_of_dimensions() != 4) {
421 errorQuda(
"QMP logical topology must have 4 dimensions");
423 for (
int i=0; i<nDim; i++) {
424 int qdim = QMP_get_logical_dimensions()[i];
425 if(qdim != dims[i]) {
426 errorQuda(
"QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
430 func = qmp_rank_from_coords;
432 warningQuda(
"QMP logical topology is undeclared; using default lexicographical ordering");
435 map_data.
ndim = nDim;
436 for (
int i=0; i<nDim; i++) {
437 map_data.
dims[i] = dims[i];
439 fdata = (
void *) &map_data;
454 #if defined(QMP_COMMS) 455 if (QMP_logical_topology_is_declared()) {
456 int ndim = QMP_get_logical_number_of_dimensions();
457 const int *
dims = QMP_get_logical_dimensions();
460 errorQuda(
"initQuda() called without prior call to initCommsGridQuda()," 461 " and QMP logical topology has not been declared");
463 #elif defined(MPI_COMMS) 464 errorQuda(
"When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
466 const int dims[4] = {1, 1, 1, 1};
473 #define STR(x) STR_(x) 502 cudaDriverGetVersion(&driver_version);
503 printfQuda(
"CUDA Driver version = %d\n", driver_version);
506 cudaRuntimeGetVersion(&runtime_version);
507 printfQuda(
"CUDA Runtime version = %d\n", runtime_version);
510 nvmlReturn_t result = nvmlInit();
511 if (NVML_SUCCESS != result)
errorQuda(
"NVML Init failed with error %d", result);
513 char graphics_version[
length];
514 result = nvmlSystemGetDriverVersion(graphics_version, length);
515 if (NVML_SUCCESS != result)
errorQuda(
"nvmlSystemGetDriverVersion failed with error %d", result);
516 printfQuda(
"Graphic driver version = %s\n", graphics_version);
517 result = nvmlShutdown();
518 if (NVML_SUCCESS != result)
errorQuda(
"NVML Shutdown failed with error %d", result);
521 #if defined(MULTI_GPU) && (CUDA_VERSION == 4000) 524 char* cni_str = getenv(
"CUDA_NIC_INTEROP");
525 if(cni_str ==
nullptr){
526 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set");
528 int cni_int = atoi(cni_str);
530 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set to 1");
535 cudaGetDeviceCount(&deviceCount);
536 if (deviceCount == 0) {
540 for(
int i=0; i<deviceCount; i++) {
551 errorQuda(
"initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
556 if (dev < 0 || dev >= 16)
errorQuda(
"Invalid device number %d", dev);
562 errorQuda(
"Device %d does not support CUDA", dev);
573 const int my_major = __COMPUTE_CAPABILITY__ / 100;
574 const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
577 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
582 char *allow_jit_env = getenv(
"QUDA_ALLOW_JIT");
583 if (allow_jit_env && strcmp(allow_jit_env,
"1") == 0) {
584 if (
getVerbosity() >
QUDA_SILENT)
warningQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
586 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
591 warningQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
603 #if ((CUDA_VERSION >= 6000) && defined NUMA_NVML) 604 char *enable_numa_env = getenv(
"QUDA_ENABLE_NUMA");
605 if (enable_numa_env && strcmp(enable_numa_env,
"0") == 0) {
615 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
620 char *reorder_str = getenv(
"QUDA_REORDER_LOCATION");
622 if (!reorder_str || (strcmp(reorder_str,
"CPU") && strcmp(reorder_str,
"cpu")) ) {
623 warningQuda(
"Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
626 warningQuda(
"Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
647 int greatestPriority;
649 cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
650 for (
int i=0; i<
Nstream-1; i++) {
651 cudaStreamCreateWithPriority(&
streams[i], cudaStreamDefault, greatestPriority);
653 cudaStreamCreateWithPriority(&
streams[Nstream-1], cudaStreamDefault, leastPriority);
697 for (
int dir=0; dir<4; ++dir) y[dir] = in.
X()[dir] + 2*R[dir];
703 gParamEx.order = in.
Order();
707 gParamEx.tadpole = in.
Tadpole();
709 for (
int d=0; d<4; d++) gParamEx.r[d] = R[d];
736 checkGaugeParam(param);
754 static size_t checksum = SIZE_MAX;
755 size_t in_checksum = in->
checksum(
true);
756 if (in_checksum == checksum) {
764 checksum = in_checksum;
769 switch (param->
type) {
771 if (gaugeRefinement != gaugeSloppy && gaugeRefinement)
delete gaugeRefinement;
772 if (gaugeSloppy != gaugePrecondition && gaugePrecondition)
delete gaugePrecondition;
773 if (gaugePrecise != gaugeSloppy && gaugeSloppy)
delete gaugeSloppy;
777 if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement)
delete gaugeFatRefinement;
778 if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition)
delete gaugeFatPrecondition;
779 if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy)
delete gaugeFatSloppy;
783 if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement)
delete gaugeLongRefinement;
785 if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy)
delete gaugeLongSloppy;
808 if(gaugePrecise ==
nullptr)
errorQuda(
"No resident gauge field");
810 precise->
copy(*gaugePrecise);
813 gaugePrecise =
nullptr;
845 sloppy->
copy(*precise);
857 precondition->
copy(*sloppy);
859 precondition = sloppy;
869 refinement->
copy(*sloppy);
884 switch (param->
type) {
886 gaugePrecise = precise;
887 gaugeSloppy = sloppy;
888 gaugePrecondition = precondition;
889 gaugeRefinement = refinement;
891 if(param->
overlap) gaugeExtended = extended;
894 gaugeFatPrecise = precise;
895 gaugeFatSloppy = sloppy;
896 gaugeFatPrecondition = precondition;
897 gaugeFatRefinement = refinement;
900 if(gaugeFatExtended)
errorQuda(
"Extended gauge fat field already allocated");
901 gaugeFatExtended = extended;
905 gaugeLongPrecise = precise;
906 gaugeLongSloppy = sloppy;
907 gaugeLongPrecondition = precondition;
908 gaugeLongRefinement = refinement;
911 if(gaugeLongExtended)
errorQuda(
"Extended gauge long field already allocated");
912 gaugeLongExtended = extended;
923 if (extendedGaugeResident) {
925 const int *R_ = extendedGaugeResident->
R();
926 const int R[] = { R_[0], R_[1], R_[2], R_[3] };
941 errorQuda(
"Non-cpu output location not yet supported");
944 checkGaugeParam(param);
950 switch (param->
type) {
990 checkCloverParam(inv_param);
991 bool device_calc =
false;
1000 if (inv_param->
clover_coeff == 0.0)
errorQuda(
"called with neither clover term nor inverse and clover coefficient not set");
1001 if (gaugePrecise->
Anisotropy() != 1.0)
errorQuda(
"cannot compute anisotropic clover field");
1005 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field must be loaded before clover");
1024 if (!h_clover && !pc_solve && !pc_solution) {
1029 if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
1034 #ifdef DYNAMIC_CLOVER 1035 bool dynamic_clover =
true;
1037 bool dynamic_clover =
false;
1041 clover_param.
nDim = 4;
1043 clover_param.
twisted = twisted;
1044 clover_param.
mu2 = twisted ? 4.*inv_param->
kappa*inv_param->
kappa*inv_param->
mu*inv_param->
mu : 0.0;
1046 for (
int i=0; i<4; i++) clover_param.
x[i] = gaugePrecise->
X()[i];
1049 clover_param.
norm =
nullptr;
1050 clover_param.
invNorm =
nullptr;
1052 clover_param.
direct = h_clover || device_calc ? true :
false;
1053 clover_param.
inverse = (h_clovinv || pc_solve) && !dynamic_clover ?
true :
false;
1058 bool clover_update =
false;
1059 double csw_old = cloverPrecise ? cloverPrecise->
Csw() : 0.0;
1063 if (clover_update) {
1076 inParam.
direct = h_clover ? true :
false;
1077 inParam.
inverse = h_clovinv ? true :
false;
1078 inParam.
clover = h_clover;
1090 cloverPrecise->
copy(*in, inverse);
1101 if (!dynamic_clover) {
1114 clover_param.
direct =
true;
1115 clover_param.
inverse = dynamic_clover ? false :
true;
1125 if (!h_clover && !h_clovinv)
errorQuda(
"Requested clover field return but no clover host pointers set");
1135 if (!dynamic_clover) {
1138 hack->
copy(*cloverPrecise);
1141 hackOfTheHack->copy(*cloverPrecise,
false);
1149 hack->
copy(*hackOfTheHack);
1150 delete hackOfTheHack;
1157 qudaMemcpy((
char*)(in->
V(
false)), (
char*)(hack->
V(
false)), in->
Bytes(), cudaMemcpyDeviceToHost);
1160 qudaMemcpy((
char*)(in->
V(
true)), (
char*)(hack->
V(
true)), in->
Bytes(), cudaMemcpyDeviceToHost);
1184 if (cloverPrecise) {
1190 if (cloverPrecise->
V(
false) != cloverPrecise->
V(
true)) {
1191 clover_param.
direct =
true;
1194 clover_param.
direct =
false;
1200 cloverSloppy->
copy(*cloverPrecise, clover_param.
inverse);
1211 cloverPrecondition->
copy(*cloverSloppy, clover_param.
inverse);
1222 cloverRefinement->
copy(*cloverSloppy, clover_param.
inverse);
1234 if (gaugeSloppy != gaugeRefinement && gaugeRefinement)
delete gaugeRefinement;
1235 if (gaugeSloppy != gaugePrecondition && gaugePrecondition)
delete gaugePrecondition;
1236 if (gaugePrecise != gaugeSloppy && gaugeSloppy)
delete gaugeSloppy;
1238 gaugeRefinement =
nullptr;
1239 gaugePrecondition =
nullptr;
1240 gaugeSloppy =
nullptr;
1242 if (gaugeLongSloppy != gaugeLongRefinement && gaugeLongRefinement)
delete gaugeLongRefinement;
1243 if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition)
delete gaugeLongPrecondition;
1244 if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy)
delete gaugeLongSloppy;
1246 gaugeLongRefinement =
nullptr;
1247 gaugeLongPrecondition =
nullptr;
1248 gaugeLongSloppy =
nullptr;
1250 if (gaugeFatSloppy != gaugeFatRefinement && gaugeFatRefinement)
delete gaugeFatRefinement;
1251 if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition)
delete gaugeFatPrecondition;
1252 if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy)
delete gaugeFatSloppy;
1254 gaugeFatRefinement =
nullptr;
1255 gaugeFatPrecondition =
nullptr;
1256 gaugeFatSloppy =
nullptr;
1268 gaugePrecise =
nullptr;
1269 gaugeExtended =
nullptr;
1274 gaugeLongPrecise =
nullptr;
1275 gaugeLongExtended =
nullptr;
1279 gaugeFatPrecise =
nullptr;
1280 gaugeFatExtended =
nullptr;
1284 gaugeSmeared =
nullptr;
1286 if (extendedGaugeResident) {
1288 extendedGaugeResident =
nullptr;
1302 if (gaugeSloppy)
errorQuda(
"gaugeSloppy already exists");
1306 gaugeSloppy->
copy(*gaugePrecise);
1315 if (gaugePrecondition)
errorQuda(
"gaugePrecondition already exists");
1319 gaugePrecondition->
copy(*gaugeSloppy);
1328 if (gaugeRefinement)
errorQuda(
"gaugeRefinement already exists");
1332 gaugeRefinement->
copy(*gaugeSloppy);
1339 if (gaugeFatPrecise) {
1344 if (gaugeFatSloppy)
errorQuda(
"gaugeFatSloppy already exists");
1349 gaugeFatSloppy->
copy(*gaugeFatPrecise);
1357 if (gaugeFatPrecondition)
errorQuda(
"gaugeFatPrecondition already exists\n");
1362 gaugeFatPrecondition->
copy(*gaugeFatSloppy);
1370 if (gaugeFatRefinement)
errorQuda(
"gaugeFatRefinement already exists\n");
1375 gaugeFatRefinement->
copy(*gaugeFatSloppy);
1382 if (gaugeLongPrecise) {
1388 if (gaugeLongSloppy)
errorQuda(
"gaugeLongSloppy already exists");
1393 gaugeLongSloppy->
copy(*gaugeLongPrecise);
1402 if (gaugeLongPrecondition)
errorQuda(
"gaugeLongPrecondition already exists\n");
1407 gaugeLongPrecondition->
copy(*gaugeLongSloppy);
1416 if (gaugeLongRefinement)
errorQuda(
"gaugeLongRefinement already exists\n");
1421 gaugeLongRefinement->
copy(*gaugeLongSloppy);
1431 if (cloverRefinement != cloverSloppy && cloverRefinement)
delete cloverRefinement;
1432 if (cloverPrecondition != cloverSloppy && cloverPrecondition)
delete cloverPrecondition;
1433 if (cloverSloppy != cloverPrecise && cloverSloppy)
delete cloverSloppy;
1435 cloverRefinement =
nullptr;
1436 cloverPrecondition =
nullptr;
1437 cloverSloppy =
nullptr;
1445 cloverPrecise =
nullptr;
1455 for (
auto v : basis) {
1472 for (
auto v : solutionResident)
if (v)
delete v;
1473 solutionResident.clear();
1551 char *device_reset_env = getenv(
"QUDA_DEVICE_RESET");
1552 if (device_reset_env && strcmp(device_reset_env,
"1") == 0) {
1578 diracParam.
Ls = inv_param->
Ls;
1582 diracParam.
Ls = inv_param->
Ls;
1588 diracParam.
Ls = inv_param->
Ls;
1589 if (
sizeof(
Complex) !=
sizeof(
double _Complex)) {
1590 errorQuda(
"Irreconcilable difference between interface and internal complex number conventions");
1592 memcpy(diracParam.
b_5, inv_param->
b_5,
sizeof(
Complex) * inv_param->
Ls);
1593 memcpy(diracParam.
c_5, inv_param->
c_5,
sizeof(
Complex) * inv_param->
Ls);
1596 for (
int i = 0; i < diracParam.
Ls; i++) {
1597 printfQuda(
"fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.
b_5[i].real(),
1598 diracParam.
b_5[i].imag(), i, diracParam.
c_5[i].real(), diracParam.
c_5[i].imag());
1650 diracParam.
m5 = inv_param->
m5;
1651 diracParam.
mu = inv_param->
mu;
1653 for (
int i=0; i<4; i++) diracParam.
commDim[i] = 1;
1670 for (
int i=0; i<4; i++) {
1688 for (
int i=0; i<4; i++) {
1713 for (
int i=0; i<4; i++) {
1714 diracParam.
commDim[i] = comms ? 1 : 0;
1771 double kappa5 = (0.5/(5.0 + param.
m5));
1777 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
1780 printfQuda(
"Mass rescale: norm of source in = %g\n", nin);
1801 unscaled_shifts[i] = param.
offset[i];
1844 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
1847 printfQuda(
"Mass rescale: norm of source out = %g\n", nin);
1864 errorQuda(
"Clover field not allocated");
1918 dirac->Dslash(out, tmp1, parity);
1920 dirac->Dslash(out, in, parity);
1931 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1986 switch (test_type) {
1988 dirac.
Dslash4(out, in, parity);
1991 dirac.
Dslash5(out, in, parity);
2006 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2055 switch (test_type) {
2057 dirac.
Dslash4(out, in, parity);
2060 dirac.
Dslash5(out, in, parity);
2078 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2098 errorQuda(
"Clover field not allocated");
2148 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2168 errorQuda(
"Clover field not allocated");
2196 dirac->
MdagM(out, in);
2221 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2237 return (gaugeFatPrecise !=
nullptr) and param->
cuda_prec == gaugeFatPrecise->
Precision();
2260 if (cloverPrecise ==
nullptr)
errorQuda(
"Precise clover field doesn't exist");
2261 if (cloverSloppy ==
nullptr)
errorQuda(
"Sloppy clover field doesn't exist");
2262 if (cloverPrecondition ==
nullptr)
errorQuda(
"Precondition clover field doesn't exist");
2263 if (cloverRefinement ==
nullptr)
errorQuda(
"Refinement clover field doesn't exist");
2270 if (gaugePrecise ==
nullptr)
errorQuda(
"Precise gauge field doesn't exist");
2287 if (gaugeSloppy ==
nullptr)
errorQuda(
"Sloppy gauge field doesn't exist");
2288 if (gaugePrecondition ==
nullptr)
errorQuda(
"Precondition gauge field doesn't exist");
2289 if (gaugeRefinement ==
nullptr)
errorQuda(
"Refinement gauge field doesn't exist");
2291 if (gaugeExtended ==
nullptr)
errorQuda(
"Extended gauge field doesn't exist");
2295 if (gaugeFatPrecise ==
nullptr)
errorQuda(
"Precise gauge fat field doesn't exist");
2296 if (gaugeLongPrecise ==
nullptr)
errorQuda(
"Precise gauge long field doesn't exist");
2318 if (gaugeFatSloppy ==
nullptr)
errorQuda(
"Sloppy gauge fat field doesn't exist");
2319 if (gaugeFatPrecondition ==
nullptr)
errorQuda(
"Precondition gauge fat field doesn't exist");
2320 if (gaugeFatRefinement ==
nullptr)
errorQuda(
"Refinement gauge fat field doesn't exist");
2322 if (gaugeFatExtended ==
nullptr)
errorQuda(
"Extended gauge fat field doesn't exist");
2325 if (gaugeLongSloppy ==
nullptr)
errorQuda(
"Sloppy gauge long field doesn't exist");
2326 if (gaugeLongPrecondition ==
nullptr)
errorQuda(
"Precondition gauge long field doesn't exist");
2327 if (gaugeLongRefinement ==
nullptr)
errorQuda(
"Refinement gauge long field doesn't exist");
2329 if (gaugeLongExtended ==
nullptr)
errorQuda(
"Extended gauge long field doesn't exist");
2344 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field not allocated");
2345 if (cloverPrecise ==
nullptr)
errorQuda(
"Clover field not allocated");
2350 errorQuda(
"Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2384 if (!inverse) dirac.
Clover(out, in, parity);
2395 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2428 checkInvertParam(inv_param);
2429 checkEigParam(eig_param);
2435 inv_param->
secs = 0;
2437 inv_param->
iter = 0;
2442 Dirac *dSloppy =
nullptr;
2443 Dirac *dPre =
nullptr;
2446 createDirac(d, dSloppy, dPre, *inv_param, pc_solve);
2451 const int *
X = cudaGauge->
X();
2455 std::vector<ColorSpinorField *> host_evecs_;
2456 for (
int i = 0; i < eig_param->
nConv; i++) {
2457 cpuParam.
v = host_evecs[i];
2466 std::vector<Complex> evals(eig_param->
nConv, 0.0);
2467 std::vector<ColorSpinorField *> kSpace;
2475 errorQuda(
"Polynomial acceleration with non-symmetric matrices not supported");
2487 (*eig_solve)(kSpace, evals);
2496 (*eig_solve)(kSpace, evals);
2505 (*eig_solve)(kSpace, evals);
2514 (*eig_solve)(kSpace, evals);
2518 errorQuda(
"Invalid use_norm_op and dagger combination");
2522 for (
int i = 0; i < eig_param->
nConv; i++) { host_evals[i] = real(evals[i]) + imag(evals[i]) * _Complex_I; }
2527 for (
int i = 0; i < eig_param->
nConv; i++) *host_evecs_[i] = *kSpace[i];
2532 for (
int i = 0; i < eig_param->
nConv; i++)
delete host_evecs_[i];
2536 for (
int i = 0; i < eig_param->
nConv; i++)
delete kSpace[i];
2548 : profile(profile) {
2552 checkMultigridParam(&mg_param);
2558 for (
int i=0; i<mg_param.
n_level; i++) {
2563 errorQuda(
"Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2610 B.resize(mg_param.
n_vec[0]);
2638 return static_cast<void*
>(
mg);
2655 checkMultigridParam(mg_param);
2672 if (
mg->m)
delete mg->m;
2673 if (
mg->mSmooth)
delete mg->mSmooth;
2674 if (
mg->mSmoothSloppy)
delete mg->mSmoothSloppy;
2676 if (
mg->d)
delete mg->d;
2677 if (
mg->dSmooth)
delete mg->dSmooth;
2678 if (
mg->dSmoothSloppy &&
mg->dSmoothSloppy !=
mg->dSmooth)
delete mg->dSmoothSloppy;
2700 mg->mSmoothSloppy =
new DiracM(*(
mg->dSmoothSloppy));
2702 mg->mgParam->matResidual =
mg->m;
2703 mg->mgParam->matSmooth =
mg->mSmooth;
2704 mg->mgParam->matSmoothSloppy =
mg->mSmoothSloppy;
2706 mg->mgParam->updateInvertParam(*param);
2707 if(
mg->mgParam->mg_global.invert_param != param)
2708 mg->mgParam->mg_global.invert_param =
param;
2710 bool refresh =
true;
2733 checkMultigridParam(mg_param);
2744 :
d(nullptr),
m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2772 ritzParam.is_composite =
true;
2773 ritzParam.is_component =
false;
2789 for(
int d = 0;
d < ritzParam.nDim;
d++) ritzVolume *= ritzParam.x[
d];
2793 size_t byte_estimate = (size_t)ritzParam.composite_dim*(
size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
2794 printfQuda(
"allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
2820 return static_cast<void*
>(
defl);
2841 checkInvertParam(param, hp_x, hp_b);
2866 Dirac *dSloppy =
nullptr;
2867 Dirac *dPre =
nullptr;
2873 Dirac &diracSloppy = *dSloppy;
2874 Dirac &diracPre = *dPre;
2883 const int *
X = cudaGauge->
X();
2899 bool invalidate =
false;
2901 for (
auto v : solutionResident)
2902 if (b->
Precision() != v->Precision() || b->
SiteSubset() != v->SiteSubset()) { invalidate =
true;
break; }
2905 for (
auto v : solutionResident)
if (v)
delete v;
2906 solutionResident.clear();
2909 if (!solutionResident.size()) {
2913 x = solutionResident[0];
2923 errorQuda(
"Initial guess not supported for two-pass solver");
2935 if (nb==0.0)
errorQuda(
"Source has zero norm");
2939 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2943 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2953 massRescale(*static_cast<cudaColorSpinorField*>(b), *param);
2961 printfQuda(
"Prepared solution = %g\n", nout);
2966 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
2990 if (pc_solution && !pc_solve) {
2991 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2994 if (!mat_solution && !pc_solution && pc_solve) {
2995 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2998 if (!mat_solution && norm_error_solve) {
2999 errorQuda(
"Normal-error solve requires Mat solution");
3003 errorQuda(
"Multigrid preconditioning only supported for direct solves");
3007 errorQuda(
"Chronological forcasting only presently supported for M^dagger M solver");
3012 if (mat_solution && !direct_solve && !norm_error_solve) {
3014 dirac.
Mdag(*in, tmp);
3015 }
else if (!mat_solution && direct_solve) {
3016 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3019 (*solve)(*
out, *
in);
3026 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3037 std::vector<ColorSpinorField*> Ap;
3038 for (
unsigned int k=0; k < basis.size(); k++) {
3043 for (
unsigned int j=0; j<basis.size(); j++)
m(*Ap[j], *basis[j], *tmp, *tmp2);
3045 for (
unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3047 errorQuda(
"Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3051 bool orthogonal =
true;
3052 bool apply_mat =
false;
3053 bool hermitian =
false;
3057 mre(*out, *tmp, basis, Ap);
3060 if (ap)
delete (ap);
3063 if (tmp2 != out)
delete tmp2;
3069 (*solve)(*
out, *
in);
3072 }
else if (!norm_error_solve) {
3073 DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3083 std::vector<ColorSpinorField*> Ap;
3086 for (
unsigned int k=0; k < basis.size(); k++) {
3091 for (
unsigned int j=0; j<basis.size(); j++)
m(*Ap[j], *basis[j], *tmp, *tmp2);
3093 for (
unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *tmp, *tmp2);
3095 errorQuda(
"Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3099 bool orthogonal =
true;
3100 bool apply_mat =
false;
3101 bool hermitian =
true;
3105 mre(*out, *tmp, basis, Ap);
3111 if (tmp2 != out)
delete tmp2;
3117 (*solve)(*
out, *
in);
3121 DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3126 dirac.
Mdag(*out, tmp);
3149 errorQuda(
"Requested chrono_max_dim %i is smaller than already existing chroology %i",param->
chrono_max_dim,(
int)basis.size());
3162 for (
unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
3185 param->
action[0] = action.real();
3186 param->
action[1] = action.imag();
3192 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3203 for (
auto v: solutionResident)
if (v)
delete v;
3204 solutionResident.clear();
3244 checkInvertParam(param, _hp_x[0], _hp_b[0]);
3269 Dirac *dSloppy =
nullptr;
3270 Dirac *dPre =
nullptr;
3276 Dirac &diracSloppy = *dSloppy;
3277 Dirac &diracPre = *dPre;
3295 const int *
X = cudaGauge->
X();
3300 hp_x =
new void* [ param->
num_src ];
3303 hp_b =
new void* [param->
num_src];
3305 for(
int i=0;i < param->
num_src;i++){
3312 std::vector<ColorSpinorField*> h_b;
3314 for(
int i=0; i < param->
num_src; i++) {
3315 cpuParam.
v = hp_b[i];
3321 std::vector<ColorSpinorField*> h_x;
3324 for(
int i=0; i < param->
num_src; i++) {
3325 cpuParam.
v = hp_x[i];
3345 for(
int i=0; i < param->
num_src; i++) {
3355 errorQuda(
"Initial guess not supported for two-pass solver");
3362 for(
int i=0; i < param->
num_src; i++) {
3377 auto * nb =
new double[param->
num_src];
3378 for(
int i=0; i < param->
num_src; i++) {
3380 printfQuda(
"Source %i: CPU = %g, CUDA copy = %g\n", i, nb[i], nb[i]);
3381 if (nb[i]==0.0)
errorQuda(
"Source has zero norm");
3387 printfQuda(
"Source %i: CPU = %g, CUDA copy = %g\n", i, nh_b, nb[i]);
3388 printfQuda(
"Solution %i: CPU = %g, CUDA copy = %g\n", i, nh_x, nx);
3396 for(
int i=0; i < param->
num_src; i++) {
3402 for(
int i=0; i < param->
num_src; i++) {
3410 for(
int i=0; i < param->
num_src; i++) {
3414 printfQuda(
"Prepared source %i = %g\n", i, nin);
3415 printfQuda(
"Prepared solution %i = %g\n", i, nout);
3420 printfQuda(
"Prepared source %i post mass rescale = %g\n", i, nin);
3445 if (pc_solution && !pc_solve) {
3446 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
3449 if (!mat_solution && !pc_solution && pc_solve) {
3450 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3453 if (!mat_solution && norm_error_solve) {
3454 errorQuda(
"Normal-error solve requires Mat solution");
3458 errorQuda(
"Multigrid preconditioning only supported for direct non-red-black solve");
3460 if (mat_solution && !direct_solve && !norm_error_solve) {
3461 for(
int i=0; i < param->
num_src; i++) {
3465 }
else if (!mat_solution && direct_solve) {
3466 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3470 for(
int i=0; i < param->
num_src; i++) {
3478 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3484 }
else if (!norm_error_solve) {
3485 DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3492 DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
3493 errorQuda(
"norm_error_solve not supported in multi source solve");
3504 for(
int i=0; i < param->
num_src; i++) {
3512 for(
int i=0; i< param->
num_src; i++){
3518 for(
int i=0; i< param->
num_src; i++){
3527 for(
int i=0; i< param->
num_src; i++){
3534 for(
int i=0; i< param->
num_src; i++){
3537 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3542 for(
int i=0; i < param->
num_src; i++){
3588 checkInvertParam(param, _hp_x[0], _hp_b);
3594 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
3609 errorQuda(
"For Staggered-type fermions, multi-shift solver only suports MATPC solution type");
3613 errorQuda(
"For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
3619 errorQuda(
"For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
3622 errorQuda(
"For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3624 if (pc_solution & !pc_solve) {
3625 errorQuda(
"For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
3627 if (!pc_solution & pc_solve) {
3628 errorQuda(
"For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3640 errorQuda(
"Offsets must be ordered from smallest to largest");
3666 Dirac *dSloppy =
nullptr;
3667 Dirac *dPre =
nullptr;
3668 Dirac *dRefine =
nullptr;
3671 createDirac(d, dSloppy, dPre, dRefine, *param, pc_solve);
3673 Dirac &diracSloppy = *dSloppy;
3677 std::vector<ColorSpinorField*> x;
3679 std::vector<ColorSpinorField*> p;
3680 std::unique_ptr<double[]> r2_old(
new double[param->
num_offset]);
3684 gaugeFatPrecise->
X() : gaugePrecise->
X();
3694 std::vector<ColorSpinorField*> h_x;
3699 cpuParam.
v = hp_x[i];
3717 bool invalidate =
false;
3718 for (
auto v : solutionResident)
3719 if (cudaParam.
Precision() != v->Precision()) { invalidate =
true;
break; }
3722 for (
auto v : solutionResident)
delete v;
3723 solutionResident.clear();
3727 for (
int i=solutionResident.size(); i < param->
num_offset; i++) {
3730 for (
int i=0; i < param->
num_offset; i++) x[i] = solutionResident[i];
3739 if (nb==0.0)
errorQuda(
"Source has zero norm");
3743 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3759 mSloppy =
new DiracM(diracSloppy);
3767 cg_m(x, *b, p, r2_old.get());
3782 Dirac &diracSloppy = *dRefine;
3784 #define REFINE_INCREASING_MASS 3785 #ifdef REFINE_INCREASING_MASS 3788 for(
int i=param->
num_offset-1; i >= 0; i--) {
3808 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3823 mSloppy =
new DiracM(diracSloppy);
3838 #ifdef REFINE_INCREASING_MASS 3839 const int nRefine = i+1;
3841 const int nRefine = param->
num_offset - i + 1;
3844 std::vector<ColorSpinorField*> q;
3846 std::vector<ColorSpinorField*> z;
3851 for(
int j=0; j < nRefine; j++) {
3857 #ifdef REFINE_INCREASING_MASS 3858 for (
int j=1; j<nRefine; j++) *z[j] = *x[j];
3860 for (
int j=1; j<nRefine; j++) *z[j] = *x[param->
num_offset-j];
3863 bool orthogonal =
true;
3864 bool apply_mat =
true;
3865 bool hermitian =
true;
3868 mre(*x[i], tmp, z, q);
3870 for(
int j=0; j < nRefine; j++) {
3877 solverParam.
iter = 0;
3886 cg(*x[i], *b, p[i], r2_old[i]);
3918 param->
action[0] = action.real();
3919 param->
action[1] = action.imag();
3939 for (
auto v: solutionResident)
if (v)
delete v;
3940 solutionResident.clear();
3960 for (
auto& pp : p)
delete pp;
3980 checkGaugeParam(param);
3984 const double max_error = 1e-10;
3990 svd_rel_error, svd_abs_error);
3997 gParam.
gauge = ulink;
4000 gParam.
gauge = inlink;
4028 cudaGaugeField *cudaLongLink = longlink ?
new cudaGaugeField(gParam) :
nullptr;
4031 fatLongKSLink(cudaFatLink, cudaLongLink, *cudaInLinkEx, path_coeff);
4043 if (ulink) cudaUnitarizedLink->
saveCPUField(cpuUnitarizedLink);
4050 if (longlink)
delete cudaLongLink;
4051 if (ulink)
delete cudaUnitarizedLink;
4052 delete cudaInLinkEx;
4057 errorQuda(
"Fat-link has not been built");
4058 #endif // GPU_FATLINK 4064 int volume = param.
x[0]*param.
x[1]*param.
x[2]*param.
x[3];
4066 for(
int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.
x[dir])/2;
4067 pad = *std::max_element(face_size, face_size+4);
4076 #ifdef GPU_GAUGE_FORCE 4080 checkGaugeParam(qudaGaugeParam);
4090 if (!gaugePrecise)
errorQuda(
"No resident gauge field to use");
4121 if (!momResident)
errorQuda(
"No resident momentum field to use");
4145 gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4151 gaugeForce(*force, *cudaGauge, 1.0, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4165 if (gaugePrecise && gaugePrecise != cudaSiteLink)
delete gaugePrecise;
4166 gaugePrecise = cudaSiteLink;
4168 delete cudaSiteLink;
4172 if (momResident && momResident != cudaMom)
delete momResident;
4178 if (cpuSiteLink)
delete cpuSiteLink;
4179 if (cpuMom)
delete cpuMom;
4193 errorQuda(
"Gauge force has not been built");
4194 #endif // GPU_GAUGE_FORCE 4201 if (!cloverPrecise)
errorQuda(
"Clover field not allocated");
4226 extendedGaugeResident = gauge;
4234 errorQuda(
"Only scalar and vector geometries are supported\n");
4286 gParam.
gauge = h_force;
4312 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] = gParam.
x[dir];
4322 if (!momResident)
errorQuda(
"Cannot use resident momentum field since none appears resident");
4331 errorQuda(
"Resident gauge field is required");
4334 errorQuda(
"Gauge field requires the staggered phase factors to be applied");
4339 errorQuda(
"Requested staggered phase %d, but found %d\n",
4347 std::vector<ColorSpinorField*>
X(nvector);
4351 if (solutionResident.size() < (
unsigned int)nvector)
4352 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
4353 solutionResident.size(), nvector);
4368 for (
int i=0; i<nvector; i++) {
4372 else errorQuda(
"%s requires resident solution", __func__);
4383 for (
auto v : solutionResident)
if (v)
delete solutionResident[i];
4384 solutionResident.clear();
4393 for (
int i=0; i<nvector; i++) {
4396 double coeff[2] = {inv_param->
residue[i], 0.0};
4403 applyU(cudaForce, *gaugePrecise);
4425 for (
int i=0; i<nvector; i++)
delete X[i];
4435 const double level2_coeff[6],
4436 const double fat7_coeff[6],
4437 const void*
const w_link,
4438 const void*
const v_link,
4439 const void*
const u_link,
4446 #ifdef GPU_STAGGERED_OPROD 4447 using namespace quda;
4452 checkGaugeParam(gParam);
4468 const double hisq_force_filter = 5e-5;
4469 const double max_det_error = 1e-10;
4470 const bool allow_svd =
true;
4471 const bool svd_only =
false;
4472 const double svd_rel_err = 1e-8;
4473 const double svd_abs_err = 1e-8;
4478 double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4491 param.
gauge = (
void*)w_link;
4493 param.
gauge = (
void*)v_link;
4495 param.
gauge = (
void*)u_link;
4509 for (
int dir=0; dir<4; ++dir) {
4510 param.
x[dir] += 2*R[dir];
4511 param.
r[dir] = R[dir];
4530 for (
int dir=0; dir<4; ++dir) qParam.
x[dir] = oParam.
x[dir];
4540 qParam.
v = fermion[0];
4543 GaugeField *oprod[2] = {stapleOprod, naikOprod};
4546 for(
int i=0; i<num_terms; ++i){
4550 qParam.
v = fermion[i];
4555 cudaQuark = cpuQuark;
4565 oneLinkOprod->
copy(*stapleOprod);
4566 ax(level2_coeff[0], *oneLinkOprod);
4567 GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4570 for(
int i=0; i<num_naik_terms; ++i){
4574 qParam.
v = fermion[i + num_terms - num_naik_terms];
4579 cudaQuark = cpuQuark;
4596 delete oneLinkOprod;
4634 cudaMemset((
void**)(cudaOutForce->
Gauge_p()), 0, cudaOutForce->
Bytes());
4653 if (!momResident)
errorQuda(
"No resident momentum field to use");
4666 if (cpuMom)
delete cpuMom;
4670 momResident =
nullptr;
4673 delete cudaOutForce;
4680 errorQuda(
"HISQ force has not been built");
4685 double *coeff,
double kappa2,
double ck,
4686 int nvector,
double multiplicity,
void *gauge,
4689 using namespace quda;
4693 checkGaugeParam(gauge_param);
4694 if (!gaugePrecise)
errorQuda(
"No resident gauge field");
4723 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] = fParam.
x[dir];
4730 std::vector<ColorSpinorField*> quarkX, quarkP;
4731 for (
int i=0; i<nvector; i++) {
4753 if (solutionResident.size() < (
unsigned int)nvector)
4754 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
4755 solutionResident.size(), nvector);
4767 std::vector<double> force_coeff(nvector);
4769 for(
int i=0; i<nvector; i++){
4786 x.
Even() = cpuQuarkX;
4792 x.
Even() = *(solutionResident[i]);
4804 force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
4821 std::vector< std::vector<double> > ferm_epsilon(nvector);
4822 for (
int shift = 0; shift < nvector; shift++) {
4823 ferm_epsilon[shift].reserve(2);
4824 ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
4825 ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
4837 if (u != &gaugeEx)
delete u;
4849 for (
int i=0; i<nvector; i++) {
4856 for (
auto v : solutionResident)
if (v)
delete v;
4857 solutionResident.clear();
4878 checkGaugeParam(param);
4918 if (!gaugePrecise)
errorQuda(
"No resident gauge field allocated");
4920 gaugePrecise =
nullptr;
4926 if (!momResident)
errorQuda(
"No resident mom field allocated");
4928 momResident =
nullptr;
4936 (
bool)conj_mom, (
bool)exact);
4942 cudaOutGauge->saveCPUField(*cpuGauge);
4949 gaugePrecise = cudaOutGauge;
4951 delete cudaOutGauge;
4955 if (momResident !=
nullptr && momResident != cudaMom)
delete momResident;
4962 if (cpuMom)
delete cpuMom;
4975 checkGaugeParam(param);
4992 if (!gaugePrecise)
errorQuda(
"No resident gauge field to use");
5016 if (gaugePrecise !=
nullptr && cudaGauge != gaugePrecise)
delete gaugePrecise;
5033 checkGaugeParam(param);
5048 if (!gaugePrecise)
errorQuda(
"No resident gauge field to use");
5070 if (gaugePrecise !=
nullptr && cudaGauge != gaugePrecise)
delete gaugePrecise;
5089 checkGaugeParam(param);
5111 if (!momResident)
errorQuda(
"No resident mom field allocated");
5123 if (momResident !=
nullptr && momResident != cudaMom)
delete momResident;
5127 momResident =
nullptr;
5160 {
MatQuda(h_out, h_in, inv_param); }
5175 if (!gaugePrecise)
errorQuda(
"Resident gauge field not allocated");
5182 errorQuda(
"Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");
5184 const int *X = gaugePrecise->
X();
5185 size_t cb_offset = (X[0]/2) * X[1] * (X[2] + 4) * X[3] * gaugePrecise->
Ncolor() * nSpin * 2 * param->
cpu_prec;
5187 for (
int i=0; i<param->
num_offset; i++) hp_x[i] = static_cast<char*>(h_x) + i*cb_offset;
5195 cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault);
5200 cudaHostUnregister(ptr);
5212 bool *conj_mom,
bool *exact,
5217 static inline int opp(
int dir) {
return 7-dir; }
5223 if (num_loop_types >= 1)
5224 for(
int i=0; i<4; ++i){
5225 if(i==dir)
continue;
5226 paths[
index][0] = i; paths[
index][1] =
opp(dir); paths[index++][2] =
opp(i);
5227 paths[
index][0] =
opp(i); paths[
index][1] =
opp(dir); paths[index++][2] = i;
5231 if (num_loop_types >= 2)
5232 for(
int i=0; i<4; ++i){
5233 if(i==dir)
continue;
5248 if (num_loop_types >= 3) {
5250 for(
int i=0; i<4; ++i){
5251 for(
int j=0; j<4; ++j){
5252 if(i==dir || j==dir || i==j)
continue;
5271 switch (*num_loop_types) {
5282 errorQuda(
"Invalid num_loop_types = %d\n", *num_loop_types);
5285 auto *loop_coeff =
static_cast<double*
>(
safe_malloc(numPaths*
sizeof(
double)));
5286 int *path_length =
static_cast<int*
>(
safe_malloc(numPaths*
sizeof(
int)));
5288 if (*num_loop_types >= 1)
for(
int i= 0; i< 6; ++i) {
5289 loop_coeff[i] = coeff[0];
5292 if (*num_loop_types >= 2)
for(
int i= 6; i<24; ++i) {
5293 loop_coeff[i] = coeff[1];
5296 if (*num_loop_types >= 3)
for(
int i=24; i<48; ++i) {
5297 loop_coeff[i] = coeff[2];
5301 int** input_path_buf[4];
5302 for(
int dir=0; dir<4; ++dir){
5303 input_path_buf[dir] =
static_cast<int**
>(
safe_malloc(numPaths*
sizeof(
int*)));
5304 for(
int i=0; i<numPaths; ++i){
5305 input_path_buf[dir][i] =
static_cast<int*
>(
safe_malloc(path_length[i]*
sizeof(
int)));
5312 computeGaugeForceQuda(mom, gauge, input_path_buf, path_length, loop_coeff, numPaths, max_length, *dt, param);
5314 for(
auto & dir : input_path_buf){
5315 for(
int i=0; i<numPaths; ++i)
host_free(dir[i]);
5358 static int bqcd_rank_from_coords(
const int *coords,
void *fdata)
5360 int *
dims =
static_cast<int *
>(fdata);
5362 int rank = coords[3];
5363 for (
int i = 2; i >= 0; i--) {
5364 rank = dims[i] * rank + coords[i];
5382 bool pack_ = *pack ? true :
false;
5390 if (!gaugePrecise)
errorQuda(
"Cannot generate Gauss GaugeField as there is no resident gauge field");
5403 if (extendedGaugeResident) {
5423 if (!gaugePrecise)
errorQuda(
"Cannot compute plaquette as there is no resident gauge field");
5426 extendedGaugeResident = data;
5445 if (!gaugePrecise)
errorQuda(
"Cannot perform deep copy of resident gauge field as there is no resident gauge field");
5448 extendedGaugeResident = data;
5461 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field must be loaded");
5468 if (gaugeSmeared !=
nullptr) {
5477 printfQuda(
"Wuppertal smearing done with gaugePrecise\n");
5497 for (
unsigned int i=0; i<nSteps; i++) {
5502 printfQuda(
"Step %d, vector norm %e\n", i, norm);
5514 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
5517 if (gaugeSmeared !=
nullptr)
5532 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field must be loaded");
5540 double3 plaq =
plaquette(*gaugeSmeared);
5542 printfQuda(
"Plaquette after 0 APE steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5545 for (
unsigned int i=0; i<nSteps; i++) {
5546 cudaGaugeTemp->copy(*gaugeSmeared);
5548 APEStep(*gaugeSmeared, *cudaGaugeTemp, alpha);
5551 delete cudaGaugeTemp;
5557 printfQuda(
"Plaquette after %d APE steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5567 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field must be loaded");
5575 double3 plaq =
plaquette(*gaugeSmeared);
5577 printfQuda(
"Plaquette after 0 STOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5580 for (
unsigned int i=0; i<nSteps; i++) {
5581 cudaGaugeTemp->copy(*gaugeSmeared);
5583 STOUTStep(*gaugeSmeared, *cudaGaugeTemp, rho);
5586 delete cudaGaugeTemp;
5592 printfQuda(
"Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5602 if (gaugePrecise ==
nullptr)
errorQuda(
"Gauge field must be loaded");
5610 double3 plaq =
plaquette(*gaugeSmeared);
5612 printfQuda(
"Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plaq.x, plaq.y, plaq.z);
5615 for (
unsigned int i=0; i<nSteps; i++) {
5616 cudaGaugeTemp->copy(*gaugeSmeared);
5621 delete cudaGaugeTemp;
5627 printfQuda(
"Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plaq.x, plaq.y, plaq.z);
5635 const unsigned int verbose_interval,
const double relax_boost,
const double tolerance,
const unsigned int reunit_interval, \
5636 const unsigned int stopWtheta,
QudaGaugeParam* param ,
double* timeinfo)
5641 checkGaugeParam(param);
5661 cudaInGauge->loadCPUField(*
cpuGauge);
5675 gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5676 reunit_interval, stopWtheta);
5683 gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5684 reunit_interval, stopWtheta);
5694 cudaInGauge->saveCPUField(*
cpuGauge);
5701 gaugePrecise = cudaInGauge;
5717 const unsigned int verbose_interval,
const double alpha,
const unsigned int autotune,
const double tolerance, \
5718 const unsigned int stopWtheta,
QudaGaugeParam* param ,
double* timeinfo)
5723 checkGaugeParam(param);
5745 cudaInGauge->loadCPUField(*
cpuGauge);
5759 gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5767 cudaInGauge->saveCPUField(*
cpuGauge);
5775 gaugePrecise = cudaInGauge;
5802 cpuParam.
v = (
void *)hp_y;
5814 std::vector<ColorSpinorField *> x, y;
5818 size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
5832 qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
5851 if (!gaugeSmeared) {
5885 if (!gaugeSmeared) {
5910 qudaMemcpy(h_qDensity, d_qDensity, size, cudaMemcpyDeviceToHost);
void new_quda_invert_param_(QudaInvertParam *param)
cudaColorSpinorField * tmp2
QudaCloverFieldOrder order
static QudaGaugeParam qudaGaugeParam
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge)
Compute the Fmunu tensor.
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
static bool reunit_allow_svd
double iter_res_offset[QUDA_MAX_MULTI_SHIFT]
void copyExtendedResidentGaugeQuda(void *resident_gauge, QudaFieldLocation loc)
void ax(double a, ColorSpinorField &x)
QudaDiracFieldOrder dirac_order
QudaMassNormalization mass_normalization
#define qudaMemcpy(dst, src, count, kind)
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
QudaReconstructType reconstruct_sloppy
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaGhostExchange ghostExchange
void freeCloverQuda(void)
void dumpNullVectors() const
Dump the null-space vectors to disk. Will recurse dumping all levels.
static bool do_not_profile_quda
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda")
Profiler for computeHISQForceQuda.
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
cudaColorSpinorField * tmp1
int commDimPartitioned(int dir)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
QudaFieldLocation clover_location
double computeQCharge(const GaugeField &Fmunu)
Compute the topological charge.
enum QudaPrecision_s QudaPrecision
void destroy()
Destroy the CUBLAS context.
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo...
static TimeProfile profileFatLink("computeKSLinkQuda")
Profiler for computeGaugeForceQuda.
double_complex c_5[QUDA_MAX_DWF_LS]
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *h_force, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
double momActionQuda(void *momentum, QudaGaugeParam *param)
void * V(bool inverse=false)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Multi-Shift Conjugate Gradient Solver.
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda")
Profiler for createExtendedGaugeField.
double shift
Shift term added onto operator (M/M^dag M/M M^dag + shift)
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
QudaInverterType inv_type_precondition
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
QudaVerbosity getVerbosity()
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
QudaPrecision cuda_prec_ritz
static TimeProfile profileQCharge("qChargeQuda")
Profiler for APEQuda.
void loadSloppyCloverQuda(const QudaPrecision prec[])
double norm2(const ColorSpinorField &a)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
QudaDslashType dslash_type
QudaReconstructType reconstruct_precondition
QudaInverterType inv_type
Fortran interface functions.
int return_clover_inverse
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
void performSTOUTnStep(unsigned int nSteps, double rho)
__host__ __device__ ValueType sqrt(ValueType x)
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
Apply STOUT smearing to the gauge field.
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void setOutputPrefix(const char *prefix)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void plaq_quda_(double plaq[3])
static TimeProfile profileMulti("invertMultiShiftQuda")
Profiler for eigensolveQuda.
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda")
Profiler for projectSU3Quda.
uint64_t checksum(bool mini=false) const
static bool enable_profiler
cudaColorSpinorField * tmp
cudaGaugeField * gaugeLongPrecise
const ColorSpinorField & Even() const
void saveTuneCache(bool error=false)
static TimeProfile profileGaugeForce("computeGaugeForceQuda")
Profiler for updateGaugeFieldQuda.
const ColorSpinorField & Odd() const
static ColorSpinorField * Create(const ColorSpinorParam ¶m)
static TimeProfile profileAPE("APEQuda")
Profiler for STOUTQuda.
QudaStaggeredPhase staggered_phase_type
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
cudaGaugeField * gaugeFatPrecondition
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.
#define QUDA_VERSION_MINOR
__host__ __device__ void copy(T1 &a, const T2 &b)
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
void setMPICommHandleQuda(void *mycomm)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
static TimeProfile profilePlaq("plaqQuda")
Profiler for wuppertalQuda.
void free_clover_quda_(void)
QudaGaugeParam gauge_param
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
QudaFieldGeometry Geometry() const
static cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
QudaPrecision cuda_prec_refinement_sloppy
void massRescale(cudaColorSpinorField &b, QudaInvertParam ¶m)
ColorSpinorField & Component(const int idx) const
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
QudaPrecision clover_cuda_prec_refinement_sloppy
void destroyDeflationQuda(void *df)
QudaGaugeFieldOrder gauge_order
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
double computeQChargeDensity(const GaugeField &Fmunu, void *result)
Compute the topological charge density per lattice site.
static TimeProfile profileInit2End("initQuda-endQuda", false)
Complex c_5[QUDA_MAX_DWF_LS]
bool forceMonitor()
Whether we are monitoring the force or not.
cudaGaugeField * gaugeFatRefinement
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
cudaGaugeField * gaugeLongRefinement
void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Dump the null-space vectors to disk.
void reset(bool refresh=false)
This method resets the solver, e.g., when a parameter has changed such as the mass.
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
int make_resident_solution
void computeHISQForceQuda(void *const milc_momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
QudaBoolean setup_minimize_memory
static TimeProfile profileMomAction("momActionQuda")
Profiler for endQuda.
double qChargeDensityQuda(void *h_qDensity)
Calculates the topological charge from gaugeSmeared, if it exist, or from gaugePrecise if no smeared ...
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaSiteSubset siteSubset
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
QudaPrecision clover_cuda_prec_sloppy
cudaGaugeField * gaugeLongExtended
QudaPrecision chrono_precision
void cloverInvert(CloverField &clover, bool computeTraceLog)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
QudaFieldLocation input_location
void destroyGaugeFieldQuda(void *gauge)
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action...
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
cudaCloverField * cloverPrecondition
QudaUseInitGuess use_init_guess
DeflationParam * deflParam
int n_vec[QUDA_MAX_MG_LEVEL]
void init_quda_(int *dev)
void flush_pinned()
Free all outstanding pinned-memory allocations.
int getGaugePadding(GaugeFieldParam ¶m)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
Apply APE smearing to the gauge field.
double_complex b_5[QUDA_MAX_DWF_LS]
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi.cpp.
QudaSolutionType solution_type
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
QudaMemoryType mem_type_ritz
QudaSolverNormalization solver_normalization
static double svd_rel_error
QudaPrecision clover_cuda_prec
std::vector< cudaColorSpinorField * > solutionResident
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
bool is_composite
for deflation solvers:
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
double Last(QudaProfileType idx)
QudaInvertParam * invert_param
Complex b_5[QUDA_MAX_DWF_LS]
cudaGaugeField * gaugeFatSloppy
cudaDeviceProp deviceProp
#define qudaDeviceSynchronize()
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
void init()
Initialize the memory pool allocator.
QudaFieldLocation output_location
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
QudaPrecision clover_cuda_prec_precondition
QudaFieldLocation location
QudaInvertParam inv_param
int setNumaAffinityNVML(int deviceid)
bool canReuseResidentGauge(QudaInvertParam *inv_param)
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
void freeSloppyGaugeQuda()
void updateInvertParam(QudaInvertParam ¶m, int offset=-1)
static GaugeField * Create(const GaugeFieldParam ¶m)
Create the gauge field, with meta data specified in the parameter struct.
cpuGaugeField * cpuFatLink
QudaFieldOrder fieldOrder
void arpack_solve(std::vector< ColorSpinorField *> &h_evecs, std::vector< Complex > &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
The QUDA interface function. One passes two allocated arrays to hold the the eigenmode data...
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity, const double &kappa5) const
bool StaggeredPhaseApplied() const
void gaussGaugeQuda(unsigned long long seed, double sigma)
Generate Gaussian distributed fields and store in the resident gauge field. We create a Gaussian-dist...
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
cudaGaugeField * gaugeRefinement
void flushProfile()
Flush profile contents, setting all counts to zero.
QudaPrecision cuda_prec_sloppy
static bool initialized
Profiler for initQuda.
void Clover(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
cudaCloverField * cloverSloppy
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
static bool reunit_svd_only
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const =0
double tol_offset[QUDA_MAX_MULTI_SHIFT]
cudaCloverField * cloverRefinement
int commDim[QUDA_MAX_DIM]
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileSTOUT("STOUTQuda")
Profiler for OvrImpSTOUTQuda.
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
static const std::string quda_version
void setPrecision(QudaPrecision precision)
void flush_device()
Free all outstanding device-memory allocations.
void Dagger(QudaDagType dag) const
static Solver * create(SolverParam ¶m, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
QudaPrecision cuda_prec_precondition
void free_sloppy_gauge_quda_()
QudaSiteSubset SiteSubset() const
QudaCloverFieldOrder clover_order
void createDslashEvents()
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
double Anisotropy() const
QudaGammaBasis gammaBasis
#define pool_device_malloc(size)
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
static int lex_rank_from_coords(const int *coords, void *fdata)
void freeSloppyCloverQuda()
QudaGaugeFieldOrder order
virtual void blocksolve(ColorSpinorField &out, ColorSpinorField &in)
void flushForceMonitor()
Flush any outstanding force monitoring information.
void performAPEnStep(unsigned int nSteps, double alpha)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
static TimeProfile profileClover("loadCloverQuda")
Profiler for dslashQuda.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
#define QUDA_VERSION_SUBMINOR
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
QudaPrecision cuda_prec_sloppy
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
static bool invalidate_clover
void Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
static double svd_abs_error
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
void comm_set_gridsize_(int *grid)
std::complex< double > Complex
double offset[QUDA_MAX_MULTI_SHIFT]
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
cudaGaugeField * gaugeFatPrecise
int(* QudaCommsMap)(const int *coords, void *fdata)
static TimeProfile profileDslash("dslashQuda")
Profiler for invertQuda.
void saveProfile(const std::string label="")
Save profile to disk.
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
static TimeProfile profileGauge("loadGaugeQuda")
Profile for loadCloverQuda.
cudaCloverField * cloverPrecise
enum QudaParity_s QudaParity
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
QudaReconstructType reconstruct
void applyU(GaugeField &force, GaugeField &U)
static bool comms_initialized
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
Apply Over Improved STOUT smearing to the gauge field.
static int * num_failures_h
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void init()
Create the CUBLAS context.
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
QudaFieldLocation location
static void freeGhostBuffer(void)
static TimeProfile profileGauss("gaussQuda")
Profiler for plaqQuda.
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileProject("projectSU3Quda")
Profiler for staggeredPhaseQuda.
QudaPrecision halo_precision
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
void zero(ColorSpinorField &a)
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
QudaPrecision cuda_prec_refinement_sloppy
static void init_default_comms()
void setMass(double mass)
void pushVerbosity(QudaVerbosity verbosity)
Push a new verbosity onto the stack.
static int * num_failures_d
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
int compute_clover_inverse
#define checkCudaErrorNoSync()
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
static int index(int ndim, const int *dims, const int *x)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
static void profilerStop(const char *f)
enum QudaFieldLocation_s QudaFieldLocation
QudaFieldLocation location
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
static TimeProfile profileInvert("invertQuda")
Profiler for invertMultiShiftQuda.
cpuColorSpinorField * out
static TimeProfile profilePhase("staggeredPhaseQuda")
Profiler for contractions.
QudaPrecision cuda_prec_precondition
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda")
Profiler for total time spent between init and end.
cudaGaugeField * gaugePrecondition
bool twisted
Clover coefficient.
static TimeProfile profileCovDev("covDevQuda")
Profiler for momentum action.
void printQudaEigParam(QudaEigParam *param)
__device__ __host__ void pack(Arg &arg, int ghost_idx, int s, int parity)
Conjugate-Gradient Solver.
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType)
This computes the optimum guess for the system Ax=b in the L2 residual norm. For use in the HMD force...
static TimeProfile profileEigensolve("eigensolveQuda")
Profiler for computeFatLinkQuda.
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static double unitarize_eps
static TimeProfile profileInit("initQuda")
Profile for loadGaugeQuda / saveGaugeQuda.
static bool redundant_comms
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
std::vector< ColorSpinorField * > B
void set_kernel_pack_t_(int *pack)
Temporary function exposed for TIFR benchmarking
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
QudaPrecision Precision() const
void * newDeflationQuda(QudaEigParam *eig_param)
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void printQudaMultigridParam(QudaMultigridParam *param)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void new_quda_gauge_param_(QudaGaugeParam *param)
cudaGaugeField * fatGauge
QudaTwistFlavorType twist_flavor
static void profilerStart(const char *f)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
cudaGaugeField * gaugeSmeared
static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda")
double residue[QUDA_MAX_MULTI_SHIFT]
cudaGaugeField * cudaGauge
static TimeProfile profileExtendedGauge("createExtendedGaugeField")
Profiler for computeCloverForceQuda.
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
#define device_malloc(size)
QudaReconstructType reconstruct
void setKernelPackT(bool pack)
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
enum QudaContractType_s QudaContractType
static TimeProfile profileEnd("endQuda")
Profiler for GaugeFixing.
QudaReconstructType Reconstruct() const
QudaResidualType residual_type
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
void flushChronoQuda(int i)
Flush the chronological history for the given index.
cudaGaugeField * longGauge
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void popVerbosity()
Pop the verbosity restoring the prior one on the stack.
enum QudaVerbosity_s QudaVerbosity
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
#define pool_device_free(ptr)
void createCloverQuda(QudaInvertParam *invertParam)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
QudaGaugeFieldOrder Order() const
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
bool compute_fat_link_max
QudaFieldGeometry geometry
static void PrintGlobal()
void setOutputFile(FILE *outfile)
cudaGaugeField * gaugePrecise
QudaTboundary TBoundary() const
QudaStaggeredPhase StaggeredPhase() const
#define mapped_malloc(size)
int use_resident_solution
static Dirac * create(const DiracParam ¶m)
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT]
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the momentum field.
void updateR()
update the radius for halos.
static TimeProfile profileWuppertal("wuppertalQuda")
Profiler for gaussQuda.
cudaGaugeField * cudaForce
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
static TimeProfile profileContract("contractQuda")
Profiler for covariant derivative.
cudaGaugeField * gaugeFatExtended
void copy(const GaugeField &src)
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void CloverInv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const =0
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
std::vector< std::vector< ColorSpinorField * > > chronoResident(QUDA_MAX_CHRONO)
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
QudaDslashType dslash_type_precondition
QudaPrecision clover_cpu_prec
QudaPrecision cuda_prec_ritz
static TimeProfile profileHISQForce("computeHISQForceQuda")
Profiler for plaqQuda.
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the environ...
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void destroyDslashEvents()
void gaugeGauss(GaugeField &U, RNG &rngstate, double epsilon)
Generate Gaussian distributed su(N) or SU(N) fields. If U is a momentum field, then we generate rando...
void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
void setVerbosity(QudaVerbosity verbosity)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
#define QUDA_VERSION_MAJOR
cudaGaugeField * momResident
void applyStaggeredPhase(QudaStaggeredPhase phase=QUDA_STAGGERED_PHASE_INVALID)
static TimeProfile profileCloverForce("computeCloverForceQuda")
Profiler for computeStaggeredForceQuda.
double3 plaquette(const GaugeField &U)
Compute the plaquette of the gauge field.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types)
void * newMultigridQuda(QudaMultigridParam *mg_param)
void plaqQuda(double plaq[3])
QudaReconstructType reconstruct_refinement_sloppy
cudaGaugeField * gaugeSloppy
int comm_dim_partitioned(int dim)
void initQudaDevice(int dev)
QudaGaugeParam newQudaGaugeParam(void)
QudaInvertParam * invert_param
void checkClover(QudaInvertParam *param)
double reliable_delta_refinement
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()