39 #ifdef GPU_GAUGE_FORCE
44 #define MAX(a,b) ((a)>(b)? (a):(b))
45 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
69 static int R[4] = {0, 0, 0, 0};
71 static bool redundant_comms =
false;
78 static bool InitMagma =
false;
86 printfQuda(
"\nMAGMA library was already initialized..\n");
97 printfQuda(
"\nMAGMA library was not initialized..\n");
137 #define QUDA_MAX_CHRONO 12
142 static int *num_failures_h =
nullptr;
143 static int *num_failures_d =
nullptr;
145 static bool initialized =
false;
154 static TimeProfile profileClover(
"loadCloverQuda");
163 static TimeProfile profileInvertMultiSrc(
"invertMultiSrcQuda");
166 static TimeProfile profileMulti(
"invertMultiShiftQuda");
169 static TimeProfile profileEigensolve(
"eigensolveQuda");
172 static TimeProfile profileFatLink(
"computeKSLinkQuda");
175 static TimeProfile profileGaugeForce(
"computeGaugeForceQuda");
178 static TimeProfile profileGaugeUpdate(
"updateGaugeFieldQuda");
181 static TimeProfile profileExtendedGauge(
"createExtendedGaugeField");
184 static TimeProfile profileCloverForce(
"computeCloverForceQuda");
187 static TimeProfile profileStaggeredForce(
"computeStaggeredForceQuda");
190 static TimeProfile profileHISQForce(
"computeHISQForceQuda");
196 static TimeProfile profileWuppertal(
"wuppertalQuda");
202 static TimeProfile profileGaugeObs(
"gaugeObservablesQuda");
211 static TimeProfile profileOvrImpSTOUT(
"OvrImpSTOUTQuda");
217 static TimeProfile profileProject(
"projectSU3Quda");
220 static TimeProfile profilePhase(
"staggeredPhaseQuda");
223 static TimeProfile profileContract(
"contractQuda");
233 static TimeProfile profileMomAction(
"momActionQuda");
239 static TimeProfile GaugeFixFFTQuda(
"GaugeFixFFTQuda");
240 static TimeProfile GaugeFixOVRQuda(
"GaugeFixOVRQuda");
243 static TimeProfile profileInit2End(
"initQuda-endQuda",
false);
245 static bool enable_profiler =
false;
246 static bool do_not_profile_quda =
false;
248 static void profilerStart(
const char *f)
250 static std::vector<int> target_list;
251 static bool enable =
false;
252 static bool init =
false;
254 char *profile_target_env = getenv(
"QUDA_ENABLE_TARGET_PROFILE");
256 if ( profile_target_env ) {
257 std::stringstream target_stream(profile_target_env);
260 while(target_stream >> target) {
261 target_list.push_back(target);
262 if (target_stream.peek() ==
',') target_stream.ignore();
265 if (target_list.size() > 0) {
266 std::sort(target_list.begin(), target_list.end());
267 target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() );
268 warningQuda(
"Targeted profiling enabled for %lu functions\n", target_list.size());
273 char* donotprofile_env = getenv(
"QUDA_DO_NOT_PROFILE");
274 if (donotprofile_env && (!(strcmp(donotprofile_env,
"0") == 0))) {
275 do_not_profile_quda=
true;
281 static int target_count = 0;
282 static unsigned int i = 0;
283 if (do_not_profile_quda){
288 if (i < target_list.size() && target_count++ == target_list[i]) {
289 enable_profiler =
true;
298 static void profilerStop(
const char *f) {
299 if (do_not_profile_quda) {
303 if (enable_profiler) {
306 enable_profiler =
false;
332 static int lex_rank_from_coords(
const int *coords,
void *fdata)
336 int rank = coords[0];
337 for (
int i = 1; i < md->ndim; i++) {
338 rank = md->
dims[i] * rank + coords[i];
347 static int qmp_rank_from_coords(
const int *coords,
void *fdata)
349 return QMP_get_node_number_from(coords);
356 #if defined(QMP_COMMS) || defined(MPI_COMMS)
357 MPI_Comm MPI_COMM_HANDLE_USER;
358 static bool user_set_comm_handle =
false;
363 #if defined(QMP_COMMS) || defined(MPI_COMMS)
364 MPI_COMM_HANDLE_USER = *((MPI_Comm *)mycomm);
365 user_set_comm_handle =
true;
369 static bool comms_initialized =
false;
373 if (comms_initialized)
return;
376 errorQuda(
"Number of communication grid dimensions must be 4");
383 if (QMP_logical_topology_is_declared()) {
384 if (QMP_get_logical_number_of_dimensions() != 4) {
385 errorQuda(
"QMP logical topology must have 4 dimensions");
387 for (
int i=0; i<nDim; i++) {
388 int qdim = QMP_get_logical_dimensions()[i];
389 if(qdim != dims[i]) {
390 errorQuda(
"QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
394 func = qmp_rank_from_coords;
396 warningQuda(
"QMP logical topology is undeclared; using default lexicographical ordering");
399 map_data.
ndim = nDim;
400 for (
int i=0; i<nDim; i++) {
401 map_data.
dims[i] = dims[i];
403 fdata = (
void *) &map_data;
404 func = lex_rank_from_coords;
412 #if defined(QMP_COMMS) || defined(MPI_COMMS)
413 comm_init(nDim, dims, func, fdata, user_set_comm_handle, (
void *)&MPI_COMM_HANDLE_USER);
418 comms_initialized =
true;
422 static void init_default_comms()
424 #if defined(QMP_COMMS)
425 if (QMP_logical_topology_is_declared()) {
426 int ndim = QMP_get_logical_number_of_dimensions();
427 const int *dims = QMP_get_logical_dimensions();
430 errorQuda(
"initQuda() called without prior call to initCommsGridQuda(),"
431 " and QMP logical topology has not been declared");
433 #elif defined(MPI_COMMS)
434 errorQuda(
"When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
436 const int dims[4] = {1, 1, 1, 1};
443 #define STR(x) STR_(x)
456 if (initialized)
return;
473 if (!comms_initialized) {
474 errorQuda(
"initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
479 if (dev < 0 || dev >= 16)
errorQuda(
"Invalid device number %d", dev);
485 char *reorder_str = getenv(
"QUDA_REORDER_LOCATION");
487 if (!reorder_str || (strcmp(reorder_str,
"CPU") && strcmp(reorder_str,
"cpu")) ) {
488 warningQuda(
"Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
491 warningQuda(
"Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
508 if (!comms_initialized) init_default_comms();
522 num_failures_h =
static_cast<int *
>(
mapped_malloc(
sizeof(
int)));
539 if (!comms_initialized) init_default_comms();
551 static bool invalidate_clover =
true;
557 if (!initialized)
errorQuda(
"QUDA not initialized");
560 checkGaugeParam(
param);
572 static size_t checksum = SIZE_MAX;
573 size_t in_checksum = in->
checksum(
true);
574 if (in_checksum == checksum) {
576 printfQuda(
"Gauge field unchanged - using cached gauge field %lu\n", checksum);
580 invalidate_clover =
false;
583 checksum = in_checksum;
584 invalidate_clover =
true;
699 sloppy->
copy(*precise);
707 precondition = precise;
710 precondition = sloppy;
713 precondition->
copy(*precise);
725 refinement->
copy(*sloppy);
733 eigensolver = precise;
736 eigensolver = precondition;
739 eigensolver = sloppy;
742 eigensolver->
copy(*precise);
814 if (!initialized)
errorQuda(
"QUDA not initialized");
815 checkGaugeParam(
param);
834 default:
errorQuda(
"Invalid gauge type");
855 bool device_calc =
false;
860 if (!initialized)
errorQuda(
"QUDA not initialized");
889 if (!h_clover && !pc_solve && !pc_solution) {
894 if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
901 clover_param.
nDim = 4;
910 clover_param.
twisted = twisted;
916 clover_param.
norm =
nullptr;
917 clover_param.
invNorm =
nullptr;
919 clover_param.
direct = h_clover || device_calc ? true :
false;
925 bool clover_update =
false;
947 inParam.
direct = h_clover ? true :
false;
948 inParam.
inverse = h_clovinv ? true :
false;
949 inParam.
clover = h_clover;
985 clover_param.
direct =
true;
996 if (!h_clover && !h_clovinv)
errorQuda(
"Requested clover field return but no clover host pointers set");
1022 hack->
copy(*hackOfTheHack);
1023 delete hackOfTheHack;
1030 qudaMemcpy((
char*)(in->
V(
false)), (
char*)(hack->
V(
false)), in->
Bytes(), cudaMemcpyDeviceToHost);
1033 qudaMemcpy((
char*)(in->
V(
true)), (
char*)(hack->
V(
true)), in->
Bytes(), cudaMemcpyDeviceToHost);
1062 clover_param.
direct =
true;
1065 clover_param.
direct =
false;
1120 if (!initialized)
errorQuda(
"QUDA not initialized");
1192 if (!initialized)
errorQuda(
"QUDA not initialized");
1430 if (!initialized)
errorQuda(
"QUDA not initialized");
1455 if (!initialized)
errorQuda(
"QUDA not initialized");
1468 for (
auto v : basis) {
1478 if (!initialized)
return;
1501 num_failures_h =
nullptr;
1502 num_failures_d =
nullptr;
1512 initialized =
false;
1515 comms_initialized =
false;
1522 profileInit.
Print();
1523 profileGauge.
Print();
1524 profileClover.
Print();
1525 profileDslash.
Print();
1526 profileInvert.
Print();
1527 profileInvertMultiSrc.
Print();
1528 profileMulti.
Print();
1529 profileEigensolve.
Print();
1530 profileFatLink.
Print();
1531 profileGaugeForce.
Print();
1532 profileGaugeUpdate.
Print();
1533 profileExtendedGauge.
Print();
1534 profileCloverForce.
Print();
1535 profileStaggeredForce.
Print();
1536 profileHISQForce.
Print();
1537 profileContract.
Print();
1538 profileBLAS.
Print();
1539 profileCovDev.
Print();
1540 profilePlaq.
Print();
1541 profileGaugeObs.
Print();
1543 profileSTOUT.
Print();
1544 profileOvrImpSTOUT.
Print();
1545 profileWFlow.
Print();
1546 profileProject.
Print();
1547 profilePhase.
Print();
1548 profileMomAction.
Print();
1551 profileInit2End.
Print();
1601 if (
sizeof(
Complex) !=
sizeof(
double _Complex)) {
1602 errorQuda(
"Irreconcilable difference between interface and internal complex number conventions");
1617 if (
sizeof(
Complex) !=
sizeof(
double _Complex)) {
1618 errorQuda(
"Irreconcilable difference between interface and internal complex number conventions");
1624 for (
int i = 0; i < diracParam.
Ls; i++) {
1625 printfQuda(
"fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.
b_5[i].real(),
1626 diracParam.
b_5[i].imag(), i, diracParam.
c_5[i].real(), diracParam.
c_5[i].imag());
1681 for (
int i=0; i<4; i++) diracParam.
commDim[i] = 1;
1698 for (
int i=0; i<4; i++) {
1716 for (
int i=0; i<4; i++) {
1741 for (
int i=0; i<4; i++) {
1742 diracParam.
commDim[i] = comms ? 1 : 0;
1773 for (
int i = 0; i < 4; i++) { diracParam.
commDim[i] = comms ? 1 : 0; }
1805 const bool pc_solve)
1826 const bool pc_solve)
1857 printfQuda(
"Mass rescale: mass normalization: %d\n",
param.mass_normalization);
1859 printfQuda(
"Mass rescale: norm of source in = %g\n", nin);
1864 switch (
param.solution_type) {
1881 switch (
param.solution_type) {
1887 for (
int i = 0; i <
param.num_offset; i++)
param.offset[i] *= 2.0 *
kappa;
1906 for (
int i = 0; i <
param.num_offset; i++)
param.offset[i] *= 2.0 *
kappa;
1921 errorQuda(
"Solution type %d not supported",
param.solution_type);
1927 printfQuda(
"Mass rescale: mass normalization: %d\n",
param.mass_normalization);
1929 printfQuda(
"Mass rescale: norm of source out = %g\n", nin);
1945 errorQuda(
"Clover field not allocated");
2015 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2039 errorQuda(
"Clover field not allocated");
2089 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2109 errorQuda(
"Clover field not allocated");
2162 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2293 if (!initialized)
errorQuda(
"QUDA not initialized");
2300 errorQuda(
"Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2345 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2370 if (!initialized)
errorQuda(
"QUDA not initialized");
2379 checkEigParam(eig_param);
2392 Dirac *dSloppy =
nullptr;
2393 Dirac *dPre =
nullptr;
2405 std::vector<ColorSpinorField *> host_evecs_;
2406 for (
int i = 0; i < eig_param->
n_conv; i++) {
2407 cpuParam.
v = host_evecs[i];
2418 std::vector<Complex> evals(eig_param->
n_conv, 0.0);
2419 std::vector<ColorSpinorField *> kSpace;
2428 errorQuda(
"Cannot compute imaginary spectra with a hermitian operator");
2434 errorQuda(
"gamma5 premultiplication is only supported for M type operators: dag = %s, normop = %s",
2444 arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2447 (*eig_solve)(kSpace, evals);
2453 arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2456 (*eig_solve)(kSpace, evals);
2462 arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2465 (*eig_solve)(kSpace, evals);
2471 arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2474 (*eig_solve)(kSpace, evals);
2480 arpack_solve(host_evecs_, evals, m, eig_param, profileEigensolve);
2483 (*eig_solve)(kSpace, evals);
2487 errorQuda(
"Invalid use_norm_op and dagger combination");
2491 for (
int i = 0; i < eig_param->
n_conv; i++) { memcpy(host_evals + i, &evals[i],
sizeof(
Complex)); }
2498 for (
int i = 0; i < eig_param->
n_conv; i++) *host_evecs_[i] = *kSpace[i];
2503 for (
int i = 0; i < eig_param->
n_conv; i++)
delete host_evecs_[i];
2507 for (
int i = 0; i < eig_param->
n_conv; i++)
delete kSpace[i];
2519 : profile(profile) {
2525 checkMultigridParam(&mg_param);
2531 for (
int i=0; i<mg_param.
n_level; i++) {
2536 errorQuda(
"Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2583 B.resize(mg_param.
n_vec[0]);
2590 csParam.
v = (
void *)std::numeric_limits<uint64_t>::max();
2591 csParam.
norm = (
void *)std::numeric_limits<uint64_t>::max();
2608 profilerStart(__func__);
2620 profilerStop(__func__);
2621 return static_cast<void*
>(mg);
2630 profilerStart(__func__);
2638 checkMultigridParam(mg_param);
2656 mg->d->setMass(
param->mass);
2659 mg->dSmooth->setMass(
param->mass);
2661 if (mg->dSmoothSloppy != mg->dSmooth) {
2668 mg->dSmoothSloppy->setMass(
param->mass);
2678 if (mg->m)
delete mg->m;
2679 if (mg->mSmooth)
delete mg->mSmooth;
2680 if (mg->mSmoothSloppy)
delete mg->mSmoothSloppy;
2682 if (mg->d)
delete mg->d;
2683 if (mg->dSmooth)
delete mg->dSmooth;
2684 if (mg->dSmoothSloppy && mg->dSmoothSloppy != mg->dSmooth)
delete mg->dSmoothSloppy;
2692 mg->m =
new DiracM(*(mg->d));
2700 mg->mSmooth =
new DiracM(*(mg->dSmooth));
2707 mg->mSmoothSloppy =
new DiracM(*(mg->dSmoothSloppy));
2709 mg->mgParam->matResidual = mg->m;
2710 mg->mgParam->matSmooth = mg->mSmooth;
2711 mg->mgParam->matSmoothSloppy = mg->mSmoothSloppy;
2713 mg->mgParam->updateInvertParam(*
param);
2714 if (mg->mgParam->mg_global.invert_param !=
param) mg->mgParam->mg_global.invert_param =
param;
2716 bool refresh =
true;
2717 mg->mg->reset(refresh);
2730 profilerStop(__func__);
2735 profilerStart(__func__);
2740 checkMultigridParam(mg_param);
2743 mg->mg->dumpNullVectors();
2747 profilerStop(__func__);
2751 : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2779 ritzParam.is_composite =
true;
2780 ritzParam.is_component =
false;
2781 ritzParam.composite_dim =
param->n_ev *
param->deflation_grid;
2782 ritzParam.setPrecision(
param->cuda_prec_ritz);
2785 ritzParam.setPrecision(
param->cuda_prec_ritz,
param->cuda_prec_ritz,
true);
2796 for(
int d = 0;
d < ritzParam.nDim;
d++) ritzVolume *= ritzParam.x[
d];
2800 size_t byte_estimate = (size_t)ritzParam.composite_dim*(
size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision());
2801 printfQuda(
"allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision());
2827 return static_cast<void*
>(defl);
2839 profilerStart(__func__);
2847 if (!initialized)
errorQuda(
"QUDA not initialized");
2852 checkInvertParam(
param, hp_x, hp_b);
2877 Dirac *dSloppy =
nullptr;
2878 Dirac *dPre =
nullptr;
2879 Dirac *dEig =
nullptr;
2886 Dirac &diracSloppy = *dSloppy;
2887 Dirac &diracPre = *dPre;
2888 Dirac &diracEig = *dEig;
2913 bool invalidate =
false;
2914 if (
param->use_resident_solution == 1) {
2916 if (b->
Precision() != v->Precision() || b->
SiteSubset() != v->SiteSubset()) { invalidate =
true;
break; }
2937 errorQuda(
"Initial guess not supported for two-pass solver");
2958 if (nb==0.0)
errorQuda(
"Source has zero norm");
2962 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2966 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2984 printfQuda(
"Prepared solution = %g\n", nout);
2989 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
3013 if (pc_solution && !pc_solve) {
3014 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
3017 if (!mat_solution && !pc_solution && pc_solve) {
3018 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3021 if (!mat_solution && norm_error_solve) {
3022 errorQuda(
"Normal-error solve requires Mat solution");
3026 errorQuda(
"Multigrid preconditioning only supported for direct solves");
3029 if (
param->chrono_use_resident && ( norm_error_solve) ){
3030 errorQuda(
"Chronological forcasting only presently supported for M^dagger M solver");
3035 if (mat_solution && !direct_solve && !norm_error_solve) {
3038 }
else if (!mat_solution && direct_solve) {
3039 DiracMdag m(
dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3042 (*solve)(*out, *in);
3049 DiracM m(
dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3060 std::vector<ColorSpinorField*> Ap;
3061 for (
unsigned int k=0; k < basis.size(); k++) {
3066 for (
unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *
tmp, *tmp2);
3068 for (
unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *
tmp, *tmp2);
3070 errorQuda(
"Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3074 bool orthogonal =
true;
3075 bool apply_mat =
false;
3076 bool hermitian =
false;
3077 MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3080 mre(*out, *
tmp, basis, Ap);
3083 if (ap)
delete (ap);
3086 if (tmp2 != out)
delete tmp2;
3092 (*solve)(*out, *in);
3095 }
else if (!norm_error_solve) {
3096 DiracMdagM m(
dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3106 std::vector<ColorSpinorField*> Ap;
3109 for (
unsigned int k=0; k < basis.size(); k++) {
3114 for (
unsigned int j=0; j<basis.size(); j++) m(*Ap[j], *basis[j], *
tmp, *tmp2);
3116 for (
unsigned int j=0; j<basis.size(); j++) mSloppy(*Ap[j], *basis[j], *
tmp, *tmp2);
3118 errorQuda(
"Unexpected precision %d for chrono vectors (doesn't match outer %d or sloppy precision %d)",
3122 bool orthogonal =
true;
3123 bool apply_mat =
false;
3124 bool hermitian =
true;
3125 MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
3128 mre(*out, *
tmp, basis, Ap);
3134 if (tmp2 != out)
delete tmp2;
3143 (*solve)(*out, *in);
3148 (*solve)(*out, *in);
3153 DiracMMdag m(
dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
3169 if (
param->chrono_make_resident) {
3170 if(
param->chrono_max_dim < 1){
3171 errorQuda(
"Cannot chrono_make_resident with chrono_max_dim %i",
param->chrono_max_dim);
3174 const int i =
param->chrono_index;
3180 if(
param->chrono_max_dim < (
int)basis.size()){
3181 errorQuda(
"Requested chrono_max_dim %i is smaller than already existing chroology %i",
param->chrono_max_dim,(
int)basis.size());
3184 if(not
param->chrono_replace_last){
3186 if ((
int)basis.size() <
param->chrono_max_dim) {
3194 for (
unsigned int j=basis.size()-1; j>0; j--) basis[j] = basis[j-1];
3207 if (!
param->make_resident_solution) {
3215 if (
param->compute_action) {
3217 param->action[0] = action.real();
3218 param->action[1] = action.imag();
3224 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3234 if (
param->use_resident_solution && !
param->make_resident_solution) {
3237 }
else if (!
param->make_resident_solution) {
3255 profilerStop(__func__);
3259 void *milc_longlinks)
3272 pad_size =
MAX(x_face_size, y_face_size);
3273 pad_size =
MAX(pad_size, z_face_size);
3274 pad_size =
MAX(pad_size, t_face_size);
3277 int fat_pad = pad_size;
3278 int link_pad = 3 * pad_size;
3310 template <
class Interface,
class... Args>
3312 void *h_gauge,
void *milc_fatlinks,
void *milc_longlinks,
3314 void *h_clover,
void *h_clovinv,
3315 Interface op, Args... args)
3323 profilerStart(__func__);
3329 errorQuda(
"split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], split_key[1], split_key[2], split_key[3]);
3332 if (num_sub_partition == 1) {
3334 for (
int n = 0; n <
param->num_src; n++) { op(_hp_x[n], _hp_b[n],
param, args...); }
3344 if (
param->num_src_per_sub_partition * num_sub_partition !=
param->num_src) {
3345 errorQuda(
"We need to have split_grid[0](=%d) * split_grid[1](=%d) * split_grid[2](=%d) * split_grid[3](=%d) * "
3346 "num_src_per_sub_partition(=%d) == num_src(=%d).",
3347 split_key[0], split_key[1], split_key[2], split_key[3],
param->num_src_per_sub_partition,
param->num_src);
3357 checkInvertParam(
param, _hp_x[0], _hp_b[0]);
3361 is_staggered =
false;
3362 }
else if (milc_fatlinks) {
3363 is_staggered =
true;
3365 errorQuda(
"Both h_gauge and milc_fatlinks are null.");
3366 is_staggered =
true;
3379 if (!is_staggered) {
3399 std::vector<ColorSpinorField *> _h_b(
param->num_src);
3400 for (
int i = 0; i <
param->num_src; i++) {
3401 cpuParam.
v = _hp_b[i];
3406 std::vector<ColorSpinorField *> _h_x(
param->num_src);
3407 for (
int i = 0; i <
param->num_src; i++) {
3408 cpuParam.
v = _hp_x[i];
3414 printfQuda(
"Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n",
comm_dim(0),
3418 if (
comm_dim(d) % split_key[d] != 0) {
3421 if (!is_staggered) {
3422 gf_param->
x[d] *= split_key[d];
3423 gf_param->
pad *= split_key[d];
3425 milc_fatlink_param->
x[d] *= split_key[d];
3426 milc_fatlink_param->
pad *= split_key[d];
3427 milc_longlink_param->
x[d] *= split_key[d];
3428 milc_longlink_param->
pad *= split_key[d];
3436 if (
param->clover_coeff == 0.0 &&
param->clover_csw == 0.0)
errorQuda(
"called with neither clover term nor inverse and clover coefficient nor Csw not set");
3443 if (h_clover || h_clovinv) {
3445 clover_param.
nDim = 4;
3448 clover_param.
csw =
param->clover_csw;
3457 for (
int d = 0; d < 4; d++) { clover_param.
x[d] = field_dim[d]; }
3460 clover_param.
norm =
nullptr;
3461 clover_param.
invNorm =
nullptr;
3463 clover_param.
direct = h_clover ? true :
false;
3464 clover_param.
inverse = h_clovinv ? true :
false;
3465 clover_param.
clover = h_clover;
3472 for (
int d = 0; d <
CommKey::n_dim; d++) { clover_param.
x[d] *= split_key[d]; }
3476 std::vector<quda::CloverField *> v_c(1);
3477 v_c[0] = input_clover;
3486 if (!is_staggered) {
3489 std::vector<quda::GaugeField *> v_g(1);
3497 std::vector<quda::GaugeField *> v_g(1);
3498 v_g[0] = milc_fatlink_field;
3500 v_g[0] = milc_longlink_field;
3511 for (
int d = 0; d <
CommKey::n_dim; d++) { cpu_cs_param_split.
x[d] *= split_key[d]; }
3512 std::vector<quda::ColorSpinorField *> _collect_b(
param->num_src_per_sub_partition,
nullptr);
3513 std::vector<quda::ColorSpinorField *> _collect_x(
param->num_src_per_sub_partition,
nullptr);
3514 for (
int n = 0; n <
param->num_src_per_sub_partition; n++) {
3517 auto first = _h_b.begin() + n * num_sub_partition;
3518 auto last = _h_b.begin() + (n + 1) * num_sub_partition;
3519 std::vector<ColorSpinorField *> _v_b(first, last);
3520 split_field(*_collect_b[n], _v_b, split_key, pc_type);
3534 if (!is_staggered) {
3539 collected_milc_longlink_field->
Gauge_p());
3546 if (collected_clover) {
3554 for (
int n = 0; n <
param->num_src_per_sub_partition; n++) {
3555 op(_collect_x[n]->
V(), _collect_b[n]->
V(),
param, args...);
3569 for (
int n = 0; n <
param->num_src_per_sub_partition; n++) {
3570 auto first = _h_x.begin() + n * num_sub_partition;
3571 auto last = _h_x.begin() + (n + 1) * num_sub_partition;
3572 std::vector<ColorSpinorField *> _v_x(first, last);
3573 join_field(_v_x, *_collect_x[n], split_key, pc_type);
3576 for (
auto p : _collect_b) {
delete p; }
3577 for (
auto p : _collect_x) {
delete p; }
3579 for (
auto p : _h_x) {
delete p; }
3580 for (
auto p : _h_b) {
delete p; }
3582 if (!is_staggered) {
3584 delete collected_gauge;
3586 delete milc_fatlink_field;
3587 delete milc_longlink_field;
3588 delete collected_milc_fatlink_field;
3589 delete collected_milc_longlink_field;
3592 if (input_clover) {
delete input_clover; }
3593 if (collected_clover) {
delete collected_clover; }
3599 if (!is_staggered) {
3611 profilerStop(__func__);
3638 callMultiSrcQuda(_hp_x, _hp_b,
param, h_gauge,
nullptr,
nullptr,
gauge_param,
nullptr,
nullptr, op,
parity);
3645 callMultiSrcQuda(_hp_x, _hp_b,
param,
nullptr, milc_fatlinks, milc_longlinks,
gauge_param,
nullptr,
nullptr, op,
3653 callMultiSrcQuda(_hp_x, _hp_b,
param, h_gauge,
nullptr,
nullptr,
gauge_param, h_clover, h_clovinv, op,
parity);
3670 profilerStart(__func__);
3675 if (!initialized)
errorQuda(
"QUDA not initialized");
3677 checkInvertParam(
param, _hp_x[0], _hp_b);
3683 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
param->num_offset,
3697 errorQuda(
"For Staggered-type fermions, multi-shift solver only suports MATPC solution type");
3701 errorQuda(
"For Staggered-type fermions, multi-shift solver only supports DIRECT_PC solve types");
3707 errorQuda(
"For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
3710 errorQuda(
"For Wilson-type fermions, multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3712 if (pc_solution & !pc_solve) {
3713 errorQuda(
"For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type");
3715 if (!pc_solution & pc_solve) {
3716 errorQuda(
"For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3725 for (
int i=0; i<
param->num_offset-1; i++) {
3726 for (
int j=i+1; j<
param->num_offset; j++) {
3728 errorQuda(
"Offsets must be ordered from smallest to largest");
3734 hp_x =
new void* [
param->num_offset ];
3737 for(
int i=0;i <
param->num_offset;i++){
3754 Dirac *dSloppy =
nullptr;
3755 Dirac *dPre =
nullptr;
3756 Dirac *dRefine =
nullptr;
3761 Dirac &diracSloppy = *dSloppy;
3765 std::vector<ColorSpinorField*> x;
3766 x.resize(
param->num_offset);
3767 std::vector<ColorSpinorField*> p;
3768 std::unique_ptr<double[]> r2_old(
new double[
param->num_offset]);
3782 std::vector<ColorSpinorField*> h_x;
3783 h_x.resize(
param->num_offset);
3786 for(
int i=0; i <
param->num_offset; i++) {
3787 cpuParam.
v = hp_x[i];
3806 bool invalidate =
false;
3808 if (cudaParam.
Precision() != v->Precision()) {
3831 if (nb==0.0)
errorQuda(
"Source has zero norm");
3835 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3845 for (
int i = 0; i <
param->num_offset; i++) { unscaled_shifts[i] =
param->offset[i]; }
3856 mSloppy =
new DiracM(diracSloppy);
3864 MultiShiftCG cg_m(*m, *mSloppy, solverParam, profileMulti);
3865 cg_m(x, *b, p, r2_old.get());
3872 if (
param->compute_true_res) {
3881 Dirac &diracSloppy = *dRefine;
3883 #define REFINE_INCREASING_MASS
3884 #ifdef REFINE_INCREASING_MASS
3885 for(
int i=0; i <
param->num_offset; i++) {
3887 for(
int i=
param->num_offset-1; i >= 0; i--) {
3890 param->true_res_hq_offset[i] : 0;
3892 param->tol_hq_offset[i] : 0;
3902 const double iter_tol = (
param->iter_res_offset[i] < prec_tol ? prec_tol : (
param->iter_res_offset[i] *1.1));
3903 const double refine_tol = (
param->tol_offset[i] == 0.0 ? iter_tol :
param->tol_offset[i]);
3905 if (
param->true_res_offset[i] > refine_tol || rsd_hq >
tol_hq) {
3907 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3922 mSloppy =
new DiracM(diracSloppy);
3936 #ifdef REFINE_INCREASING_MASS
3937 const int nRefine = i+1;
3939 const int nRefine =
param->num_offset - i + 1;
3942 std::vector<ColorSpinorField *> q;
3944 std::vector<ColorSpinorField *> z;
3949 for (
int j = 0; j < nRefine; j++) {
3955 #ifdef REFINE_INCREASING_MASS
3956 for (
int j=1; j<nRefine; j++) *z[j] = *x[j];
3958 for (
int j=1; j<nRefine; j++) *z[j] = *x[
param->num_offset-j];
3961 bool orthogonal =
true;
3962 bool apply_mat =
true;
3963 bool hermitian =
true;
3964 MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
3966 mre(*x[i],
tmp, z, q);
3968 for(
int j=0; j < nRefine; j++) {
3975 solverParam.
iter = 0;
3977 solverParam.
tol = (
param->tol_offset[i] > 0.0 ?
param->tol_offset[i] : iter_tol);
3979 solverParam.
delta =
param->reliable_delta_refinement;
3982 CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam, profileMulti);
3984 cg(*x[i], *b, p[i], r2_old[i]);
4006 for(
int i=0; i <
param->num_offset; i++) {
4007 param->offset[i] = unscaled_shifts[i];
4012 if (
param->compute_action) {
4015 param->action[0] = action.real();
4016 param->action[1] = action.imag();
4019 for(
int i=0; i <
param->num_offset; i++) {
4029 if (!
param->make_resident_solution) *h_x[i] = *x[i];
4035 if (!
param->make_resident_solution) {
4043 for(
int i=0; i <
param->num_offset; i++){
4057 for (
auto& pp : p)
delete pp;
4068 profilerStop(__func__);
4077 checkGaugeParam(
param);
4115 longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
4118 cudaLongLink->
saveCPUField(cpuLongLink, profileFatLink);
4121 delete cudaLongLink;
4136 delete cudaInLinkEx;
4140 const double unitarize_eps = 1e-14;
4141 const double max_error = 1e-10;
4142 const int reunit_allow_svd = 1;
4143 const int reunit_svd_only = 0;
4144 const double svd_rel_error = 1e-6;
4145 const double svd_abs_error = 1e-6;
4147 svd_rel_error, svd_abs_error);
4152 *num_failures_h = 0;
4154 if (*num_failures_h > 0)
errorQuda(
"Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
4157 cudaUnitarizedLink->
saveCPUField(cpuUnitarizedLink, profileFatLink);
4160 delete cudaUnitarizedLink;
4170 errorQuda(
"Fat-link has not been built");
4179 for(
int dir=0; dir<4; ++dir) face_size[dir] = (volume/
param.x[dir])/2;
4180 pad = *std::max_element(face_size, face_size+4);
4187 double* loop_coeff,
int num_paths,
int max_length,
double eb3,
QudaGaugeParam* qudaGaugeParam)
4189 #ifdef GPU_GAUGE_FORCE
4193 checkGaugeParam(qudaGaugeParam);
4264 gaugeForce(*force, *
cudaGauge, 1.0, input_path_buf, path_length, loop_coeff, num_paths, max_length);
4281 delete cudaSiteLink;
4291 if (cpuSiteLink)
delete cpuSiteLink;
4305 errorQuda(
"Gauge force has not been built");
4315 checkGaugeParam(
param);
4373 for (
int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms ||
commDimPartitioned(d));
4399 errorQuda(
"Only scalar and vector geometries are supported\n");
4472 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] =
gParam.
x[dir];
4491 errorQuda(
"Resident gauge field is required");
4494 errorQuda(
"Gauge field requires the staggered phase factors to be applied");
4499 errorQuda(
"Requested staggered phase %d, but found %d\n",
4507 std::vector<ColorSpinorField*> X(nvector);
4512 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
4528 for (
int i=0; i<nvector; i++) {
4532 else errorQuda(
"%s requires resident solution", __func__);
4553 for (
int i=0; i<nvector; i++) {
4585 for (
int i=0; i<nvector; i++)
delete X[i];
4593 const double level2_coeff[6],
4594 const double fat7_coeff[6],
4595 const void*
const w_link,
4596 const void*
const v_link,
4597 const void*
const u_link,
4604 #ifdef GPU_STAGGERED_OPROD
4605 using namespace quda;
4625 const double unitarize_eps = 1e-14;
4626 const double hisq_force_filter = 5e-5;
4627 const double max_det_error = 1e-10;
4628 const bool allow_svd =
true;
4629 const bool svd_only =
false;
4630 const double svd_rel_err = 1e-8;
4631 const double svd_abs_err = 1e-8;
4636 double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4649 param.gauge = (
void*)w_link;
4651 param.gauge = (
void*)v_link;
4653 param.gauge = (
void*)u_link;
4667 for (
int dir=0; dir<4; ++dir) {
4668 param.x[dir] += 2*R[dir];
4669 param.r[dir] = R[dir];
4688 for (
int dir=0; dir<4; ++dir) qParam.
x[dir] = oParam.
x[dir];
4698 qParam.
v = fermion[0];
4701 GaugeField *oprod[2] = {stapleOprod, naikOprod};
4704 for(
int i=0; i<num_terms; ++i){
4708 qParam.
v = fermion[i];
4713 cudaQuark = cpuQuark;
4723 oneLinkOprod->
copy(*stapleOprod);
4724 ax(level2_coeff[0], *oneLinkOprod);
4725 GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4728 for(
int i=0; i<num_naik_terms; ++i){
4732 qParam.
v = fermion[i + num_terms - num_naik_terms];
4737 cudaQuark = cpuQuark;
4754 delete oneLinkOprod;
4786 *num_failures_h = 0;
4790 if (*num_failures_h>0)
errorQuda(
"Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
4810 if (
gParam->use_resident_mom) {
4817 if (
gParam->return_result_mom) {
4826 if (!
gParam->make_resident_mom) {
4831 delete cudaOutForce;
4838 errorQuda(
"HISQ force has not been built");
4843 double *coeff,
double kappa2,
double ck,
4844 int nvector,
double multiplicity,
void *gauge,
4847 using namespace quda;
4881 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] = fParam.
x[dir];
4888 std::vector<ColorSpinorField*> quarkX, quarkP;
4889 for (
int i=0; i<nvector; i++) {
4912 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
4925 std::vector<double> force_coeff(nvector);
4927 for(
int i=0; i<nvector; i++){
4944 x.
Even() = cpuQuarkX;
4962 force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
4979 std::vector< std::vector<double> > ferm_epsilon(nvector);
4980 for (
int shift = 0; shift < nvector; shift++) {
4981 ferm_epsilon[shift].reserve(2);
4982 ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
4983 ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
4995 if (u != &gaugeEx)
delete u;
5007 for (
int i=0; i<nvector; i++) {
5035 checkGaugeParam(
param);
5093 (
bool)conj_mom, (
bool)exact);
5099 cudaOutGauge->saveCPUField(*
cpuGauge);
5108 delete cudaOutGauge;
5130 checkGaugeParam(
param);
5157 *num_failures_h = 0;
5166 if(*num_failures_h>0)
5167 errorQuda(
"Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
5191 checkGaugeParam(
param);
5215 *num_failures_h = 0;
5247 checkGaugeParam(
param);
5341 errorQuda(
"Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER and not %d",
param->dirac_order);
5346 for (
int i=0; i<
param->num_offset; i++) hp_x[i] =
static_cast<char*
>(h_x) + i*cb_offset;
5354 cudaHostRegister(ptr, *
bytes, cudaHostRegisterDefault);
5359 cudaHostUnregister(ptr);
5371 bool *conj_mom,
bool *exact,
5376 static inline int opp(
int dir) {
return 7-dir; }
5378 static void createGaugeForcePaths(
int **paths,
int dir,
int num_loop_types){
5382 if (num_loop_types >= 1)
5383 for(
int i=0; i<4; ++i){
5384 if(i==dir)
continue;
5385 paths[index][0] = i; paths[index][1] = opp(dir); paths[index++][2] = opp(i);
5386 paths[index][0] = opp(i); paths[index][1] = opp(dir); paths[index++][2] = i;
5390 if (num_loop_types >= 2)
5391 for(
int i=0; i<4; ++i){
5392 if(i==dir)
continue;
5393 paths[index][0] = paths[index][1] = i; paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = opp(i);
5395 paths[index][0] = paths[index][1] = opp(i); paths[index][2] = opp(dir); paths[index][3] = paths[index][4] = i;
5397 paths[index][0] = dir; paths[index][1] = i; paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = opp(i);
5399 paths[index][0] = dir; paths[index][1] = opp(i); paths[index][2] = paths[index][3] = opp(dir); paths[index][4] = i;
5401 paths[index][0] = i; paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = opp(i); paths[index][4] = dir;
5403 paths[index][0] = opp(i); paths[index][1] = paths[index][2] = opp(dir); paths[index][3] = i; paths[index][4] = dir;
5407 if (num_loop_types >= 3) {
5409 for(
int i=0; i<4; ++i){
5410 for(
int j=0; j<4; ++j){
5411 if(i==dir || j==dir || i==j)
continue;
5412 paths[index][0] = i; paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = opp(j);
5414 paths[index][0] = i; paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = opp(i), paths[index][4] = j;
5416 paths[index][0] = opp(i); paths[index][1] = j; paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = opp(j);
5418 paths[index][0] = opp(i); paths[index][1] = opp(j); paths[index][2] = opp(dir); paths[index][3] = i, paths[index][4] = j;
5430 switch (*num_loop_types) {
5441 errorQuda(
"Invalid num_loop_types = %d\n", *num_loop_types);
5444 auto *loop_coeff =
static_cast<double*
>(
safe_malloc(numPaths*
sizeof(
double)));
5445 int *path_length =
static_cast<int*
>(
safe_malloc(numPaths*
sizeof(
int)));
5447 if (*num_loop_types >= 1)
for(
int i= 0; i< 6; ++i) {
5448 loop_coeff[i] = coeff[0];
5451 if (*num_loop_types >= 2)
for(
int i= 6; i<24; ++i) {
5452 loop_coeff[i] = coeff[1];
5455 if (*num_loop_types >= 3)
for(
int i=24; i<48; ++i) {
5456 loop_coeff[i] = coeff[2];
5460 int** input_path_buf[4];
5461 for(
int dir=0; dir<4; ++dir){
5462 input_path_buf[dir] =
static_cast<int**
>(
safe_malloc(numPaths*
sizeof(
int*)));
5463 for(
int i=0; i<numPaths; ++i){
5464 input_path_buf[dir][i] =
static_cast<int*
>(
safe_malloc(path_length[i]*
sizeof(
int)));
5466 createGaugeForcePaths(input_path_buf[dir], dir, *num_loop_types);
5473 for(
auto & dir : input_path_buf){
5474 for(
int i=0; i<numPaths; ++i)
host_free(dir[i]);
5517 static int bqcd_rank_from_coords(
const int *coords,
void *fdata)
5519 int *dims =
static_cast<int *
>(fdata);
5521 int rank = coords[3];
5522 for (
int i = 2; i >= 0; i--) {
5523 rank = dims[i] * rank + coords[i];
5541 bool pack_ = *pack ? true :
false;
5549 if (!
gaugePrecise)
errorQuda(
"Cannot generate Gauss GaugeField as there is no resident gauge field");
5604 if (!
gaugePrecise)
errorQuda(
"Cannot perform deep copy of resident gauge field as there is no resident gauge field");
5636 printfQuda(
"Wuppertal smearing done with gaugePrecise\n");
5657 double a = alpha / (1. + 6. * alpha);
5658 double b = 1. / (1. + 6. * alpha);
5660 for (
unsigned int i = 0; i < n_steps; i++) {
5662 ApplyLaplace(out, in, *precise, 3, a, b, in,
parity,
false,
nullptr, profileWuppertal);
5677 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
5711 for (
unsigned int i = 0; i < n_steps; i++) {
5721 delete cudaGaugeTemp;
5745 for (
unsigned int i = 0; i < n_steps; i++) {
5755 delete cudaGaugeTemp;
5779 for (
unsigned int i = 0; i < n_steps; i++) {
5789 delete cudaGaugeTemp;
5819 printfQuda(
"flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
5824 for (
unsigned int i = 0; i < n_steps; i++) {
5835 printfQuda(
"%le %.16e %+.16e %+.16e %+.16e %+.16e\n", step_size * (i + 1),
param.plaquette[0],
param.energy[0],
5847 const unsigned int verbose_interval,
const double relax_boost,
const double tolerance,
5853 checkGaugeParam(
param);
5870 cudaInGauge->loadCPUField(*
cpuGauge);
5882 gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
5890 gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
5900 cudaInGauge->saveCPUField(*
cpuGauge);
5922 const unsigned int verbose_interval,
const double alpha,
const unsigned int autotune,
const double tolerance, \
5927 checkGaugeParam(
param);
5947 cudaInGauge->loadCPUField(*
cpuGauge);
5959 gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5965 cudaInGauge->saveCPUField(*
cpuGauge);
5998 cpuParam.
v = (
void *)hp_y;
6010 std::vector<ColorSpinorField *> x, y;
6014 size_t data_bytes = x[0]->Volume() * x[0]->Nspin() * x[0]->Nspin() * 2 * x[0]->Precision();
6028 qudaMemcpy(h_result, d_result, data_bytes, cudaMemcpyDeviceToHost);
6045 checkGaugeObservableParam(
param);
Conjugate-Gradient Solver.
static CloverField * Create(const CloverFieldParam ¶m)
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch),...
void * V(bool inverse=false)
const ColorSpinorField & Odd() const
QudaSiteSubset SiteSubset() const
static ColorSpinorField * Create(const ColorSpinorParam ¶m)
const ColorSpinorField & Even() const
QudaGammaBasis gammaBasis
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
QudaFieldOrder fieldOrder
virtual void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
Apply the local MdagM operator: equivalent to applying zero Dirichlet boundary condition to MdagM on ...
virtual void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the gauge field and temporary spinors to the CPU ...
void setMass(double mass)
void Dagger(QudaDagType dag) const
sets whether operator is daggered or not
static Dirac * create(const DiracParam ¶m)
Creates a subclass from parameters.
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
Apply Mdag (daggered operator of M.
double shift
Shift term added onto operator (M/M^dag M/M M^dag + shift)
Complex b_5[QUDA_MAX_DWF_LS]
int commDim[QUDA_MAX_DIM]
Complex c_5[QUDA_MAX_DWF_LS]
cudaGaugeField * longGauge
cudaGaugeField * fatGauge
QudaPrecision halo_precision
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
Apply M for the dirac op. E.g. the Schur Complement operator.
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
Apply MdagM operator which may be optimized.
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
apply 'dslash' operator for the DiracOp. This may be e.g. AD
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
QudaFieldGeometry Geometry() const
QudaStaggeredPhase StaggeredPhase() const
void removeStaggeredPhase()
QudaGaugeFieldOrder Order() const
static GaugeField * Create(const GaugeFieldParam ¶m)
Create the gauge field, with meta data specified in the parameter struct.
void applyStaggeredPhase(QudaStaggeredPhase phase=QUDA_STAGGERED_PHASE_INVALID)
uint64_t checksum(bool mini=false) const
double Anisotropy() const
bool StaggeredPhaseApplied() const
QudaReconstructType Reconstruct() const
QudaPrecision Precision() const
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
This computes the optimum guess for the system Ax=b in the L2 residual norm. For use in the HMD force...
Multi-Shift Conjugate Gradient Solver.
static Solver * create(SolverParam ¶m, const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, TimeProfile &profile)
Solver factory.
static void PrintGlobal()
double Last(QudaProfileType idx)
static void freeGhostBuffer(void)
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
void copy(const GaugeField &src)
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle=false, void *user_comm=nullptr)
Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi....
int comm_dim_partitioned(int dim)
int(* QudaCommsMap)(const int *coords, void *fdata)
int commDimPartitioned(int dir)
QudaReconstructType link_recon_sloppy
QudaReconstructType link_recon
QudaReconstructType link_recon_precondition
constexpr quda::CommKey default_comm_key
void push_communicator(const quda::CommKey &split_key)
cudaColorSpinorField * tmp
QudaGaugeParam gauge_param
QudaInvertParam inv_param
@ QUDA_SOURCE_NORMALIZATION
@ QUDA_TWISTED_MASSPC_DIRAC
@ QUDA_GAUGE_LAPLACE_DIRAC
@ QUDA_GAUGE_COVDEV_DIRAC
@ QUDA_TWISTED_CLOVERPC_DIRAC
@ QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC
@ QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC
@ QUDA_TWISTED_MASS_DIRAC
@ QUDA_CLOVER_HASENBUSCH_TWISTPC_DIRAC
@ QUDA_DOMAIN_WALL_4D_DIRAC
@ QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC
@ QUDA_GAUGE_LAPLACEPC_DIRAC
@ QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC
@ QUDA_TWISTED_CLOVER_DIRAC
@ QUDA_DOMAIN_WALL_4DPC_DIRAC
@ QUDA_MOBIUS_DOMAIN_WALL_DIRAC
@ QUDA_DOMAIN_WALLPC_DIRAC
enum QudaWFlowType_s QudaWFlowType
enum QudaPrecision_s QudaPrecision
@ QUDA_STAGGERED_PHASE_NO
@ QUDA_TWISTED_CLOVER_DSLASH
@ QUDA_CLOVER_WILSON_DSLASH
@ QUDA_TWISTED_MASS_DSLASH
@ QUDA_DOMAIN_WALL_DSLASH
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
@ QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
@ QUDA_DOMAIN_WALL_4D_DSLASH
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
@ QUDA_KAPPA_NORMALIZATION
@ QUDA_ASYMMETRIC_MASS_NORMALIZATION
@ QUDA_MASS_NORMALIZATION
@ QUDA_USE_INIT_GUESS_YES
@ QUDA_PARITY_SITE_SUBSET
@ QUDA_DEGRAND_ROSSI_GAMMA_BASIS
enum QudaPCType_s QudaPCType
@ QUDA_TIFR_PADDED_DIRAC_ORDER
@ QUDA_CPS_WILSON_DIRAC_ORDER
@ QUDA_HEAVY_QUARK_RESIDUAL
enum QudaFieldGeometry_s QudaFieldGeometry
@ QUDA_TRANSFER_COARSE_KD
@ QUDA_TRANSFER_OPTIMIZED_KD
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_GHOST_EXCHANGE_EXTENDED
@ QUDA_GHOST_EXCHANGE_PAD
@ QUDA_MATPC_ODD_ODD_ASYMMETRIC
@ QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
@ QUDA_INC_EIGCG_INVERTER
@ QUDA_EVEN_ODD_SITE_ORDER
enum QudaReconstructType_s QudaReconstructType
@ QUDA_MATDAG_MAT_SOLUTION
@ QUDA_MATPCDAG_MATPC_SOLUTION
@ QUDA_FLOAT2_GAUGE_ORDER
@ QUDA_TIFR_PADDED_GAUGE_ORDER
enum QudaContractType_s QudaContractType
@ QUDA_FLOAT2_FIELD_ORDER
@ QUDA_SPACE_COLOR_SPIN_FIELD_ORDER
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
enum QudaVerbosity_s QudaVerbosity
@ QUDA_REFERENCE_FIELD_CREATE
@ QUDA_TWIST_NONDEG_DOUBLET
enum QudaParity_s QudaParity
cudaGaugeField * cudaForce
cudaGaugeField * cudaGauge
cudaGaugeField * cudaFatLink
cpuGaugeField * cpuFatLink
void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
void computeHISQForceQuda(void *const milc_momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
double momActionQuda(void *momentum, QudaGaugeParam *param)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the mometum field.
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @invertQuda but for multiple rhs by spliting the comm grid into sub-partitions...
void gaussGaugeQuda(unsigned long long seed, double sigma)
Generate Gaussian distributed fields and store in the resident gauge field. We create a Gaussian-dist...
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
std::vector< cudaColorSpinorField * > solutionResident
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
cudaGaugeField * gaugeFatPrecise
void new_quda_gauge_param_(QudaGaugeParam *param)
void destroyGaugeFieldQuda(void *gauge)
cudaGaugeField * momResident
void set_kernel_pack_t_(int *pack)
fTemporary function exposed for TIFR benchmarking
void new_quda_invert_param_(QudaInvertParam *param)
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void * newDeflationQuda(QudaEigParam *eig_param)
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
void checkClover(QudaInvertParam *param)
std::vector< std::vector< ColorSpinorField * > > chronoResident(QUDA_MAX_CHRONO)
void free_sloppy_gauge_quda_()
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void momResidentQuda(void *mom, QudaGaugeParam *param)
cudaGaugeField * gaugeLongPrecise
cudaGaugeField * gaugeSloppy
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
cudaGaugeField * gaugeLongPrecondition
cudaGaugeField * extendedGaugeResident
cudaCloverField * cloverPrecondition
void setMPICommHandleQuda(void *mycomm)
cudaGaugeField * gaugeRefinement
void plaqQuda(double plaq[3])
void performAPEnStep(unsigned int n_steps, double alpha, int meas_interval)
void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
cudaGaugeField * gaugeFatExtended
void destroyDeflationQuda(void *df)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
void freeSloppyCloverQuda()
cudaGaugeField * gaugeSmeared
void loadSloppyCloverQuda(const QudaPrecision prec[])
void init_quda_(int *dev)
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
void freeSloppyGaugeQuda()
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
cudaCloverField * cloverSloppy
void * newMultigridQuda(QudaMultigridParam *mg_param)
TimeProfile & getProfileBLAS()
Profiler for covariant derivative.
void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
cudaGaugeField * gaugeLongRefinement
void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into sub-partitions...
cudaGaugeField * gaugeFatRefinement
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void initQudaDevice(int dev)
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
cudaGaugeField * gaugeLongExtended
cudaGaugeField * gaugeLongSloppy
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void free_clover_quda_(void)
void comm_set_gridsize_(int *grid)
void freeCloverQuda(void)
void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *recon)
cudaGaugeField * gaugeEigensolver
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void performWFlownStep(unsigned int n_steps, double step_size, int meas_interval, QudaWFlowType wflow_type)
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo.
cudaGaugeField * gaugePrecise
cudaGaugeField * gaugeFatSloppy
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param)
Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers to f...
cudaCloverField * cloverPrecise
cudaGaugeField * gaugeFatPrecondition
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void flushChronoQuda(int i)
Flush the chronological history for the given index.
void init_quda_device_(int *dev)
void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv, Interface op, Args... args)
void copyExtendedResidentGaugeQuda(void *resident_gauge, QudaFieldLocation loc)
void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks, void *milc_longlinks)
void gaugeObservablesQuda(QudaGaugeObservableParam *param)
Calculates a variety of gauge-field observables. If a smeared gauge field is presently loaded (in gau...
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
cudaGaugeField * gaugePrecondition
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Dump the null-space vectors to disk.
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
cudaCloverField * cloverEigensolver
void performSTOUTnStep(unsigned int n_steps, double rho, int meas_interval)
int getGaugePadding(GaugeFieldParam ¶m)
void createCloverQuda(QudaInvertParam *invertParam)
void updateR()
update the radius for halos.
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performOvrImpSTOUTnStep(unsigned int n_steps, double rho, double epsilon, int meas_interval)
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
cudaCloverField * cloverRefinement
cudaGaugeField * gaugeLongEigensolver
void checkBLASParam(QudaBLASParam ¶m)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
cudaGaugeField * gaugeExtended
void plaq_quda_(double plaq[3])
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
cudaGaugeField * gaugeFatEigensolver
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
#define pool_device_malloc(size)
#define safe_malloc(size)
#define pool_device_free(ptr)
#define get_mapped_device_pointer(ptr)
#define mapped_malloc(size)
void destroy()
Destroy the BLAS context.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void set_native(bool native)
void ax(double a, ColorSpinorField &x)
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void stop()
Stop profiling.
void start()
Start profiling.
void create_context()
Create the streams associated with parallel execution.
void init(int dev)
Create the device context. Called by initQuda when initializing the library.
void destroy()
Free any persistent context state. Called by endQuda when tearing down the library.
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &gauge, int *unitarization_failed)
Unitarize the fermion force.
void init()
Initialize the memory pool allocator.
void flush_pinned()
Free all outstanding pinned-memory allocations.
void flush_device()
Free all outstanding device-memory allocations.
void applyU(GaugeField &force, GaugeField &U)
void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha)
Apply APE smearing to the gauge field.
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void createDslashEvents()
void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
void setKernelPackT(bool pack)
void saveTuneCache(bool error=false)
void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m, TimeProfile &profile)
Calculates a variety of gauge-field observables.
void arpack_solve(std::vector< ColorSpinorField * > &h_evecs, std::vector< Complex > &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
The QUDA interface function. One passes two allocated arrays to hold the the eigenmode data,...
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
constexpr int product(const CommKey &input)
void massRescale(cudaColorSpinorField &b, QudaInvertParam ¶m, bool for_multishift)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
double3 plaquette(const GaugeField &U)
Compute the plaquette of the gauge field.
void join_field(std::vector< Field * > &v_base_field, const Field &collect_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
bool forceMonitor()
Whether we are monitoring the force or not.
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action....
void split_field(Field &collect_field, std::vector< Field * > &v_base_field, const CommKey &comm_key, QudaPCType pc_type=QUDA_4D_PC)
void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff)
Compute the long links for an improved staggered (Kogut-Susskind) fermions.
void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam ¶m, const bool pc_solve)
void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff)
Compute the fat links for an improved staggered (Kogut-Susskind) fermions.
void destroyDslashEvents()
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon)
Apply Over Improved STOUT smearing to the gauge field.
std::complex< double > Complex
void gaugeFixingFFT(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
__host__ __device__ ValueType sqrt(ValueType x)
void WFlowStep(GaugeField &out, GaugeField &temp, GaugeField &in, double epsilon, QudaWFlowType wflow_type)
Apply Wilson Flow steps W1, W2, Vt to the gauge field. This routine assumes that the input and output...
void flushForceMonitor()
Flush any outstanding force monitoring information.
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the Laplace stencil.
cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
void computeClover(CloverField &clover, const GaugeField &fmunu, double coeff)
Driver for computing the clover field from the field strength tensor.
void gaugeGauss(GaugeField &U, RNG &rngstate, double epsilon)
Generate Gaussian distributed su(N) or SU(N) fields. If U is a momentum field, then we generate rando...
void gaugeFixingOVR(GaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
void cloverInvert(CloverField &clover, bool computeTraceLog)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void unitarizeLinks(GaugeField &outfield, const GaugeField &infield, int *fails)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reorderd on the CPU or GPU. This can set at QUDA initialization using the environ...
void flushProfile()
Flush profile contents, setting all counts to zero.
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trivial between floating point types. When converting to an integer type,...
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge)
Compute the Fmunu tensor.
constexpr bool dynamic_clover_inverse()
Helper function that returns whether we have enabled dynamic clover inversion or not.
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho)
Apply STOUT smearing to the gauge field.
void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, const bool pc_solve)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
void projectSU3(GaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation....
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
void saveProfile(const std::string label="")
Save profile to disk.
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType)
Main header file for the QUDA library.
void printQudaMultigridParam(QudaMultigridParam *param)
void printQudaInvertParam(QudaInvertParam *param)
QudaGaugeParam newQudaGaugeParam(void)
void printQudaEigParam(QudaEigParam *param)
QudaGaugeObservableParam newQudaGaugeObservableParam(void)
QudaInvertParam newQudaInvertParam(void)
void printQudaGaugeParam(QudaGaugeParam *param)
#define qudaMemcpy(dst, src, count, kind)
#define qudaMemset(ptr, value, count)
#define qudaDeviceSynchronize()
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
#define QUDA_VERSION_SUBMINOR
#define QUDA_VERSION_MAJOR
#define QUDA_VERSION_MINOR
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be.
Fortran interface functions.
QudaEigSpectrumType spectrum
QudaPrecision cuda_prec_ritz
QudaBoolean compute_gamma5
QudaFieldLocation location
QudaInvertParam * invert_param
QudaMemoryType mem_type_ritz
QudaReconstructType reconstruct_precondition
QudaReconstructType reconstruct
QudaPrecision cuda_prec_precondition
QudaPrecision cuda_prec_refinement_sloppy
QudaFieldLocation location
QudaPrecision cuda_prec_sloppy
QudaReconstructType reconstruct_sloppy
QudaGaugeFieldOrder gauge_order
QudaReconstructType reconstruct_eigensolver
QudaStaggeredPhase staggered_phase_type
QudaReconstructType reconstruct_refinement_sloppy
int staggered_phase_applied
QudaPrecision cuda_prec_eigensolver
QudaPrecision cuda_prec_refinement_sloppy
QudaSolutionType solution_type
QudaCloverFieldOrder clover_order
QudaMassNormalization mass_normalization
QudaFieldLocation clover_location
QudaPrecision clover_cuda_prec_refinement_sloppy
QudaPrecision cuda_prec_eigensolver
QudaPrecision clover_cuda_prec
int compute_clover_inverse
QudaTwistFlavorType twist_flavor
QudaPrecision clover_cpu_prec
QudaDslashType dslash_type
int return_clover_inverse
QudaPrecision clover_cuda_prec_precondition
double_complex b_5[QUDA_MAX_DWF_LS]
QudaDslashType dslash_type_precondition
QudaPrecision clover_cuda_prec_eigensolver
QudaInverterType inv_type
double residue[QUDA_MAX_MULTI_SHIFT]
double_complex c_5[QUDA_MAX_DWF_LS]
QudaPrecision clover_cuda_prec_sloppy
QudaPrecision cuda_prec_sloppy
QudaFieldLocation input_location
QudaFieldLocation output_location
QudaPrecision cuda_prec_precondition
int use_resident_solution
QudaDiracFieldOrder dirac_order
QudaBoolean thin_update_only
QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]
int n_vec[QUDA_MAX_MG_LEVEL]
QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]
QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
QudaBoolean setup_minimize_memory
QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]
QudaInvertParam * invert_param
QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]
bool twisted
Overall clover coefficient.
double coeff
C_sw clover coefficient.
QudaCloverFieldOrder order
QudaFieldLocation location
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
static constexpr int n_dim
constexpr bool is_valid() const
QudaReconstructType reconstruct
QudaGaugeFieldOrder order
QudaFieldGeometry geometry
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
QudaGhostExchange ghostExchange
QudaSiteSubset siteSubset
QudaPrecision Precision() const
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
QudaUseInitGuess use_init_guess
void updateInvertParam(QudaInvertParam ¶m, int offset=-1)
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
DeflationParam * deflParam
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
std::vector< ColorSpinorField * > B
DEVICEHOST void swap(Real &a, Real &b)
void pushVerbosity(QudaVerbosity verbosity)
Push a new verbosity onto the stack.
void popOutputPrefix()
Pop the output prefix restoring the prior one on the stack.
void popVerbosity()
Pop the verbosity restoring the prior one on the stack.
void pushOutputPrefix(const char *prefix)
Push a new output prefix onto the stack.
QudaVerbosity getVerbosity()
void setVerbosity(QudaVerbosity verbosity)
void setOutputPrefix(const char *prefix)
void setOutputFile(FILE *outfile)