48 #ifdef GPU_GAUGE_FORCE 53 #define MAX(a,b) ((a)>(b)? (a):(b)) 54 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 56 #define spinorSiteSize 24 // real numbers per spinor 58 #define MAX_GPU_NUM_PER_NODE 16 83 static int R[4] = {0, 0, 0, 0};
98 printfQuda(
"\nMAGMA library was already initialized..\n");
109 printfQuda(
"\nMAGMA library was not initialized..\n");
144 #define QUDA_MAX_CHRONO 2 155 pthread_mutex_t pthread_mutex;
271 int rank = coords[0];
272 for (
int i = 1;
i < md->
ndim;
i++) {
282 static int qmp_rank_from_coords(
const int *coords,
void *fdata)
284 return QMP_get_node_number_from(coords);
294 errorQuda(
"Number of communication grid dimensions must be 4");
301 if (QMP_logical_topology_is_declared()) {
302 if (QMP_get_logical_number_of_dimensions() != 4) {
303 errorQuda(
"QMP logical topology must have 4 dimensions");
305 for (
int i=0;
i<nDim;
i++) {
306 int qdim = QMP_get_logical_dimensions()[
i];
307 if(qdim != dims[
i]) {
308 errorQuda(
"QMP logical dims[%d]=%d does not match dims[%d]=%d argument",
i, qdim,
i, dims[
i]);
312 func = qmp_rank_from_coords;
314 warningQuda(
"QMP logical topology is undeclared; using default lexicographical ordering");
317 map_data.
ndim = nDim;
318 for (
int i=0;
i<nDim;
i++) {
319 map_data.
dims[
i] = dims[
i];
321 fdata = (
void *) &map_data;
336 #if defined(QMP_COMMS) 337 if (QMP_logical_topology_is_declared()) {
338 int ndim = QMP_get_logical_number_of_dimensions();
339 const int *dims = QMP_get_logical_dimensions();
342 errorQuda(
"initQuda() called without prior call to initCommsGridQuda()," 343 " and QMP logical topology has not been declared");
345 #elif defined(MPI_COMMS) 346 errorQuda(
"When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
348 const int dims[4] = {1, 1, 1, 1};
355 #define STR(x) STR_(x) 383 #if defined(MULTI_GPU) && (CUDA_VERSION == 4000) 386 char* cni_str =
getenv(
"CUDA_NIC_INTEROP");
388 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set");
390 int cni_int =
atoi(cni_str);
392 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set to 1");
397 cudaGetDeviceCount(&deviceCount);
398 if (deviceCount == 0) {
402 for(
int i=0;
i<deviceCount;
i++) {
413 errorQuda(
"initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
418 if (dev < 0 || dev >= 16)
errorQuda(
"Invalid device number %d", dev);
424 errorQuda(
"Device %d does not support CUDA", dev);
435 const int my_major = __COMPUTE_CAPABILITY__ / 100;
436 const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
439 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
444 char *allow_jit_env =
getenv(
"QUDA_ALLOW_JIT");
445 if (allow_jit_env &&
strcmp(allow_jit_env,
"1") == 0) {
446 if (
getVerbosity() >
QUDA_SILENT)
warningQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
448 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
453 warningQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
deviceProp.major,
deviceProp.minor, my_major, my_minor);
465 #if ((CUDA_VERSION >= 6000) && defined NUMA_NVML) 466 char *enable_numa_env =
getenv(
"QUDA_ENABLE_NUMA");
467 if (enable_numa_env &&
strcmp(enable_numa_env,
"0") == 0) {
477 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
482 char *reorder_str =
getenv(
"QUDA_REORDER_LOCATION");
484 if (!reorder_str || (
strcmp(reorder_str,
"CPU") &&
strcmp(reorder_str,
"cpu")) ) {
485 warningQuda(
"Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
488 warningQuda(
"Data reordering done on CPU (set with QUDA_REORDER_LOCATION=GPU/CPU)");
509 #if (CUDA_VERSION >= 5050) 516 cudaStreamCreateWithPriority(&
streams[
Nstream-1], cudaStreamDefault, leastPriority);
558 pthread_mutexattr_t mutex_attr;
559 pthread_mutexattr_init(&mutex_attr);
560 pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
561 pthread_mutex_init(&pthread_mutex, &mutex_attr);
571 for (
int dir=0; dir<4; ++dir)
y[dir] =
in.
X()[dir] + 2*
R[dir];
577 gParamEx.order =
in.Order();
579 gParamEx.t_boundary =
in.TBoundary();
581 gParamEx.tadpole =
in.Tadpole();
582 for (
int d=0;
d<4;
d++) gParamEx.
r[
d] =
R[
d];
609 checkGaugeParam(
param);
627 static size_t checksum = SIZE_MAX;
628 size_t in_checksum =
in->checksum(
true);
629 if (in_checksum == checksum) {
637 checksum = in_checksum;
723 sloppy->
copy(*precise);
739 precondition->
copy(*sloppy);
742 precondition = sloppy;
794 const int R[] = { R_[0], R_[1], R_[2], R_[3] };
809 errorQuda(
"Non-cpu output location not yet supported");
812 checkGaugeParam(
param);
863 bool device_calc =
false;
896 if (!h_clover && !pc_solve && !pc_solution) {
901 if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
906 #ifdef DYNAMIC_CLOVER 907 bool dynamic_clover =
twisted ? true :
false;
909 bool dynamic_clover =
false;
913 clover_param.
nDim = 4;
921 clover_param.
norm =
nullptr;
922 clover_param.
invNorm =
nullptr;
924 clover_param.
direct = h_clover || device_calc ? true :
false;
925 clover_param.
inverse = (h_clovinv || pc_solve) && !dynamic_clover ?
true :
false;
930 bool clover_update =
false;
948 inParam.
direct = h_clover ? true :
false;
949 inParam.
inverse = h_clovinv ? true :
false;
950 inParam.
clover = h_clover;
972 if (!dynamic_clover) {
987 clover_param.
direct =
true;
988 clover_param.
inverse = dynamic_clover ? false :
true;
996 if (!h_clover && !h_clovinv)
errorQuda(
"Requested clover field return but no clover host pointers set");
1006 if (!dynamic_clover) {
1020 hack->
copy(*hackOfTheHack);
1021 delete hackOfTheHack;
1062 clover_param.
direct =
true;
1065 clover_param.
direct =
false;
1283 for (
unsigned int j=0; j<basis.size(); j++) {
1284 if (basis[j].first)
delete basis[j].first;
1285 if (basis[j].second)
delete basis[j].second;
1375 char *device_reset_env =
getenv(
"QUDA_DEVICE_RESET");
1376 if (device_reset_env &&
strcmp(device_reset_env,
"1") == 0) {
1465 for (
int i=0;
i<4;
i++) diracParam.
commDim[
i] = 1;
1478 for (
int i=0;
i<4;
i++) {
1500 for (
int i=0;
i<4;
i++) {
1501 diracParam.
commDim[
i] = comms ? 1 : 0;
1541 printfQuda(
"Mass rescale: mass normalization: %d\n",
param.mass_normalization);
1543 printfQuda(
"Mass rescale: norm of source in = %g\n", nin);
1548 switch (
param.solution_type) {
1563 for(
int i=0;
i<
param.num_offset;
i++) {
1568 switch (
param.solution_type) {
1602 errorQuda(
"Solution type %d not supported",
param.solution_type);
1608 printfQuda(
"Mass rescale: mass normalization: %d\n",
param.mass_normalization);
1610 printfQuda(
"Mass rescale: norm of source out = %g\n", nin);
1627 errorQuda(
"Clover field not allocated");
1694 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1713 errorQuda(
"This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1770 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1784 errorQuda(
"This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1843 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1863 errorQuda(
"Clover field not allocated");
1913 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1934 errorQuda(
"Clover field not allocated");
1987 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2085 errorQuda(
"Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
2130 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
2144 void lanczosQuda(
int k0,
int m,
void *hp_Apsi,
void *hp_r,
void *hp_V,
2145 void *hp_alpha,
void *hp_beta,
QudaEigParam *eig_param)
2162 checkInvertParam(
param);
2167 checkEigParam(eig_param);
2191 cpuParam.
v = hp_Apsi;
2199 for(
int k = 0 ; k < m ; k++)
2201 cpuParam.
v = ((
double**)hp_V)[k];
2217 printfQuda(
"r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2220 printfQuda(
"Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
2227 for(
int k = 0 ; k < m ; k++)
2233 printfQuda(
"Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);
2243 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2251 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2259 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
2264 errorQuda(
"invalid ritz matrix type\n");
2270 for(
int k = 0 ; k < m ; k++)
2272 *h_Eig_Vec[k] = *Eig_Vec[k];
2281 for(
int k = 0 ; k < m ; k++)
2284 delete h_Eig_Vec[k];
2298 : profile(profile) {
2303 checkMultigridParam(&mg_param);
2313 errorQuda(
"Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
2348 printfQuda(
"Creating vector of null space fields of length %d\n", mg_param.
n_vec[0]);
2353 B.resize(mg_param.
n_vec[0]);
2374 return static_cast<void*
>(mg);
2386 checkMultigridParam(mg_param);
2392 if (mg->
m)
delete mg->
m;
2396 if (mg->
d)
delete mg->
d;
2436 :
d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) {
2464 ritzParam.is_composite =
true;
2465 ritzParam.is_component =
false;
2466 ritzParam.composite_dim =
param->nev*
param->deflation_grid;
2467 ritzParam.setPrecision(
param->cuda_prec_ritz);
2481 for(
int d = 0;
d < ritzParam.nDim;
d++) ritzVolume *= ritzParam.x[
d];
2485 size_t byte_estimate = (
size_t)ritzParam.composite_dim*(
size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.precision);
2486 printfQuda(
"allocating bytes: %lu (lattice volume %d, prec %d)" , byte_estimate, ritzVolume, ritzParam.precision);
2511 return static_cast<void*
>(defl);
2534 checkInvertParam(
param);
2555 if (!pc_solve)
param->spinorGiB *= 2;
2568 Dirac *dSloppy = NULL;
2575 Dirac &diracSloppy = *dSloppy;
2576 Dirac &diracPre = *dPre;
2601 bool invalidate =
false;
2603 if (cudaParam.
precision != v->Precision()) { invalidate =
true;
break; }
2620 errorQuda(
"Initial guess not supported for two-pass solver");
2631 if (nb==0.0)
errorQuda(
"Source has zero norm");
2637 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2638 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2655 printfQuda(
"Prepared solution = %g\n", nout);
2660 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
2684 if (pc_solution && !pc_solve) {
2685 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2688 if (!mat_solution && !pc_solution && pc_solve) {
2689 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2692 if (!mat_solution && norm_error_solve) {
2693 errorQuda(
"Normal-error solve requires Mat solution");
2697 errorQuda(
"Multigrid preconditioning only supported for direct solves");
2700 if (
param->use_resident_chrono && (direct_solve || norm_error_solve) ){
2701 errorQuda(
"Chronological forcasting only presently supported for M^dagger M solver");
2704 if (mat_solution && !direct_solve && !norm_error_solve) {
2707 }
else if (!mat_solution && direct_solve) {
2711 (*solve)(*
out, *
in);
2718 DiracM m(
dirac), mSloppy(diracSloppy), mPre(diracPre);
2721 (*solve)(*
out, *
in);
2724 }
else if (!norm_error_solve) {
2734 for (
unsigned int j=0; j<basis.size(); j++) m(*basis[j].second, *basis[j].first,
tmp,
tmp2);
2736 bool orthogonal =
true;
2737 bool apply_mat =
false;
2745 (*solve)(*
out, *
in);
2773 if (!
param->make_resident_solution) {
2781 if (
param->make_resident_chrono) {
2782 int i =
param->chrono_index;
2789 if ((
int)basis.size() <
param->max_chrono_dim) {
2796 for (
unsigned int j=basis.size()-1; j>0; j--) basis[j].first = basis[j-1].first;
2797 basis[0].first =
tmp;
2798 *(basis[0]).first = *
x;
2801 if (
param->compute_action) {
2803 param->action[0] = action.real();
2804 param->action[1] = action.imag();
2810 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2820 if (!
param->make_resident_solution) {
2864 checkInvertParam(
param);
2885 if (!pc_solve)
param->spinorGiB *= 2;
2898 Dirac *dSloppy = NULL;
2905 Dirac &diracSloppy = *dSloppy;
2906 Dirac &diracPre = *dPre;
2929 hp_x =
new void* [
param->num_src ];
2932 hp_b =
new void* [
param->num_src];
2934 for(
int i=0;
i <
param->num_src;
i++){
2941 std::vector<ColorSpinorField*> h_b;
2942 h_b.resize(
param->num_src);
2943 for(
int i=0;
i <
param->num_src;
i++) {
2944 cpuParam.
v = hp_b[
i];
2950 std::vector<ColorSpinorField*> h_x;
2951 h_x.resize(
param->num_src);
2953 for(
int i=0;
i <
param->num_src;
i++) {
2954 cpuParam.
v = hp_x[
i];
2974 for(
int i=0;
i <
param->num_src;
i++) {
2975 b->Component(
i) = *h_b[
i];
2984 errorQuda(
"Initial guess not supported for two-pass solver");
2991 for(
int i=0;
i <
param->num_src;
i++) {
2992 x->Component(
i) = *h_x[
i];
3006 double * nb =
new double[
param->num_src];
3007 for(
int i=0;
i <
param->num_src;
i++) {
3009 printfQuda(
"Source %i: CPU = %g, CUDA copy = %g\n",
i, nb[
i], nb[
i]);
3010 if (nb[
i]==0.0)
errorQuda(
"Source has zero norm");
3016 printfQuda(
"Source %i: CPU = %g, CUDA copy = %g\n",
i, nh_b, nb[
i]);
3017 printfQuda(
"Solution %i: CPU = %g, CUDA copy = %g\n",
i, nh_x, nx);
3025 for(
int i=0;
i <
param->num_src;
i++) {
3031 for(
int i=0;
i <
param->num_src;
i++) {
3039 for(
int i=0;
i <
param->num_src;
i++) {
3044 printfQuda(
"Prepared solution %i = %g\n",
i, nout);
3049 printfQuda(
"Prepared source %i post mass rescale = %g\n",
i, nin);
3074 if (pc_solution && !pc_solve) {
3075 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
3078 if (!mat_solution && !pc_solution && pc_solve) {
3079 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3082 if (!mat_solution && norm_error_solve) {
3083 errorQuda(
"Normal-error solve requires Mat solution");
3086 if (
param->inv_type_precondition ==
QUDA_MG_INVERTER && (pc_solve || pc_solution || !direct_solve || !mat_solution))
3087 errorQuda(
"Multigrid preconditioning only supported for direct non-red-black solve");
3089 if (mat_solution && !direct_solve && !norm_error_solve) {
3090 for(
int i=0;
i <
param->num_src;
i++) {
3094 }
else if (!mat_solution && direct_solve) {
3099 for(
int i=0;
i <
param->num_src;
i++) {
3107 DiracM m(
dirac), mSloppy(diracSloppy), mPre(diracPre);
3113 }
else if (!norm_error_solve) {
3122 errorQuda(
"norm_error_solve not supported in multi source solve");
3133 for(
int i=0;
i <
param->num_src;
i++) {
3141 for(
int i=0;
i<
param->num_src;
i++){
3147 for(
int i=0;
i<
param->num_src;
i++){
3154 if (!
param->make_resident_solution) {
3156 for(
int i=0;
i<
param->num_src;
i++){
3157 *h_x[
i] =
x->Component(
i);
3163 for(
int i=0;
i<
param->num_src;
i++){
3166 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3171 for(
int i=0;
i <
param->num_src;
i++){
3218 checkInvertParam(
param);
3224 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
3235 errorQuda(
"Multi-shift solver does not support MAT or MATPC solution types");
3238 errorQuda(
"Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
3240 if (pc_solution & !pc_solve) {
3241 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
3243 if (!pc_solution & pc_solve) {
3244 errorQuda(
"In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
3249 if( !pc_solve)
param->spinorGiB *= 2;
3256 errorQuda(
"QUDA only currently supports multi-shift CG");
3267 for (
int i=0;
i<
param->num_offset-1;
i++) {
3268 for (
int j=
i+1; j<
param->num_offset; j++) {
3270 errorQuda(
"Offsets must be ordered from smallest to largest");
3276 hp_x =
new void* [
param->num_offset ];
3279 for(
int i=0;
i <
param->num_offset;
i++){
3296 Dirac *dSloppy = NULL;
3302 Dirac &diracSloppy = *dSloppy;
3305 std::vector<ColorSpinorField*>
x;
3306 x.resize(
param->num_offset);
3320 std::vector<ColorSpinorField*> h_x;
3321 h_x.resize(
param->num_offset);
3324 for(
int i=0;
i <
param->num_offset;
i++) {
3325 cpuParam.
v = hp_x[
i];
3343 bool invalidate =
false;
3345 if (cudaParam.
precision != v->Precision()) { invalidate =
true;
break; }
3365 if (nb==0.0)
errorQuda(
"Source has zero norm");
3369 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3389 if (
param->compute_true_res) {
3396 #define REFINE_INCREASING_MASS 3397 #ifdef REFINE_INCREASING_MASS 3398 for(
int i=0;
i <
param->num_offset;
i++) {
3400 for(
int i=
param->num_offset-1;
i >= 0;
i--) {
3403 param->true_res_hq_offset[
i] : 0;
3405 param->tol_hq_offset[
i] : 0;
3415 const double iter_tol = (
param->iter_res_offset[
i] < prec_tol ? prec_tol : (
param->iter_res_offset[
i] *1.1));
3416 const double refine_tol = (
param->tol_offset[
i] == 0.0 ? iter_tol :
param->tol_offset[
i]);
3418 if ((
param->true_res_offset[
i] > refine_tol || rsd_hq >
tol_hq)) {
3420 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
3435 m.shift =
param->offset[
i];
3441 #ifdef REFINE_INCREASING_MASS 3442 const int nRefine =
i+1;
3444 const int nRefine =
param->num_offset -
i + 1;
3447 std::vector<ColorSpinorField*> q;
3449 std::vector<ColorSpinorField*>
z;
3454 for(
int j=0; j < nRefine; j++) {
3460 #ifdef REFINE_INCREASING_MASS 3461 for (
int j=1; j<nRefine; j++) *
z[j] = *
x[j];
3463 for (
int j=1; j<nRefine; j++) *
z[j] = *
x[
param->num_offset-j];
3466 bool orthogonal =
true;
3467 bool apply_mat =
true;
3472 for(
int j=0; j < nRefine; j++) {
3479 solverParam.
iter = 0;
3481 solverParam.
tol = (
param->tol_offset[
i] > 0.0 ?
param->tol_offset[
i] : iter_tol);
3502 for(
int i=0;
i <
param->num_offset;
i++) {
3508 if (
param->compute_action) {
3511 param->action[0] = action.real();
3512 param->action[1] = action.imag();
3515 for(
int i=0;
i <
param->num_offset;
i++) {
3525 if (!
param->make_resident_solution) *h_x[
i] = *
x[
i];
3531 if (!
param->make_resident_solution) {
3539 for(
int i=0;
i <
param->num_offset;
i++){
3568 checkGaugeParam(
param);
3572 const double max_error = 1
e-10;
3631 if (ulink) cudaUnitarizedLink->
saveCPUField(cpuUnitarizedLink);
3639 if (ulink)
delete cudaUnitarizedLink;
3640 delete cudaInLinkEx;
3645 errorQuda(
"Fat-link has not been built");
3646 #endif // GPU_FATLINK 3656 for(
int dir=0; dir<4; ++dir) face_size[dir] = (volume/
param.x[dir])/2;
3657 pad = *std::max_element(face_size, face_size+4);
3666 #ifdef GPU_GAUGE_FORCE 3750 delete cudaSiteLink;
3760 if (cpuSiteLink)
delete cpuSiteLink;
3775 errorQuda(
"Gauge force has not been built");
3776 #endif // GPU_GAUGE_FORCE 3818 errorQuda(
"Only scalar and vector geometries are supported\n");
3896 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] =
gParam.
x[dir];
3915 errorQuda(
"Resident gauge field is required");
3921 std::vector<ColorSpinorField*>
X(nvector);
3926 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
3938 for (
int i=0;
i<nvector;
i++) {
3942 else errorQuda(
"%s requires resident solution", __func__);
3963 for (
int i=0;
i<nvector;
i++) {
3995 for (
int i=0;
i<nvector;
i++)
delete X[
i];
4006 const double level2_coeff[6],
4007 const double fat7_coeff[6],
4008 const void*
const w_link,
4009 const void*
const v_link,
4010 const void*
const u_link,
4017 #ifdef GPU_STAGGERED_OPROD 4018 using namespace quda;
4039 const double hisq_force_filter = 5
e-5;
4040 const double max_det_error = 1
e-10;
4041 const bool allow_svd =
true;
4042 const bool svd_only =
false;
4043 const double svd_rel_err = 1
e-8;
4044 const double svd_abs_err = 1
e-8;
4049 double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4062 param.gauge = (
void*)w_link;
4064 param.gauge = (
void*)v_link;
4066 param.gauge = (
void*)u_link;
4081 for (
int dir=0; dir<4; ++dir) {
4082 param.x[dir] += 2*
R[dir];
4101 for (
int dir=0; dir<4; ++dir) qParam.
x[dir] = oParam.
x[dir];
4111 qParam.
v = fermion[0];
4114 GaugeField *oprod[2] = {stapleOprod, naikOprod};
4117 for(
int i=0;
i<num_terms; ++
i){
4121 qParam.
v = fermion[
i];
4126 cudaQuark = cpuQuark;
4136 oneLinkOprod->
copy(*stapleOprod);
4137 ax(level2_coeff[0], *oneLinkOprod);
4138 GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
4141 for(
int i=0;
i<num_naik_terms; ++
i){
4145 qParam.
v = fermion[
i + num_terms - num_naik_terms];
4150 cudaQuark = cpuQuark;
4167 delete oneLinkOprod;
4205 cudaMemset((
void**)(cudaOutForce->
Gauge_p()), 0, cudaOutForce->
Bytes());
4223 if (
gParam->use_resident_mom) {
4228 if (
gParam->return_result_mom) {
4237 if (!
gParam->make_resident_mom) {
4242 delete cudaOutForce;
4250 errorQuda(
"HISQ force has not been built");
4255 double *
coeff,
double kappa2,
double ck,
4256 int nvector,
double multiplicity,
void *gauge,
4260 using namespace quda;
4294 for(
int dir=0; dir<4; ++dir) qParam.
x[dir] = fParam.
x[dir];
4301 std::vector<ColorSpinorField*> quarkX, quarkP;
4302 for (
int i=0;
i<nvector;
i++) {
4325 errorQuda(
"solutionResident.size() %lu does not match number of shifts %d",
4338 std::vector<double> force_coeff(nvector);
4340 for(
int i=0;
i<nvector;
i++){
4357 x.Even() = cpuQuarkX;
4375 force_coeff[
i] = 2.0*dt*
coeff[
i]*kappa2;
4392 std::vector< std::vector<double> > ferm_epsilon(nvector);
4394 ferm_epsilon[
shift].reserve(2);
4408 if (u != &gaugeEx)
delete u;
4420 for (
int i=0;
i<nvector;
i++) {
4450 checkGaugeParam(
param);
4508 (
bool)conj_mom, (
bool)exact);
4523 delete cudaOutGauge;
4548 checkGaugeParam(
param);
4606 checkGaugeParam(
param);
4662 checkGaugeParam(
param);
4755 errorQuda(
"Fortran multi-shift solver presently only supports QUDA_TIFR_PADDED_DIRAC_ORDER");
4760 for (
int i=0;
i<
param->num_offset;
i++) hp_x[
i] = static_cast<char*>(h_x) +
i*cb_offset;
4768 cudaHostRegister(
ptr, *
bytes, cudaHostRegisterDefault);
4773 cudaHostUnregister(
ptr);
4785 bool *conj_mom,
bool *exact,
4790 static inline int opp(
int dir) {
return 7-dir; }
4796 if (num_loop_types >= 1)
4797 for(
int i=0;
i<4; ++
i){
4798 if(
i==dir)
continue;
4804 if (num_loop_types >= 2)
4805 for(
int i=0;
i<4; ++
i){
4806 if(
i==dir)
continue;
4821 if (num_loop_types >= 3) {
4823 for(
int i=0;
i<4; ++
i){
4824 for(
int j=0; j<4; ++j){
4825 if(
i==dir || j==dir ||
i==j)
continue;
4844 switch (*num_loop_types) {
4855 errorQuda(
"Invalid num_loop_types = %d\n", *num_loop_types);
4858 double *loop_coeff =
static_cast<double*
>(
safe_malloc(numPaths*
sizeof(
double)));
4859 int *path_length =
static_cast<int*
>(
safe_malloc(numPaths*
sizeof(
int)));
4861 if (*num_loop_types >= 1)
for(
int i= 0;
i< 6; ++
i) {
4862 loop_coeff[
i] =
coeff[0];
4865 if (*num_loop_types >= 2)
for(
int i= 6;
i<24; ++
i) {
4866 loop_coeff[
i] =
coeff[1];
4869 if (*num_loop_types >= 3)
for(
int i=24;
i<48; ++
i) {
4870 loop_coeff[
i] =
coeff[2];
4874 int** input_path_buf[4];
4875 for(
int dir=0; dir<4; ++dir){
4876 input_path_buf[dir] =
static_cast<int**
>(
safe_malloc(numPaths*
sizeof(
int*)));
4877 for(
int i=0;
i<numPaths; ++
i){
4878 input_path_buf[dir][
i] =
static_cast<int*
>(
safe_malloc(path_length[
i]*
sizeof(
int)));
4887 for(
int dir=0; dir<4; ++dir){
4888 for(
int i=0;
i<numPaths; ++
i)
host_free(input_path_buf[dir][
i]);
4931 static int bqcd_rank_from_coords(
const int *coords,
void *fdata)
4933 int *dims =
static_cast<int *
>(fdata);
4935 int rank = coords[3];
4936 for (
int i = 2;
i >= 0;
i--) {
4955 bool pack_ = *
pack ? true :
false;
4963 #ifdef GPU_GAUGE_TOOLS 4968 errorQuda(
"Cannot generate Gauss GaugeField as there is no resident gauge field");
5024 unsigned int nSteps,
double alpha)
5037 printfQuda(
"Wuppertal smearing done with gaugeSmeared\n");
5045 printfQuda(
"Wuppertal smearing done with gaugePrecise\n");
5065 for (
unsigned int i=0;
i<nSteps;
i++) {
5082 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
5110 printfQuda(
"Plaquette after 0 APE steps: %le %le %le\n", plq.x, plq.y, plq.z);
5113 for (
unsigned int i=0;
i<nSteps;
i++) {
5119 delete cudaGaugeTemp;
5125 printfQuda(
"Plaquette after %d APE steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5145 printfQuda(
"Plaquette after 0 STOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5148 for (
unsigned int i=0;
i<nSteps;
i++) {
5154 delete cudaGaugeTemp;
5160 printfQuda(
"Plaquette after %d STOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5180 printfQuda(
"Plaquette after 0 OvrImpSTOUT steps: %le %le %le\n", plq.x, plq.y, plq.z);
5183 for (
unsigned int i=0;
i<nSteps;
i++) {
5189 delete cudaGaugeTemp;
5195 printfQuda(
"Plaquette after %d OvrImpSTOUT steps: %le %le %le\n", nSteps, plq.x, plq.y, plq.z);
5203 const unsigned int verbose_interval,
const double relax_boost,
const double tolerance,
const unsigned int reunit_interval, \
5209 checkGaugeParam(
param);
5243 gaugefixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5244 reunit_interval, stopWtheta);
5251 gaugefixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, \
5252 reunit_interval, stopWtheta);
5288 const unsigned int verbose_interval,
const double alpha,
const unsigned int autotune,
const double tolerance, \
5294 checkGaugeParam(
param);
5330 gaugefixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
5378 errorQuda(
"Precision not supported for contractions\n");
5391 errorQuda(
"Precision not supported for contractions\n");
void new_quda_invert_param_(QudaInvertParam *param)
QudaCloverFieldOrder order
static QudaGaugeParam qudaGaugeParam
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
QudaDiracFieldOrder dirac_order
QudaMassNormalization mass_normalization
#define qudaMemcpy(dst, src, count, kind)
QudaReconstructType reconstruct_sloppy
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
void Init()
Initialize CURAND RNG states.
DiracMatrix * matSmoothSloppy
void fatLongKSLink(cudaGaugeField *fat, cudaGaugeField *lng, const cudaGaugeField &gauge, const double *coeff)
Compute the fat and long links for an improved staggered (Kogut-Susskind) fermions.
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
QudaGhostExchange ghostExchange
void freeCloverQuda(void)
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
static TimeProfile profileStaggeredForce("computeStaggeredForceQuda")
Profiler for computeHISQForceQuda.
void * createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
double b_5[QUDA_MAX_DWF_LS]
int commDimPartitioned(int dir)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
double3 plaquette(const GaugeField &U, QudaFieldLocation location)
QudaFieldLocation clover_location
enum QudaPrecision_s QudaPrecision
void kinetic_quda_(double *kin, void *momentum, QudaGaugeParam *param)
Evaluate the kinetic (momentum) contribution to classical Hamiltonian for Hybrid Monte Carlo...
static TimeProfile profileFatLink("computeKSLinkQuda")
Profiler for computeGaugeForceQuda.
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *h_force, void **x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
double momActionQuda(void *momentum, QudaGaugeParam *param)
void * V(bool inverse=false)
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const w_link, const void *const v_link, const void *const u_link, void **fermion, int num_terms, int num_naik_terms, double **coeff, QudaGaugeParam *gParam)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
static TimeProfile profileGaugeUpdate("updateGaugeFieldQuda")
Profiler for createExtendedGaugeField.
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
Set the constant parameters for the force unitarization.
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
QudaVerbosity getVerbosity()
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
static TimeProfile profileQCharge("qChargeQuda")
Profiler for APEQuda.
void createSmoother()
Create the smoothers.
double norm2(const ColorSpinorField &a)
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
QudaDslashType dslash_type
void setPrecision(QudaPrecision precision)
Helper function for setting the precision and corresponding field order for QUDA internal fields...
QudaReconstructType reconstruct_precondition
QudaInverterType inv_type
Fortran interface functions.
double c_5[QUDA_MAX_DWF_LS]
int return_clover_inverse
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location)
QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]
void destroySmoother()
Free the smoothers.
void performSTOUTnStep(unsigned int nSteps, double rho)
__host__ __device__ ValueType sqrt(ValueType x)
cudaGaugeField *& gaugeFatExtended
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void STOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho)
std::complex< double > Complex
void setOutputPrefix(const char *prefix)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaussGaugeQuda(long seed)
void plaq_quda_(double plaq[3])
static TimeProfile profileMulti("invertMultiShiftQuda")
Profiler for computeFatLinkQuda.
static bool reunit_svd_only
static TimeProfile profileOvrImpSTOUT("OvrImpSTOUTQuda")
Profiler for projectSU3Quda.
cudaColorSpinorField * tmp
cudaGaugeField * gaugeLongPrecise
static TimeProfile profileGaugeForce("computeGaugeForceQuda")
Profiler for updateGaugeFieldQuda.
static ColorSpinorField * Create(const ColorSpinorParam ¶m)
static TimeProfile profileAPE("APEQuda")
Profiler for STOUTQuda.
#define QUDA_VERSION_MINOR
__host__ __device__ void copy(T1 &a, const T2 &b)
void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
void ax(const double &a, ColorSpinorField &x)
static TimeProfile profilePlaq("plaqQuda")
Profiler for wuppertalQuda.
void free_clover_quda_(void)
QudaGaugeParam gauge_param
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
QudaFieldGeometry Geometry() const
static cudaGaugeField * createExtendedGauge(cudaGaugeField &in, const int *R, TimeProfile &profile, bool redundant_comms=false, QudaReconstructType recon=QUDA_RECONSTRUCT_INVALID)
double pow(double, double)
int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with overrelaxation with support for single and multi GPU.
void massRescale(cudaColorSpinorField &b, QudaInvertParam ¶m)
ColorSpinorField & Component(const int idx) const
std::vector< std::vector< std::pair< ColorSpinorField *, ColorSpinorField * > > > chronoResident(QUDA_MAX_CHRONO)
void loadSloppyGaugeQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
void destroyDeflationQuda(void *df)
QudaGaugeFieldOrder gauge_order
void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **h_p, double *coeff, double kappa2, double ck, int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
static TimeProfile profileInit2End("initQuda-endQuda", false)
static double svd_rel_error
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param)
static TimeProfile profileMomAction("momActionQuda")
Profiler for endQuda.
QudaSiteSubset siteSubset
void exit(int) __attribute__((noreturn))
QudaPrecision clover_cuda_prec_sloppy
cudaGaugeField * gaugeLongExtended
QudaFieldLocation input_location
void solve(Complex *psi, std::vector< ColorSpinorField *> &p, std::vector< ColorSpinorField *> &q, ColorSpinorField &b)
Solve the equation A p_k psi_k = b by minimizing the residual and using Gaussian elimination.
void destroyGaugeFieldQuda(void *gauge)
void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff)
Compute the matrix tensor field necessary for the force calculation from the clover trace action...
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, double *timeinfo)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
cudaCloverField * cloverPrecondition
DeflationParam * deflParam
int n_vec[QUDA_MAX_MG_LEVEL]
QudaPrecision prec_sloppy
void init_quda_(int *dev)
void flush_pinned()
Free all outstanding pinned-memory allocations.
int getGaugePadding(GaugeFieldParam ¶m)
void gaugeGauss(GaugeField &dataDs, RNG &rngstate)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha)
char * index(const char *, int)
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
QudaSolutionType solution_type
QudaMemoryType mem_type_ritz
int strcmp(const char *__s1, const char *__s2)
QudaPrecision clover_cuda_prec
std::vector< cudaColorSpinorField * > solutionResident
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
bool is_composite
for deflation solvers:
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
double Last(QudaProfileType idx)
QudaInvertParam * invert_param
cudaDeviceProp deviceProp
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
static unsigned int delta
void init()
Initialize the memory pool allocator.
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff, long long *flops=nullptr)
Compute the long-link contribution to the fermion force.
QudaFieldLocation output_location
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
QudaPrecision clover_cuda_prec_precondition
QudaFieldLocation location
QudaInvertParam inv_param
int setNumaAffinityNVML(int deviceid)
bool canReuseResidentGauge(QudaInvertParam *inv_param)
void apply_staggered_phase_quda_()
Apply the staggered phase factors to the resident gauge field.
VOLATILE spinorFloat kappa
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6], long long *flops=nullptr)
Compute the fat-link contribution to the fermion force.
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
void updateInvertParam(QudaInvertParam ¶m, int offset=-1)
cpuGaugeField * cpuFatLink
QudaFieldOrder fieldOrder
bool StaggeredPhaseApplied() const
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
Updates the multigrid preconditioner for the new gauge / clover field.
void flushProfile()
Flush profile contents, setting all counts to zero.
static bool initialized
Profiler for initQuda.
void Release()
Release Device memory for CURAND RNG states.
cudaCloverField * cloverSloppy
multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
int commDim[QUDA_MAX_DIM]
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileSTOUT("STOUTQuda")
Profiler for OvrImpSTOUTQuda.
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
cudaGaugeField *& gaugeFatPrecondition
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
static const std::string quda_version
void setPrecision(QudaPrecision precision)
void flush_device()
Free all outstanding device-memory allocations.
void Dagger(QudaDagType dag) const
static Solver * create(SolverParam ¶m, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
for(int s=0;s< param.dc.Ls;s++)
QudaPrecision cuda_prec_precondition
void free_sloppy_gauge_quda_()
QudaCloverFieldOrder clover_order
Class declaration to initialize and hold CURAND RNG states.
static __inline__ size_t p
void createDslashEvents()
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
double Anisotropy() const
QudaGammaBasis gammaBasis
void remove_staggered_phase_quda_()
Remove the staggered phase factors to the resident gauge field.
static int lex_rank_from_coords(const int *coords, void *fdata)
void freeSloppyCloverQuda()
QudaGaugeFieldOrder order
void exchangeExtendedGhost(cudaColorSpinorField *spinor, int R[], int parity, cudaStream_t *stream_p)
void performAPEnStep(unsigned int nSteps, double alpha)
static TimeProfile profileClover("loadCloverQuda")
Profiler for dslashQuda.
void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int nSteps, double alpha)
static unsigned int unsigned int shift
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
#define QUDA_VERSION_SUBMINOR
void unregister_pinned_quda_(void *ptr)
Pinned a pre-existing memory allocation.
QudaPrecision cuda_prec_sloppy
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
static bool invalidate_clover
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
static double unitarize_eps
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
void comm_set_gridsize_(int *grid)
void projectSU3(cudaGaugeField &U, double tol, int *fails)
Project the input gauge field onto the SU(3) group. This is a destructive operation. The number of link failures is reported so appropriate action can be taken.
int(* QudaCommsMap)(const int *coords, void *fdata)
static TimeProfile profileDslash("dslashQuda")
Profiler for invertQuda.
void saveProfile(const std::string label="")
Save profile to disk.
void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
QudaSolutionType RitzMat_lanczos
static TimeProfile profileGauge("loadGaugeQuda")
Profile for loadCloverQuda.
cudaCloverField * cloverPrecise
enum QudaParity_s QudaParity
void register_pinned_quda_(void *ptr, size_t *bytes)
Pinned a pre-existing memory allocation.
QudaReconstructType reconstruct
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
void applyU(GaugeField &force, GaugeField &U)
static bool comms_initialized
void OvrImpSTOUTStep(GaugeField &dataDs, const GaugeField &dataOr, double rho, double epsilon)
static int * num_failures_h
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
static Eig_Solver * create(QudaEigParam ¶m, RitzMat &ritz_mat, TimeProfile &profile)
QudaFieldLocation location
static void freeGhostBuffer(void)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static TimeProfile profileGauss("gaussQuda")
Profiler for plaqQuda.
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
static TimeProfile profileProject("projectSU3Quda")
Profiler for staggeredPhaseQuda.
void * memcpy(void *__dst, const void *__src, size_t __n)
void unitarizeForce(cudaGaugeField &newForce, const cudaGaugeField &oldForce, const cudaGaugeField &gauge, int *unitarization_failed, long long *flops=NULL)
Unitarize the fermion force.
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
void zero(ColorSpinorField &a)
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
double shift
Shift term added onto operator (M^dag M + shift)
static void init_default_comms()
void setMass(double mass)
void pushVerbosity(QudaVerbosity verbosity)
static int * num_failures_d
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
int compute_clover_inverse
QudaPrecision prec_precondition
void loadSloppyCloverQuda(QudaPrecision prec_sloppy, QudaPrecision prec_precondition)
#define checkCudaErrorNoSync()
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void Mdag(ColorSpinorField &out, const ColorSpinorField &in) const
void plaqQuda(double plq[3])
static bool reunit_allow_svd
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
void hisqCompleteForce(GaugeField &momentum, const GaugeField &oprod, const GaugeField &link, long long *flops=nullptr)
Multiply the computed the force matrix by the gauge field and perform traceless anti-hermitian projec...
QudaFieldLocation location
static TimeProfile profileInvert("invertQuda")
Profiler for invertMultiShiftQuda.
cpuColorSpinorField * out
static TimeProfile profilePhase("staggeredPhaseQuda")
Profiler for contractions.
static TimeProfile GaugeFixOVRQuda("GaugeFixOVRQuda")
Profiler for toal time spend between init and end.
cudaGaugeField * gaugePrecondition
bool twisted
Clover coefficient.
static TimeProfile profileCovDev("covDevQuda")
Profiler for contractions.
deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
static double svd_abs_error
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static TimeProfile profileInit("initQuda")
Profile for loadGaugeQuda / saveGaugeQuda.
static bool redundant_comms
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
std::vector< ColorSpinorField * > B
void set_kernel_pack_t_(int *pack)
fTemporary function exposed for TIFR benchmarking
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void applyStaggeredPhase()
void * newDeflationQuda(QudaEigParam *eig_param)
void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
void printQudaMultigridParam(QudaMultigridParam *param)
void freeSloppyGaugeQuda(void)
double computeQCharge(GaugeField &Fmunu, QudaFieldLocation location)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
virtual void prepare(ColorSpinorField *&src, ColorSpinorField *&sol, ColorSpinorField &x, ColorSpinorField &b, const QudaSolutionType) const
void new_quda_gauge_param_(QudaGaugeParam *param)
void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity, TimeProfile &profile)
cudaGaugeField * fatGauge
QudaTwistFlavorType twist_flavor
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
cudaGaugeField * gaugeSmeared
static TimeProfile GaugeFixFFTQuda("GaugeFixFFTQuda")
double residue[QUDA_MAX_MULTI_SHIFT]
cudaGaugeField * cudaGauge
static TimeProfile profileExtendedGauge("createExtendedGaugeField")
Profiler for computeCloverForceQuda.
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
QudaReconstructType reconstruct
void setKernelPackT(bool pack)
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaGaugeField *& gaugeFatSloppy
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
enum QudaContractType_s QudaContractType
static TimeProfile profileEnd("endQuda")
Profiler for GaugeFixing.
QudaReconstructType Reconstruct() const
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
void flushChronoQuda(int i)
Flush the chronological history for the given index.
cudaGaugeField * longGauge
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
enum QudaVerbosity_s QudaVerbosity
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void createCloverQuda(QudaInvertParam *invertParam)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaFieldGeometry geometry
static void PrintGlobal()
void setOutputFile(FILE *outfile)
cudaGaugeField * gaugePrecise
#define mapped_malloc(size)
int use_resident_solution
static Dirac * create(const DiracParam ¶m)
cudaGaugeField *& gaugeFatPrecise
static double unscaled_shifts[QUDA_MAX_MULTI_SHIFT]
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void compute_gauge_force_quda_(void *mom, void *gauge, int *num_loop_types, double *coeff, double *dt, QudaGaugeParam *param)
Compute the gauge force and update the mometum field.
void updateR()
update the radius for halos.
static TimeProfile profileWuppertal("wuppertalQuda")
Profiler for gaussQuda.
cudaGaugeField * cudaForce
void flush_chrono_quda_(int *index)
Flush the chronological history for the given index.
static TimeProfile profileContract("contractQuda")
Profiler for contractions.
void copy(const GaugeField &src)
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
static __inline__ size_t size_t d
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
QudaDslashType dslash_type_precondition
QudaPrecision clover_cpu_prec
QudaSiteSubset siteSubset
QudaPrecision cuda_prec_ritz
static TimeProfile profileHISQForce("computeHISQForceQuda")
Profiler for plaqQuda.
void reorder_location_set(QudaFieldLocation reorder_location_)
Set whether data is reorderd on the CPU or GPU. This can set at QUDA initialization using the environ...
static void freeGhostBuffer(void)
Free statically allocated ghost buffers.
void destroyMultigridQuda(void *mg)
Free resources allocated by the multigrid solver.
void destroyDslashEvents()
char * getenv(const char *)
void performOvrImpSTOUTnStep(unsigned int nSteps, double rho, double epsilon)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
#define QUDA_VERSION_MAJOR
void setVerbosity(const QudaVerbosity verbosity)
cudaGaugeField * momResident
DiracMatrix * matResidual
virtual void reconstruct(ColorSpinorField &x, const ColorSpinorField &b, const QudaSolutionType) const
static TimeProfile profileCloverForce("computeCloverForceQuda")
Profiler for computeStaggeredForceQuda.
static void createGaugeForcePaths(int **paths, int dir, int num_loop_types)
void * newMultigridQuda(QudaMultigridParam *mg_param)
cudaGaugeField * gaugeSloppy
int comm_dim_partitioned(int dim)
void initQudaDevice(int dev)
QudaGaugeParam newQudaGaugeParam(void)
QudaInvertParam * invert_param
void checkClover(QudaInvertParam *param)
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void compute_staggered_force_quda_(void *h_mom, double *dt, double *delta, void *gauge, void *x, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms)
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()