46 #ifdef GPU_GAUGE_FORCE
51 #define MAX(a,b) ((a)>(b)? (a):(b))
52 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
54 #define spinorSiteSize 24 // real numbers per spinor
56 #define MAX_GPU_NUM_PER_NODE 16
85 static bool InitMagma =
false;
93 else printfQuda(
"\nMAGMA library was already initialized..\n");
101 else printfQuda(
"\nMAGMA library was not initialized..\n");
141 pthread_mutex_t pthread_mutex;
144 static bool initialized =
false;
153 static TimeProfile profileClover(
"loadCloverQuda");
159 static TimeProfile profileMulti(
"invertMultiShiftQuda");
162 static TimeProfile profileMultiMixed(
"invertMultiShiftMixedQuda");
165 static TimeProfile profileFatLink(
"computeKSLinkQuda");
168 static TimeProfile profileGaugeForce(
"computeGaugeForceQuda");
171 static TimeProfile profileGaugeUpdate(
"updateGaugeFieldQuda");
174 static TimeProfile profileExtendedGauge(
"createExtendedGaugeField");
178 static TimeProfile profileCloverCreate(
"createCloverQuda");
181 static TimeProfile profileCloverDerivative(
"computeCloverDerivativeQuda");
184 static TimeProfile profileCloverTrace(
"computeCloverTraceQuda");
187 static TimeProfile profileStaggeredOprod(
"computeStaggeredOprodQuda");
190 static TimeProfile profileAsqtadForce(
"computeAsqtadForceQuda");
193 static TimeProfile profileHISQForce(
"computeHISQForceQuda");
196 static TimeProfile profileHISQForceComplete(
"computeHISQForceCompleteQuda");
202 static TimeProfile profileContract(
"contractQuda");
230 static int lex_rank_from_coords(
const int *coords,
void *fdata)
234 int rank = coords[0];
235 for (
int i = 1; i < md->
ndim; i++) {
236 rank = md->
dims[i] * rank + coords[i];
245 static int qmp_rank_from_coords(
const int *coords,
void *fdata)
247 return QMP_get_node_number_from(coords);
252 static bool comms_initialized =
false;
257 errorQuda(
"Number of communication grid dimensions must be 4");
264 if (QMP_logical_topology_is_declared()) {
265 if (QMP_get_logical_number_of_dimensions() != 4) {
266 errorQuda(
"QMP logical topology must have 4 dimensions");
268 for (
int i=0; i<nDim; i++) {
269 int qdim = QMP_get_logical_dimensions()[i];
270 if(qdim != dims[i]) {
271 errorQuda(
"QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
275 func = qmp_rank_from_coords;
277 warningQuda(
"QMP logical topology is undeclared; using default lexicographical ordering");
280 map_data.
ndim = nDim;
281 for (
int i=0; i<nDim; i++) {
282 map_data.
dims[i] = dims[i];
284 fdata = (
void *) &map_data;
285 func = lex_rank_from_coords;
293 comms_initialized =
true;
297 static void init_default_comms()
299 #if defined(QMP_COMMS)
300 if (QMP_logical_topology_is_declared()) {
301 int ndim = QMP_get_logical_number_of_dimensions();
302 const int *
dims = QMP_get_logical_dimensions();
305 errorQuda(
"initQuda() called without prior call to initCommsGridQuda(),"
306 " and QMP logical topology has not been declared");
308 #elif defined(MPI_COMMS)
309 errorQuda(
"When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
311 const int dims[4] = {1, 1, 1, 1};
323 if (initialized)
return;
326 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)
329 char* cni_str = getenv(
"CUDA_NIC_INTEROP");
331 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set");
333 int cni_int = atoi(cni_str);
335 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set to 1");
340 cudaGetDeviceCount(&deviceCount);
341 if (deviceCount == 0) {
345 for(
int i=0; i<deviceCount; i++) {
355 if (!comms_initialized) {
356 errorQuda(
"initDeviceQuda() called with a negative device ordinal, but comms have not been initialized");
361 if (dev < 0 || dev >= 16)
errorQuda(
"Invalid device number %d", dev);
367 errorQuda(
"Device %d does not support CUDA", dev);
386 if(
deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);
390 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
400 if (!comms_initialized) init_default_comms();
404 #if (CUDA_VERSION >= 5050)
405 int greatestPriority;
407 cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
408 for (
int i=0; i<
Nstream-1; i++) {
409 cudaStreamCreateWithPriority(&
streams[i], cudaStreamDefault, greatestPriority);
411 cudaStreamCreateWithPriority(&
streams[Nstream-1], cudaStreamDefault, leastPriority);
413 for (
int i=0; i<
Nstream; i++) {
420 #ifdef GPU_STAGGERED_OPROD
433 if (!comms_initialized) init_default_comms();
442 pthread_mutexattr_t mutex_attr;
443 pthread_mutexattr_init(&mutex_attr);
444 pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
445 pthread_mutex_init(&pthread_mutex, &mutex_attr);
459 if (!initialized)
errorQuda(
"QUDA not initialized");
462 checkGaugeParam(param);
525 #if (__COMPUTE_CAPABILITY__ >= 200)
526 sloppy->
copy(*precise);
545 #if (__COMPUTE_CAPABILITY__ >= 200)
546 precondition->
copy(*sloppy);
548 precondition->
copy(*in);
552 precondition = sloppy;
559 for(
int i=0; i<4; ++i){
561 gauge_param.
x[i] += 2*R[i];
575 switch (param->
type) {
628 errorQuda(
"Non-cpu output location not yet supported");
630 if (!initialized)
errorQuda(
"QUDA not initialized");
631 checkGaugeParam(param);
637 switch (param->
type) {
662 bool device_calc =
false;
667 if (!initialized)
errorQuda(
"QUDA not initialized");
669 if (!h_clover && !h_clovinv) {
674 errorQuda(
"loadCloverQuda() called with neither clover term nor inverse");
680 errorQuda(
"Half precision not supported on CPU");
683 errorQuda(
"Gauge field must be loaded before clover");
686 errorQuda(
"Wrong dslash_type in loadCloverQuda()");
704 if (!h_clover && !pc_solve && !pc_solution) {
709 if (!h_clover && pc_solve && pc_solution && asymmetric && !device_calc) {
724 cpuParam.
direct = h_clover ?
true :
false;
725 cpuParam.
inverse = h_clovinv ?
true :
false;
726 cpuParam.
clover = h_clover;
739 cpuParam.
clover = h_clover;
760 clover_param.
nDim = 4;
764 clover_param.
direct = h_clover ?
true :
false;
765 clover_param.
inverse = (h_clovinv || pc_solve) ?
true :
false;
770 clover_param.
direct =
true;
773 clover_param.
direct =
false;
825 clover_param.
norm = 0;
827 clover_param.
mu2 = 0.;
828 clover_param.
nDim = 4;
829 for(
int dir=0; dir<4; ++dir) clover_param.
x[dir] =
gaugePrecise->
X()[dir];
833 clover_param.
direct =
true;
842 clover_param.
direct =
false;
845 clover_param.
mu2 = 4.*inv_param->
kappa*inv_param->
kappa*inv_param->
mu*inv_param->
mu;
848 clover_param.
direct =
true;
872 clover_param.
direct =
true;
877 clover_param.
direct =
false;
896 if (!h_clovinv && pc_solve && !device_calc) {
899 clover_param.
direct =
false;
911 cudaMemcpy((
char*)(in->
V(
false))+in->
Bytes()/2, (
char*)(hack.
V(
true))+hack.
Bytes()/2,
912 in->
Bytes()/2, cudaMemcpyDeviceToHost);
931 if (!initialized)
errorQuda(
"QUDA not initialized");
975 if (!initialized)
errorQuda(
"QUDA not initialized");
998 if (!initialized)
errorQuda(
"QUDA not initialized");
1022 if (!initialized)
return;
1034 if(cudaStapleField)
delete cudaStapleField; cudaStapleField=NULL;
1035 if(cudaStapleField1)
delete cudaStapleField1; cudaStapleField1=NULL;
1049 #ifdef GPU_STAGGERED_OPROD
1055 #if (!defined(USE_QDPJIT) && !defined(GPU_COMMS))
1060 initialized =
false;
1063 comms_initialized =
false;
1069 profileInit.
Print();
1070 profileGauge.
Print();
1071 profileCloverCreate.
Print();
1072 profileClover.
Print();
1073 profileInvert.
Print();
1074 profileMulti.
Print();
1075 profileMultiMixed.
Print();
1076 profileFatLink.
Print();
1077 profileGaugeForce.
Print();
1078 profileGaugeUpdate.
Print();
1079 profileExtendedGauge.
Print();
1080 profileCloverDerivative.
Print();
1081 profileCloverTrace.
Print();
1082 profileStaggeredOprod.
Print();
1083 profileAsqtadForce.
Print();
1084 profileHISQForce.
Print();
1085 profileContract.
Print();
1086 profileCovDev.
Print();
1118 diracParam.
Ls = inv_param->
Ls;
1123 diracParam.
Ls = inv_param->
Ls;
1124 }
else errorQuda(
"For 4D type of DWF dslash, pc must be turned on, %d", inv_param->
dslash_type);
1131 diracParam.
Ls = inv_param->
Ls;
1132 memcpy(diracParam.
b_5, inv_param->
b_5,
sizeof(
double)*inv_param->
Ls);
1133 memcpy(diracParam.
c_5, inv_param->
c_5,
sizeof(
double)*inv_param->
Ls);
1134 }
else errorQuda(
"At currently, only preconditioned Mobius DWF is supported, %d", inv_param->
dslash_type);
1175 diracParam.
m5 = inv_param->
m5;
1176 diracParam.
mu = inv_param->
mu;
1178 for (
int i=0; i<4; i++) diracParam.
commDim[i] = 1;
1192 for (
int i=0; i<4; i++) {
1215 for (
int i=0; i<4; i++) {
1247 double kappa5 = (0.5/(5.0 + param.
m5));
1253 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
1255 double nin =
norm2(b);
1256 printfQuda(
"Mass rescale: norm of source in = %g\n", nin);
1277 unscaled_shifts[i] = param.
offset[i];
1292 axCuda(4.0*kappa*kappa, b);
1298 axCuda(4.0*kappa*kappa, b);
1310 axCuda(4.0*kappa*kappa, b);
1320 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
1322 double nin =
norm2(b);
1323 printfQuda(
"Mass rescale: norm of source out = %g\n", nin);
1337 errorQuda(
"Clover field not allocated");
1339 errorQuda(
"Clover field not allocated");
1354 double cpu =
norm2(*in_h);
1355 double gpu =
norm2(in);
1385 dirac->
Dslash(out, tmp1, parity);
1387 dirac->
Dslash(out, in, parity);
1400 double cpu =
norm2(*out_h);
1401 double gpu =
norm2(out);
1402 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1416 errorQuda(
"This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1433 double cpu =
norm2(*in_h);
1434 double gpu =
norm2(in);
1456 switch (test_type) {
1458 dirac.
Dslash4(out, in, parity);
1461 dirac.
Dslash5(out, in, parity);
1476 double cpu =
norm2(*out_h);
1477 double gpu =
norm2(out);
1478 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1492 errorQuda(
"This type of dslashQuda operator is defined for QUDA_DOMAIN_WALL_$D_DSLASH and QUDA_MOBIUS_DWF_DSLASH only");
1509 double cpu =
norm2(*in_h);
1510 double gpu =
norm2(in);
1532 switch (test_type) {
1534 dirac.
Dslash4(out, in, parity);
1537 dirac.
Dslash5(out, in, parity);
1555 double cpu =
norm2(*out_h);
1556 double gpu =
norm2(out);
1557 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1577 errorQuda(
"Clover field not allocated");
1579 errorQuda(
"Clover field not allocated");
1594 double cpu =
norm2(*in_h);
1595 double gpu =
norm2(in);
1612 axCuda(0.25/(kappa*kappa), out);
1631 double cpu =
norm2(*out_h);
1632 double gpu =
norm2(out);
1633 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1651 if (!initialized)
errorQuda(
"QUDA not initialized");
1654 errorQuda(
"Clover field not allocated");
1656 errorQuda(
"Clover field not allocated");
1671 double cpu =
norm2(*in_h);
1672 double gpu =
norm2(in);
1686 dirac->
MdagM(out, in);
1694 axCuda(0.25/(kappa*kappa), out);
1699 axCuda(0.25/(kappa*kappa), out);
1711 double cpu =
norm2(*out_h);
1712 double gpu =
norm2(out);
1713 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1756 if (!initialized)
errorQuda(
"QUDA not initialized");
1763 errorQuda(
"Cannot apply the clover term for a non Wilson-clover or Twisted-mass-clover dslash");
1775 double cpu =
norm2(*in_h);
1776 double gpu =
norm2(in);
1797 if (!inverse) dirac.
Clover(out, in, parity);
1808 double cpu =
norm2(*out_h);
1809 double gpu =
norm2(out);
1810 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1824 void lanczosQuda(
int k0,
int m,
void *hp_Apsi,
void *hp_r,
void *hp_V,
1825 void *hp_alpha,
void *hp_beta,
QudaEigParam *eig_param)
1838 if (!initialized)
errorQuda(
"QUDA not initialized");
1846 checkInvertParam(param);
1847 checkEigParam(eig_param);
1863 const int *
X = cudaGauge->
X();
1871 cpuParam.
v = hp_Apsi;
1879 for(
int k = 0 ; k < m ; k++)
1881 cpuParam.
v = ((
double**)hp_V)[k];
1897 printfQuda(
"r vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
1898 cpu =
norm2(*h_Apsi);
1900 printfQuda(
"Apsi vector CPU %1.14e CUDA %1.14e\n", cpu, gpu);
1907 for(
int k = 0 ; k < m ; k++)
1911 cpu =
norm2(*h_Eig_Vec[k]);
1912 gpu =
norm2(*Eig_Vec[k]);
1913 printfQuda(
"Eig_Vec[%d] CPU %1.14e CUDA %1.14e\n", k, cpu, gpu);
1921 RitzMat ritz_mat(mat,*eig_param);
1923 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
1929 RitzMat ritz_mat(mat,*eig_param);
1931 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
1937 RitzMat ritz_mat(mat,*eig_param);
1939 (*eig_solve)((
double*)hp_alpha, (
double*)hp_beta, Eig_Vec, *r, *Apsi, k0, m);
1944 errorQuda(
"invalid ritz matrix type\n");
1950 for(
int k = 0 ; k < m ; k++)
1952 *h_Eig_Vec[k] = *Eig_Vec[k];
1961 for(
int k = 0 ; k < m ; k++)
1964 delete h_Eig_Vec[k];
1987 if (!initialized)
errorQuda(
"QUDA not initialized");
1995 checkInvertParam(param);
2026 Dirac *dSloppy = NULL;
2033 Dirac &diracSloppy = *dSloppy;
2034 Dirac &diracPre = *dPre;
2043 const int *
X = cudaGauge->
X();
2065 errorQuda(
"Initial guess not supported for two-pass solver");
2076 double nb =
norm2(*b);
2077 if (nb==0.0)
errorQuda(
"Source has zero norm");
2080 double nh_b =
norm2(*h_b);
2081 double nh_x =
norm2(*h_x);
2082 double nx =
norm2(*x);
2083 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2084 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2097 double nin =
norm2(*in);
2098 double nout =
norm2(*out);
2100 printfQuda(
"Prepared solution = %g\n", nout);
2104 double nin =
norm2(*in);
2105 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
2129 if (pc_solution && !pc_solve) {
2130 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2133 if (!mat_solution && !pc_solution && pc_solve) {
2134 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2137 if (!mat_solution && norm_error_solve) {
2138 errorQuda(
"Normal-error solve requires Mat solution");
2141 if (mat_solution && !direct_solve && !norm_error_solve) {
2143 dirac.
Mdag(*in, tmp);
2144 }
else if (!mat_solution && direct_solve) {
2145 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2148 (*solve)(*
out, *
in);
2155 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2158 (*solve)(*
out, *
in);
2161 }
else if (!norm_error_solve) {
2162 DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2165 (*solve)(*
out, *
in);
2169 DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2174 dirac.
Mdag(*out, tmp);
2180 double nx =
norm2(*x);
2195 double nx =
norm2(*x);
2196 double nh_x =
norm2(*h_x);
2197 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2225 if (!initialized)
errorQuda(
"QUDA not initialized");
2233 checkInvertParam(param);
2264 Dirac *dSloppy = NULL;
2271 Dirac &diracSloppy = *dSloppy;
2272 Dirac &diracPre = *dPre;
2281 const int *
X = cudaGauge->
X();
2303 errorQuda(
"Initial guess not supported for two-pass solver");
2314 double nb =
norm2(*b);
2315 if (nb==0.0)
errorQuda(
"Source has zero norm");
2318 double nh_b =
norm2(*h_b);
2319 double nh_x =
norm2(*h_x);
2320 double nx =
norm2(*x);
2321 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2322 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
2335 double nin =
norm2(*in);
2336 double nout =
norm2(*out);
2338 printfQuda(
"Prepared solution = %g\n", nout);
2342 double nin =
norm2(*in);
2343 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
2367 if (pc_solution && !pc_solve) {
2368 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2371 if (!mat_solution && !pc_solution && pc_solve) {
2372 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
2375 if (!mat_solution && norm_error_solve) {
2376 errorQuda(
"Normal-error solve requires Mat solution");
2379 if (mat_solution && !direct_solve && !norm_error_solve) {
2381 dirac.
Mdag(*in, tmp);
2382 }
else if (!mat_solution && direct_solve) {
2383 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2386 (*solve)(*
out, *
in);
2393 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2396 (*solve)(*
out, *
in);
2399 }
else if (!norm_error_solve){
2400 DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2403 (*solve)(*
out, *
in);
2407 DiracMMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
2412 dirac.
Mdag(*out, tmp);
2418 double nx =
norm2(*x);
2431 cudaParam.
x[0] *= 2;
2442 double nx =
norm2(*x);
2443 double nh_x =
norm2(*h_x);
2444 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
2483 if (!initialized)
errorQuda(
"QUDA not initialized");
2486 checkInvertParam(param);
2489 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
2500 errorQuda(
"Multi-shift solver does not support MAT or MATPC solution types");
2503 errorQuda(
"Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
2505 if (pc_solution & !pc_solve) {
2506 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2508 if (!pc_solution & pc_solve) {
2509 errorQuda(
"In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
2521 errorQuda(
"QUDA only currently supports multi-shift CG");
2535 errorQuda(
"Offsets must be ordered from smallest to largest");
2561 Dirac *dSloppy = NULL;
2567 Dirac &diracSloppy = *dSloppy;
2588 cpuParam.
v = hp_x[i];
2610 double nb =
norm2(*b);
2611 if (nb==0.0)
errorQuda(
"Solution has zero norm");
2614 double nh_b =
norm2(*h_b);
2615 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2629 MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
2677 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
2692 m.shift = param->
offset[i];
2697 solverParam.
iter = 0;
2702 CG cg(m, mSloppy, solverParam, profileMulti);
2719 param->
offset[i] = unscaled_shifts[i];
2729 double nx =
norm2(*x[i]);
2776 if (!initialized)
errorQuda(
"QUDA not initialized");
2779 checkInvertParam(param);
2782 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
2793 errorQuda(
"Multi-shift solver does not support MAT or MATPC solution types");
2796 errorQuda(
"Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
2798 if (pc_solution & !pc_solve) {
2799 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
2801 if (!pc_solution & pc_solve) {
2802 errorQuda(
"In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
2814 errorQuda(
"QUDA only currently supports multi-shift CG");
2828 errorQuda(
"Offsets must be ordered from smallest to largest");
2833 void **hp_xe =
new void* [ param->
num_offset ];
2834 void **hp_xo =
new void* [ param->
num_offset ];
2835 void **hp_ye =
new void* [ param->
num_offset ];
2836 void **hp_yo =
new void* [ param->
num_offset ];
2840 hp_xe[i] = _hp_xe[i];
2841 hp_xo[i] = _hp_xo[i];
2842 hp_ye[i] = _hp_ye[i];
2843 hp_yo[i] = _hp_yo[i];
2855 Dirac *dSloppy = NULL;
2861 Dirac &diracSloppy = *dSloppy;
2886 cpuParam.
v = hp_xe[i];
2891 cpuParam.
v = hp_xo[i];
2896 cpuParam.
v = hp_ye[i];
2901 cpuParam.
v = hp_yo[i];
2927 double nb =
norm2(*b);
2928 if (nb==0.0)
errorQuda(
"Solution has zero norm");
2931 double nh_b =
norm2(*h_b);
2932 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
2946 MultiShiftCG cg_m(m, mSloppy, solverParam, profileMulti);
2965 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
2980 m.shift = param->
offset[i];
2989 CG cg(m, mSloppy, solverParam, profileMulti);
3006 param->
offset[i] = unscaled_shifts[i];
3016 double nx =
norm2(*xe[i]);
3021 dirac.
M(*ye, *xe[i]);
3081 if (!initialized)
errorQuda(
"QUDA not initialized");
3089 checkInvertParam(param);
3128 errorQuda(
"\nInitCG requires sloppy gauge field in half precision. It seems that the half precision field is not loaded,\n please check you cuda_prec_precondition parameter.\n");
3141 for (
int i=0; i<4; i++) {
3142 diracHalfPrecParam.
commDim[i] = 1;
3154 Dirac &diracSloppy = *dSloppy;
3155 Dirac &diracHalf = *dHalfPrec;
3156 Dirac &diracDeflate = *d;
3166 const int *
X = cudaGauge->
X();
3188 errorQuda(
"Initial guess not supported for two-pass solver");
3199 double nb =
norm2(*b);
3200 if (nb==0.0)
errorQuda(
"Source has zero norm");
3203 double nh_b =
norm2(*h_b);
3204 double nh_x =
norm2(*h_x);
3205 double nx =
norm2(*x);
3206 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
3207 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
3221 double nin =
norm2(*in);
3222 double nout =
norm2(*out);
3224 printfQuda(
"Prepared solution = %g\n", nout);
3228 double nin =
norm2(*in);
3229 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
3233 errorQuda(
"\nIncorrect eigenvector space setup...\n");
3235 if (pc_solution && !pc_solve) {
3236 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
3239 if (!mat_solution && !pc_solution && pc_solve) {
3240 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
3243 if (mat_solution && !direct_solve) {
3245 dirac.
Mdag(*in, tmp);
3250 DiracMdagM m(dirac), mSloppy(diracSloppy), mHalf(diracHalf), mDeflate(diracDeflate);
3262 printfQuda(
"\nDelete incremental EigCG solver resources...\n");
3273 errorQuda(
"\nUnknown deflated solver...\n");
3277 double nx =
norm2(*x);
3292 double nx =
norm2(*x);
3293 double nh_x =
norm2(*h_x);
3294 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
3322 #include <sys/time.h>
3328 int Vsh_x = X[1]*X[2]*X[3]/2;
3329 int Vsh_y = X[0]*X[2]*X[3]/2;
3330 int Vsh_z = X[0]*X[1]*X[3]/2;
3332 int Vsh_t = X[0]*X[1]*X[2]/2;
3335 for (
int i=0; i<4; i++) E[i] = X[i] + 4;
3344 Vh_2d_max =
MAX(Vh_2d_max, X[0]*X[3]/2);
3345 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[2]/2);
3346 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[3]/2);
3347 Vh_2d_max =
MAX(Vh_2d_max, X[2]*X[3]/2);
3384 for(
int dir=0; dir<4; ++dir)
gParam.
x[dir] = qudaGaugeParam->
X[dir];
3386 for(
int dir=0; dir<4; ++dir)
gParam.
x[dir] = qudaGaugeParam->
X[dir] + 4;
3389 if (cudaStapleField == NULL || cudaStapleField1 == NULL) {
3407 llfat_cuda(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
3409 llfat_cuda_ex(cudaFatLink, cudaLongLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
3415 delete cudaStapleField; cudaStapleField = NULL;
3416 delete cudaStapleField1; cudaStapleField1 = NULL;
3427 #include <dslash_init.cuh>
3437 const double unitarize_eps = 1e-14;
3438 const double max_error = 1e-10;
3439 const int reunit_allow_svd = 1;
3440 const int reunit_svd_only = 0;
3441 const double svd_rel_error = 1e-6;
3442 const double svd_abs_error = 1e-6;
3444 reunit_allow_svd, reunit_svd_only,
3445 svd_rel_error, svd_abs_error);
3456 for(
int dir=0; dir<4; ++dir){ qudaGaugeParam_ex->
X[dir] = param->
X[dir]+4; }
3506 for(
int dir=0; dir<4; ++dir)
gParam.
x[dir] = qudaGaugeParam_ex->
X[dir];
3519 inlinkPtr = cudaInLink;
3522 inlinkPtr = cudaInLinkEx;
3534 int R[4] = {2, 2, 2, 2};
3540 quda::computeFatLinkCore(inlinkPtr, const_cast<double*>(path_coeff), param, method, cudaFatLink, cudaLongLink, profileFatLink);
3545 int* num_failures_dev;
3546 cudaMalloc((
void**)&num_failures_dev,
sizeof(
int));
3547 cudaMemset(num_failures_dev, 0,
sizeof(
int));
3548 if(num_failures_dev == NULL)
errorQuda(
"cudaMalloc fialed for dev_pointer\n");
3557 cudaMemcpy(&num_failures, num_failures_dev,
sizeof(
int), cudaMemcpyDeviceToHost);
3559 cudaFree(num_failures_dev);
3561 errorQuda(
"Error in the unitarization component of the hisq fattening\n");
3574 if(longlink)
delete cudaLongLink;
3577 delete cudaUnitarizedLink;
3578 if(cudaInLinkEx)
delete cudaInLinkEx;
3586 #endif // GPU_FATLINK
3591 int volume = param.
x[0]*param.
x[1]*param.
x[2]*param.
x[3];
3593 for(
int dir=0; dir<4; ++dir) face_size[dir] = (volume/param.
x[dir])/2;
3594 pad = *std::max_element(face_size, face_size+4);
3600 #ifdef GPU_GAUGE_FORCE
3602 namespace gaugeforce {
3603 #include <dslash_init.cuh>
3609 double* loop_coeff,
int num_paths,
int max_length,
double eb3,
3618 #ifdef GPU_GAUGE_FORCE
3622 checkGaugeParam(qudaGaugeParam);
3630 for (
int d=0; d<4; d++) gParamEx.
x[d] = gParam.
x[d] + 4;
3634 gParam.
gauge = siteLink;
3643 printfQuda(
"GaugeForce: Using resident gauge field\n");
3676 int R[4] = {2, 2, 2, 2};
3693 gParamMom.
gauge=mom;
3703 printfQuda(
"GaugeForce: Using resident mom field\n");
3728 gauge_force_cuda(*cudaMom, eb3, *cudaGauge, qudaGaugeParam, input_path_buf,
3729 path_length, loop_coeff, num_paths, max_length);
3742 delete cudaSiteLink;
3770 errorQuda(
"Gauge force has not been built");
3771 #endif // GPU_GAUGE_FORCE
3781 printfQuda(
"About to create cloverPrecise\n");
3783 cloverParam.
nDim = 4;
3784 for(
int dir=0; dir<4; ++dir) cloverParam.
x[dir] =
gaugePrecise->
X()[dir];
3787 cloverParam.
direct =
true;
3789 cloverParam.
norm = 0;
3797 cloverParam.
direct =
true;
3801 cloverParam.
direct =
false;
3803 cloverParam.
mu2 = 4.*invertParam->
kappa*invertParam->
kappa*invertParam->
mu*invertParam->
mu;
3810 int R[4] = {2,2,2,2};
3812 for(
int dir=0; dir<4; ++dir) y[dir] =
gaugePrecise->
X()[dir] + 2*R[dir];
3824 for (
int d=0; d<4; d++) gParamEx.r[d] = R[d];
3838 cudaGaugeExtended->exchangeExtendedGhost(R,
true);
3851 for(
int dir=0; dir<4; ++dir)
gParam.
x[dir] += 4;
3860 cpuGaugeExtended.exchangeExtendedGhost(R,
true);
3897 }
else if(geometry == 4){
3900 errorQuda(
"Only scalar and vector geometries are supported\n");
3951 if (geometry == 1) {
3953 }
else if(geometry == 4) {
3956 errorQuda(
"Only scalar and vector geometries are supported");
3991 for(
int dir=0; dir<4; ++dir) param_ex.
X[dir] = param->
X[dir]+4;
3994 gParam_ex.geometry = geom;
4007 int R[4] = {2,2,2,2};
4027 int R[4] = {2,2,2,2};
4075 checkGaugeParam(param);
4078 #ifndef USE_EXTENDED_VOLUME
4079 #define USE_EXTENDED_VOLUME
4091 #ifndef USE_EXTENDED_VOLUME
4094 gParam.
gauge = oprod;
4116 #ifndef USE_EXTENDED_VOLUME
4131 cloverDerivative(*cudaOut, *gPointer, *oPointer, mu, nu, coeff, parity, conjugate);
4187 bool use_resident_solution =
false;
4190 use_resident_solution =
true;
4192 errorQuda(
"No input quark field defined");
4226 if (use_resident_solution) {
4237 const double act_path_coeff[6],
4238 const void*
const one_link_src[4],
4239 const void*
const naik_src[4],
4240 const void*
const link,
4244 #ifdef GPU_HISQ_FORCE
4245 long long partialFlops;
4246 using namespace quda::fermion_force;
4278 param.
gauge = (
void*)link;
4282 param.
gauge = (
void*)one_link_src;
4285 param.
gauge = (
void*)naik_src;
4291 param.
gauge = milc_momentum;
4309 for(
int dir=0; dir<4; ++dir) param.
x[dir] += 4;
4321 int R[4] = {2, 2, 2, 2};
4328 cudaMemset((
void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());
4337 cudaMemset((
void**)(cudaInForce_ex->Gauge_p()), 0, cudaInForce_ex->Bytes());
4339 cudaInForce_ex->exchangeExtendedGhost(R,
true);
4342 cudaMemset((
void**)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
4345 cudaMemset((
void**)(cudaOutForce_ex->Gauge_p()), 0, cudaOutForce_ex->Bytes());
4347 *flops += partialFlops;
4350 *flops += partialFlops;
4358 cudaInForce_ex->exchangeExtendedGhost(R,
true);
4365 *flops += partialFlops;
4367 *flops += partialFlops;
4370 *flops += partialFlops;
4372 *flops += partialFlops;
4382 delete cudaOutForce;
4386 delete cudaInForce_ex;
4387 delete cudaOutForce_ex;
4392 delete cpuOneLinkInForce;
4393 delete cpuNaikInForce;
4402 errorQuda(
"HISQ force has not been built");
4409 const double level2_coeff[6],
4410 const double fat7_coeff[6],
4413 double** quark_coeff,
4414 const void*
const w_link,
4415 const void*
const v_link,
4416 const void*
const u_link,
4450 const double level2_coeff[6],
4451 const double fat7_coeff[6],
4452 const void*
const staple_src[4],
4453 const void*
const one_link_src[4],
4454 const void*
const naik_src[4],
4455 const void*
const w_link,
4456 const void*
const v_link,
4457 const void*
const u_link,
4460 #ifdef GPU_HISQ_FORCE
4462 long long partialFlops;
4464 using namespace quda::fermion_force;
4468 double act_path_coeff[6] = {0,1,level2_coeff[2],level2_coeff[3],level2_coeff[4],level2_coeff[5]};
4477 param.
gauge = (
void*)milc_momentum;
4488 param.
gauge = (
void*)w_link;
4490 param.
gauge = (
void*)v_link;
4492 param.
gauge = (
void*)u_link;
4506 param.
gauge = (
void*)staple_src;
4508 param.
gauge = (
void*)one_link_src;
4510 param.
gauge = (
void*)naik_src;
4522 for(
int dir=0; dir<4; ++dir) param.
x[dir] += 4;
4541 const double unitarize_eps = 1e-14;
4542 const double hisq_force_filter = 5e-5;
4543 const double max_det_error = 1e-10;
4544 const bool allow_svd =
true;
4545 const bool svd_only =
false;
4546 const double svd_rel_err = 1e-8;
4547 const double svd_abs_err = 1e-8;
4564 int R[4] = {2, 2, 2, 2};
4567 cudaGaugeEx->exchangeExtendedGhost(R,
true);
4577 cudaInForceEx->exchangeExtendedGhost(R,
true);
4584 cudaOutForceEx->exchangeExtendedGhost(R,
true);
4593 hisqStaplesForceCuda(act_path_coeff, *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);
4594 *flops += partialFlops;
4604 cudaInForceEx->exchangeExtendedGhost(R,
true);
4610 hisqLongLinkForceCuda(act_path_coeff[1], *gParam, *inForcePtr, *gaugePtr, outForcePtr, &partialFlops);
4611 *flops += partialFlops;
4615 cudaOutForceEx->exchangeExtendedGhost(R,
true);
4625 cudaGaugeEx->exchangeExtendedGhost(R,
true);
4630 int numFailures = 0;
4631 int* numFailuresDev;
4633 if(cudaMalloc((
void**)&numFailuresDev,
sizeof(
int)) == cudaErrorMemoryAllocation){
4634 errorQuda(
"cudaMalloc failed for numFailuresDev\n");
4636 cudaMemset(numFailuresDev, 0,
sizeof(
int));
4641 unitarizeForceCuda(*outForcePtr, *gaugePtr, inForcePtr, numFailuresDev, &partialFlops);
4642 *flops += partialFlops;
4645 cudaMemcpy(&numFailures, numFailuresDev,
sizeof(
int), cudaMemcpyDeviceToHost);
4647 cudaFree(numFailuresDev);
4650 errorQuda(
"Error in the unitarization component of the hisq fermion force\n");
4653 cudaMemset((
void**)(outForcePtr->Gauge_p()), 0, outForcePtr->Bytes());
4661 cudaGaugeEx->exchangeExtendedGhost(R,
true);
4667 *flops += partialFlops;
4669 *flops += partialFlops;
4681 delete cpuStapleForce;
4682 delete cpuOneLinkForce;
4683 delete cpuNaikForce;
4691 delete cudaInForceEx;
4692 delete cudaOutForceEx;
4695 delete cudaOutForce;
4701 errorQuda(
"HISQ force has not been built");
4711 using namespace quda;
4714 checkGaugeParam(param);
4728 oParam.
gauge = oprod[0];
4732 oParam.
gauge = oprod[1];
4763 for(
int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.
x[dir];
4777 const int Ninternal = 6;
4783 for(
int i=0; i<num_terms; ++i){
4787 qParam.v = fermion[i];
4794 cudaQuarkEven = cpuQuarkEven;
4795 cudaQuarkOdd = cpuQuarkOdd;
4801 computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer1, 0, coeff[i]);
4804 computeStaggeredOprod(cudaOprod0, cudaOprod1, cudaQuarkEven, cudaQuarkOdd, faceBuffer2, 1, coeff[i]);
4945 checkGaugeParam(param);
4966 gParam.
gauge = momentum;
5015 (
bool)conj_mom, (
bool)exact);
5029 delete cudaOutGauge;
5069 {
MatQuda(h_out, h_in, inv_param); }
5086 bool *conj_mom,
bool *exact,
5092 double *loop_coeff,
int *num_paths,
int *max_length,
double *dt,
5097 int ***input_path = (
int***)
safe_malloc(dim*
sizeof(
int**));
5098 for (
int i=0; i<
dim; i++) {
5099 input_path[i] = (
int**)
safe_malloc(*num_paths*
sizeof(
int*));
5100 for (
int j=0; j<*num_paths; j++) {
5101 input_path[i][j] = (
int*)
safe_malloc(path_length[j]*
sizeof(
int));
5102 for (
int k=0; k<path_length[j]; k++) {
5103 input_path[i][j][k] = input_path_buf[(i* (*num_paths) + j)* (*max_length) + k];
5108 computeGaugeForceQuda(mom, gauge, input_path, path_length, loop_coeff, *num_paths, *max_length, *dt, param, 0);
5110 for (
int i=0; i<
dim; i++) {
5111 for (
int j=0; j<*num_paths; j++) {
host_free(input_path[i][j]); }
5141 cudaDeviceSynchronize();
5148 static int bqcd_rank_from_coords(
const int *coords,
void *fdata)
5150 int *
dims =
static_cast<int *
>(fdata);
5152 int rank = coords[3];
5153 for (
int i = 2; i >= 0; i--) {
5154 rank = dims[i] * rank + coords[i];
5172 bool pack_ = *pack ?
true :
false;
5187 for(
int dir=0; dir<4; ++dir) y[dir] =
gaugePrecise->
X()[dir] + 4;
5200 int R[4] = {2,2,2,2};
5203 cudaDeviceSynchronize();
5215 errorQuda(
"Gauge field must be loaded");
5222 for(
int dir=0; dir<4; ++dir) y[dir] =
gaugePrecise->
X()[dir] + 4;
5235 int R[4] = {2,2,2,2};
5244 int R[4] = {2,2,2,2};
5245 for (
int dir=0; dir<4; ++dir) y[dir] =
gaugePrecise->
X()[dir] + 4;
5247 for (
int dir=0; dir<4; ++dir) y[dir] =
gaugePrecise->
X()[dir];
5275 for (
unsigned int i=0; i<nSteps; i++) {
5287 delete cudaGaugeTemp;
void new_quda_invert_param_(QudaInvertParam *param)
QudaCloverFieldOrder order
QudaGaugeParam gauge_param
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void destroyQudaGaugeField(void *gauge)
QudaDiracFieldOrder dirac_order
QudaMassNormalization mass_normalization
double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]
void destroyStaggeredOprodEvents()
QudaReconstructType reconstruct_sloppy
double c_5[QUDA_MAX_DWF_LS]
NEW: used by mobius domain wall only.
void freeCloverQuda(void)
void * createExtendedGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
int compute_gauge_force_quda_(void *mom, void *gauge, int *input_path_buf, int *path_length, double *loop_coeff, int *num_paths, int *max_length, double *dt, QudaGaugeParam *param)
void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
double b_5[QUDA_MAX_DWF_LS]
void invert_multishift_quda_(void *hp_x[QUDA_MAX_MULTI_SHIFT], void *hp_b, QudaInvertParam *param)
void computeKSOprodQuda(void *oprod, void *fermion, double coeff, int X[4], QudaPrecision prec)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile)
QudaFieldLocation clover_location
void llfat_cuda(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
void computeCloverTraceQuda(void *out, void *clov, int mu, int nu, int dim[4])
enum QudaPrecision_s QudaPrecision
void saveGaugeField(void *gauge, void *inGauge, QudaGaugeParam *param)
int commDimPartitioned(int dir)
void load_gauge_quda_(void *h_gauge, QudaGaugeParam *param)
void invertMDQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
virtual void StoreRitzVecs(void *host_buffer, double *inv_eigenvals, const int *X, QudaInvertParam *inv_par, const int nev, bool cleanResources=false)=0
void * V(bool inverse=false)
void setUnitarizeForceConstants(double unitarize_eps, double hisq_force_filter, double max_det_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
#define QUDA_MAX_MULTI_SHIFT
Maximum number of shifts supported by the multi-shift solver. This number may be changed if need be...
cudaGaugeField * gaugeExtended
void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve)
void printQudaGaugeParam(QudaGaugeParam *param)
QudaVerbosity getVerbosity()
void unitarizeForceCuda(cudaGaugeField &cudaOldForce, cudaGaugeField &cudaGauge, cudaGaugeField *cudaNewForce, int *unitarization_failed, long long *flops=NULL)
void Clover(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void saveCPUField(cpuGaugeField &, const QudaFieldLocation &) const
QudaDslashType dslash_type
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error, bool check_unitarization=true)
QudaReconstructType reconstruct_precondition
QudaInverterType inv_type
Fortran interface functions.
void computeHISQForceCompleteQuda(void *const milc_momentum, const double level2_coeff[6], const double fat7_coeff[6], void **quark_array, int num_terms, double **quark_coeff, const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
double c_5[QUDA_MAX_DWF_LS]
void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param, QudaComputeFatMethod method)
__host__ __device__ ValueType sqrt(ValueType x)
cudaGaugeField *& gaugeFatExtended
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
QudaGaugeFieldOrder Order() const
void setOutputPrefix(const char *prefix)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void CloverInv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
cudaColorSpinorField * tmp1
cudaGaugeField * gaugeLongPrecise
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
virtual void reconstruct(cudaColorSpinorField &x, const cudaColorSpinorField &b, const QudaSolutionType) const =0
void Dslash5inv(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity, const double &k) const
void free_clover_quda_(void)
void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata)
double plaquette(const GaugeField &data, QudaFieldLocation location)
void massRescale(cudaColorSpinorField &b, QudaInvertParam ¶m)
virtual void CleanResources()=0
QudaGaugeFieldOrder gauge_order
void computeFatLinkCore(cudaGaugeField *cudaSiteLink, double *act_path_coeff, QudaGaugeParam *qudaGaugeParam, QudaComputeFatMethod method, cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, TimeProfile &profile)
void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam *param)
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void llfat_init_cuda_ex(QudaGaugeParam *param_ex)
void setUnitarizeLinksPadding(int input_padding, int output_padding)
cudaColorSpinorField * solutionResident
QudaSiteSubset siteSubset
cudaGaugeField * cudaGauge
QudaPrecision clover_cuda_prec_sloppy
cudaGaugeField * gaugeLongExtended
QudaFieldLocation input_location
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
cudaCloverField * cloverPrecondition
QudaUseInitGuess use_init_guess
void init_quda_(int *dev)
int getGaugePadding(GaugeFieldParam ¶m)
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
cudaColorSpinorField & Odd() const
QudaSolutionType solution_type
QudaSolverNormalization solver_normalization
int numa_affinity_enabled
virtual void Dslash(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const =0
QudaPrecision clover_cuda_prec
cudaCloverField * cloverInvPrecise
QudaPrecision Precision() const
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param)
void computeAsqtadForceQuda(void *const milc_momentum, long long *flops, const double act_path_coeff[6], const void *const one_link_src[4], const void *const naik_src[4], const void *const link, const QudaGaugeParam *gParam)
QudaInvertParam * invert_param
void setTuning(QudaTune tune)
cudaDeviceProp deviceProp
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
void Dslash5(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
cudaColorSpinorField * tmp
QudaFieldLocation output_location
QudaPrecision clover_cuda_prec_precondition
void apply_staggered_phase_quda_()
VOLATILE spinorFloat kappa
void Dagger(QudaDagType dag)
double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]
void updateInvertParam(QudaInvertParam ¶m, int offset=-1)
void llfat_cuda_ex(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
cpuGaugeField * cpuFatLink
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
cudaCloverField * cloverSloppy
static void flushPinnedCache()
double tol_offset[QUDA_MAX_MULTI_SHIFT]
int commDim[QUDA_MAX_DIM]
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void load_clover_quda_(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
cudaGaugeField *& gaugeFatPrecondition
QudaInvertParam newQudaInvertParam(void)
cudaGaugeField * cudaFatLink
void setPrecision(QudaPrecision precision)
static Solver * create(SolverParam ¶m, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matPrecon, TimeProfile &profile)
void hisqCompleteForceCuda(const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
static void freeBuffer(int index=0)
QudaPrecision cuda_prec_precondition
void free_sloppy_gauge_quda_()
QudaCloverFieldOrder clover_order
void createDslashEvents()
void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
void remove_staggered_phase_quda_()
void invert_md_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
QudaGaugeFieldOrder order
cudaCloverField * cloverInvPrecondition
virtual void MdagM(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0
QudaReconstructType Reconstruct() const
void performAPEnStep(unsigned int nSteps, double alpha)
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
static void freeGhostBuffer(void)
double b_5[QUDA_MAX_DWF_LS]
NEW: used by domain wall and twisted mass.
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
__constant__ double coeff
QudaPrecision cuda_prec_sloppy
void loadCPUField(const cpuGaugeField &, const QudaFieldLocation &)
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
QudaMatPCType matpcType
NEW: used by mobius domain wall only.
double true_res_offset[QUDA_MAX_MULTI_SHIFT]
void comm_set_gridsize_(int *grid)
double offset[QUDA_MAX_MULTI_SHIFT]
void gauge_force_cuda(cudaGaugeField &cudaMom, double eb3, cudaGaugeField &cudaSiteLink, QudaGaugeParam *param, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
QudaSolutionType RitzMat_lanczos
cudaCloverField * cloverPrecise
void compute_staggered_force_quda_(void *cudaMom, void *qudaQuark, double *coeff)
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, QudaEigParam *eig_param)
void extendGaugeField(void *out, void *in)
cudaCloverField * cloverInv
void computeStaggeredOprod(cudaGaugeField &out, cudaColorSpinorField &in, FaceBuffer &facebuffer, const unsigned int parity, const double coeff, const unsigned int displacement)
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
static Eig_Solver * create(QudaEigParam ¶m, RitzMat &ritz_mat, TimeProfile &profile)
void unitarizeLinksCuda(const QudaGaugeParam ¶m, cudaGaugeField &infield, cudaGaugeField *outfield, int *num_failures)
void computeCloverSigmaTrace(GaugeField &gauge, const CloverField &clover, int dir1, int dir2, QudaFieldLocation location)
static void freeGhostBuffer(void)
void computeStaggeredForceQuda(void *cudaMom, void *qudaQuark, double coeff)
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void invertMultiShiftMDQuda(void **_hp_xe, void **_hp_xo, void **_hp_ye, void **_hp_yo, void *_hp_b, QudaInvertParam *param)
void invert_quda_(void *hp_x, void *hp_b, QudaInvertParam *param)
#define safe_malloc(size)
void copy(const CloverField &src, bool inverse=true)
double shift
Shift term added onto operator (M^dag M + shift)
void setMass(double mass)
void pushVerbosity(QudaVerbosity verbosity)
void init_quda_device_(int *dev)
cudaGaugeField * gaugeLongSloppy
void Dslash4(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
QudaGhostExchange ghostExchange
#define checkCudaErrorNoSync()
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
void Dslash4pre(cudaColorSpinorField &out, const cudaColorSpinorField &in, const QudaParity parity) const
void update_gauge_field_quda_(void *gauge, void *momentum, double *dt, bool *conj_mom, bool *exact, QudaGaugeParam *param)
void APEStep(GaugeField &dataDs, const GaugeField &dataOr, double alpha, QudaFieldLocation location)
void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
void printQudaInvertParam(QudaInvertParam *param)
void computeHISQForceQuda(void *const milc_momentum, long long *flops, const double level2_coeff[6], const double fat7_coeff[6], const void *const staple_src[4], const void *const one_link_src[4], const void *const naik_src[4], const void *const w_link, const void *const v_link, const void *const u_link, const QudaGaugeParam *gParam)
void clover_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity, int *inverse)
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h_u, double *inv_eigenvals, int last_rhs)
cudaGaugeField * cudaGauge_ex
QudaFieldLocation location
QudaInvertParam inv_param
cpuColorSpinorField * out
void Stop(QudaProfileType idx)
QudaPrecision cuda_prec_precondition
cudaGaugeField * gaugePrecondition
void loadTuneCache(QudaVerbosity verbosity)
__constant__ int Vh_2d_max
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static DeflatedSolver * create(SolverParam ¶m, DiracMatrix &mat, DiracMatrix &matSloppy, DiracMatrix &matCGSloppy, DiracMatrix &matDeflate, TimeProfile &profile)
void * createGaugeField(void *gauge, int geometry, QudaGaugeParam *param)
virtual void M(cudaColorSpinorField &out, const cudaColorSpinorField &in) const =0
void set_kernel_pack_t_(int *pack)
QudaPrecision Precision() const
virtual void prepare(cudaColorSpinorField *&src, cudaColorSpinorField *&sol, cudaColorSpinorField &x, cudaColorSpinorField &b, const QudaSolutionType) const =0
double Last(QudaProfileType idx)
#define QUDA_MAX_DWF_LS
Maximum length of the Ls dimension for domain-wall fermions.
void applyStaggeredPhase()
void freeSloppyGaugeQuda(void)
void mat_dag_mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
void new_quda_gauge_param_(QudaGaugeParam *param)
cudaGaugeField * fatGauge
QudaTwistFlavorType twist_flavor
void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse)
cudaGaugeField * gaugeSmeared
quda::cudaGaugeField * checkGauge(QudaInvertParam *param)
QudaReconstructType reconstruct
void setKernelPackT(bool pack)
void Start(QudaProfileType idx)
cudaGaugeField *& gaugeFatSloppy
void copy(const GaugeField &)
QudaResidualType residual_type
enum QudaFieldGeometry_s QudaFieldGeometry
QudaUseInitGuess use_init_guess
cudaGaugeField * longGauge
void dslash_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity *parity)
enum QudaVerbosity_s QudaVerbosity
void updateGaugeField(GaugeField &out, double dt, const GaugeField &in, const GaugeField &mom, bool conj_mom, bool exact)
void createCloverQuda(QudaInvertParam *invertParam)
void computeStaggeredOprodQuda(void **oprod, void **fermion, int num_terms, double **coeff, QudaGaugeParam *param)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
cudaCloverField * cloverInvSloppy
bool compute_fat_link_max
QudaFieldGeometry geometry
void setOutputFile(FILE *outfile)
cudaGaugeField * gaugePrecise
int(* QudaCommsMap)(const int *coords, void *fdata)
static Dirac * create(const DiracParam ¶m)
cudaGaugeField *& gaugeFatPrecise
QudaFieldGeometry Geometry() const
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void llfat_init_cuda(QudaGaugeParam *param)
void Mdag(cudaColorSpinorField &out, const cudaColorSpinorField &in) const
cudaGaugeField * cudaOprod
void gauge_force_init_cuda(QudaGaugeParam *param, int max_length)
void axCuda(const double &a, cudaColorSpinorField &x)
cudaGaugeField * extendedGaugeResident
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
QudaDslashType dslash_type_precondition
double norm2(const ColorSpinorField &)
QudaPrecision clover_cpu_prec
void initLatticeConstants(const LatticeField &lat, TimeProfile &profile)
QudaTboundary TBoundary() const
enum QudaComputeFatMethod_s QudaComputeFatMethod
void computeCloverDerivativeQuda(void *out, void *gauge, void *oprod, int mu, int nu, double coeff, QudaParity parity, QudaGaugeParam *param, int conjugate)
void destroyDslashEvents()
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void setVerbosity(const QudaVerbosity verbosity)
cudaGaugeField * momResident
cudaColorSpinorField & Even() const
double Anisotropy() const
void saveTuneCache(QudaVerbosity verbosity)
cudaGaugeField * gaugeSloppy
void createStaggeredOprodEvents()
void initQudaDevice(int dev)
int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam, double *timeinfo)
void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
QudaGaugeParam newQudaGaugeParam(void)
QudaPreserveSource preserve_source
void mat_quda_(void *h_out, void *h_in, QudaInvertParam *inv_param)
cudaGaugeField * gaugeLongPrecondition
void removeStaggeredPhase()