34 #ifdef GPU_GAUGE_FORCE
38 #define MAX(a,b) ((a)>(b)? (a):(b))
39 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
41 #define spinorSiteSize 24 // real numbers per spinor
43 #define MAX_GPU_NUM_PER_NODE 16
87 static bool initialized =
false;
102 static TimeProfile profileMulti(
"invertMultiShiftQuda");
105 static TimeProfile profileMultiMixed(
"invertMultiShiftMixedQuda");
127 static int lex_rank_from_coords(
const int *coords,
void *fdata)
131 int rank = coords[0];
132 for (
int i = 1; i < md->
ndim; i++) {
133 rank = md->
dims[i] * rank + coords[i];
142 static int qmp_rank_from_coords(
const int *coords,
void *fdata)
144 return QMP_get_node_number_from(coords);
149 static bool comms_initialized =
false;
154 errorQuda(
"Number of communication grid dimensions must be 4");
160 if (QMP_logical_topology_is_declared()) {
161 if (QMP_get_logical_number_of_dimensions() != 4) {
162 errorQuda(
"QMP logical topology must have 4 dimensions");
164 for (
int i=0; i<nDim; i++) {
165 int qdim = QMP_get_logical_dimensions()[i];
166 if(qdim != dims[i]) {
167 errorQuda(
"QMP logical dims[%d]=%d does not match dims[%d]=%d argument", i, qdim, i, dims[i]);
171 func = qmp_rank_from_coords;
173 warningQuda(
"QMP logical topology is undeclared; using default lexicographical ordering");
177 map_data.
ndim = nDim;
178 for (
int i=0; i<nDim; i++) {
179 map_data.
dims[i] = dims[i];
181 fdata = (
void *) &map_data;
182 func = lex_rank_from_coords;
190 comms_initialized =
true;
199 if (initialized)
return;
202 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)
205 char* cni_str = getenv(
"CUDA_NIC_INTEROP");
207 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set\n");
209 int cni_int = atoi(cni_str);
211 errorQuda(
"Environment variable CUDA_NIC_INTEROP is not set to 1\n");
216 cudaGetDeviceCount(&deviceCount);
217 if (deviceCount == 0) {
221 for(
int i=0; i<deviceCount; i++) {
229 if (!comms_initialized) {
230 #if defined(QMP_COMMS)
231 if (QMP_logical_topology_is_declared()) {
232 int ndim = QMP_get_logical_number_of_dimensions();
233 const int *dims = QMP_get_logical_dimensions();
236 errorQuda(
"initQuda() called without prior call to initCommsGridQuda(),"
237 " and QMP logical topology has not been declared");
239 #elif defined(MPI_COMMS)
240 errorQuda(
"When using MPI for communications, initCommsGridQuda() must be called before initQuda()");
242 const int dims[4] = {1, 1, 1, 1};
250 if (dev < 0 || dev >= 16)
errorQuda(
"Invalid device number %d", dev);
256 errorQuda(
"Device %d does not support CUDA", dev);
271 if(
deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);
274 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
279 for (
int i=0; i<
Nstream; i++) {
297 if (!initialized)
errorQuda(
"QUDA not initialized");
300 checkGaugeParam(param);
351 precondition = sloppy;
355 switch (param->
type) {
391 if (!initialized)
errorQuda(
"QUDA not initialized");
392 checkGaugeParam(param);
398 switch (param->
type) {
427 if (!initialized)
errorQuda(
"QUDA not initialized");
429 if (!h_clover && !h_clovinv) {
430 errorQuda(
"loadCloverQuda() called with neither clover term nor inverse");
433 errorQuda(
"Half precision not supported on CPU");
436 errorQuda(
"Gauge field must be loaded before clover");
439 errorQuda(
"Wrong dslash_type in loadCloverQuda()");
456 if (!h_clovinv && pc_solve && pc_solution) {
462 if (!h_clover && !pc_solve && !pc_solution) {
467 if (!h_clover && pc_solve && pc_solution && asymmetric) {
472 clover_param.
nDim = 4;
512 if (!initialized)
errorQuda(
"QUDA not initialized");
541 if (!initialized)
errorQuda(
"QUDA not initialized");
556 if (!initialized)
return;
583 comms_initialized =
false;
590 profileGauge.
Print();
591 profileClover.
Print();
592 profileInvert.
Print();
593 profileMulti.
Print();
594 profileMultiMixed.
Print();
624 diracParam.
Ls = inv_param->
Ls;
654 diracParam.
m5 = inv_param->
m5;
655 diracParam.
mu = inv_param->
mu;
658 for (
int i=0; i<4; i++) {
673 for (
int i=0; i<4; i++) {
689 for (
int i=0; i<4; i++) {
714 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
715 printfQuda(
"Mass rescale: mass normalization: %d\n", mass_normalization);
716 double nin =
norm2(b);
717 printfQuda(
"Mass rescale: norm of source in = %g\n", nin);
722 errorQuda(
"Staggered code only supports QUDA_MASS_NORMALIZATION");
728 switch (solution_type) {
738 axCuda(4.0*kappa*kappa, b);
743 axCuda(4.0*kappa*kappa, b);
750 axCuda(16.0*pow(kappa,4), b);
752 axCuda(4.0*kappa*kappa, b);
756 errorQuda(
"Solution type %d not supported", solution_type);
761 printfQuda(
"Mass rescale: Kappa is: %g\n", kappa);
762 printfQuda(
"Mass rescale: mass normalization: %d\n", mass_normalization);
763 double nin =
norm2(b);
764 printfQuda(
"Mass rescale: norm of source out = %g\n", nin);
774 errorQuda(
"Staggered code only supports QUDA_MASS_NORMALIZATION");
780 switch (solution_type) {
790 coeff *= 4.0*kappa*
kappa;
795 coeff *= 4.0*kappa*
kappa;
802 coeff*=16.0*pow(kappa,4);
804 coeff*=4.0*kappa*
kappa;
808 errorQuda(
"Solution type %d not supported", solution_type);
840 double cpu =
norm2(*in_h);
841 double gpu =
norm2(in);
862 dirac->
Dslash(out, in, parity);
872 double cpu =
norm2(*out_h);
873 double gpu =
norm2(out);
905 double cpu =
norm2(*in_h);
906 double gpu =
norm2(in);
923 axCuda(0.25/(kappa*kappa), out);
941 double cpu =
norm2(*out_h);
942 double gpu =
norm2(out);
960 if (!initialized)
errorQuda(
"QUDA not initialized");
977 double cpu =
norm2(*in_h);
978 double gpu =
norm2(in);
992 dirac->
MdagM(out, in);
998 axCuda(1.0/pow(2.0*kappa,4), out);
1000 axCuda(0.25/(kappa*kappa), out);
1005 axCuda(0.25/(kappa*kappa), out);
1016 double cpu =
norm2(*out_h);
1017 double gpu =
norm2(out);
1018 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1052 if (!initialized)
errorQuda(
"QUDA not initialized");
1059 errorQuda(
"Cannot apply the clover term for a non Wilson-clover dslash");
1071 double cpu =
norm2(*in_h);
1072 double gpu =
norm2(in);
1093 if (!inverse) dirac.
Clover(out, in, parity);
1103 double cpu =
norm2(*out_h);
1104 double gpu =
norm2(out);
1105 printfQuda(
"Out CPU %e CUDA %e\n", cpu, gpu);
1126 if (!initialized)
errorQuda(
"QUDA not initialized");
1134 checkInvertParam(param);
1159 Dirac *dSloppy = NULL;
1166 Dirac &diracSloppy = *dSloppy;
1167 Dirac &diracPre = *dPre;
1176 const int *
X = cudaGauge->
X();
1196 errorQuda(
"Initial guess not supported for two-pass solver");
1208 double nh_b =
norm2(*h_b);
1209 double nb =
norm2(*b);
1210 double nh_x =
norm2(*h_x);
1211 double nx =
norm2(*x);
1212 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
1213 printfQuda(
"Solution: CPU = %g, CUDA copy = %g\n", nh_x, nx);
1221 double nin =
norm2(*in);
1222 double nout =
norm2(*out);
1224 printfQuda(
"Prepared solution = %g\n", nout);
1230 double nin =
norm2(*in);
1231 printfQuda(
"Prepared source post mass rescale = %g\n", nin);
1254 if (pc_solution && !pc_solve) {
1255 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
1258 if (!mat_solution && !pc_solution && pc_solve) {
1259 errorQuda(
"Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type");
1262 if (mat_solution && !direct_solve) {
1264 dirac.
Mdag(*in, tmp);
1265 }
else if (!mat_solution && direct_solve) {
1266 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1268 (*solve)(*
out, *
in);
1274 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1276 (*solve)(*
out, *
in);
1279 DiracMdagM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
1281 (*solve)(*
out, *
in);
1286 double nx =
norm2(*x);
1296 double nx =
norm2(*x);
1297 double nh_x =
norm2(*h_x);
1298 printfQuda(
"Reconstructed: CUDA solution = %g, CPU copy = %g\n", nx, nh_x);
1333 if (!initialized)
errorQuda(
"QUDA not initialized");
1336 checkInvertParam(param);
1339 errorQuda(
"Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d",
1350 errorQuda(
"Multi-shift solver does not support MAT or MATPC solution types");
1353 errorQuda(
"Multi-shift solver does not support DIRECT or DIRECT_PC solve types");
1355 if (pc_solution & !pc_solve) {
1356 errorQuda(
"Preconditioned (PC) solution_type requires a PC solve_type");
1358 if (!pc_solution & pc_solve) {
1359 errorQuda(
"In multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type");
1371 errorQuda(
"QUDA only currently supports multi-shift CG");
1385 errorQuda(
"Offsets must be ordered from smallest to largest");
1410 Dirac *dSloppy = NULL;
1416 Dirac &diracSloppy = *dSloppy;
1438 cpuParam.
v = hp_x[i];
1460 double nh_b =
norm2(*h_b);
1461 double nb =
norm2(*b);
1462 printfQuda(
"Source: CPU = %g, CUDA copy = %g\n", nh_b, nb);
1469 double *unscaled_shifts =
new double [param->
num_offset];
1471 unscaled_shifts[i] = param->
offset[i];
1526 printfQuda(
"Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
1539 m.shift = param->
offset[i];
1546 CG cg(m, mSloppy, *param, profileMulti);
1560 param->
offset[i] = unscaled_shifts[i];
1563 delete [] unscaled_shifts;
1568 double nx =
norm2(*x[i]);
1608 #include <sys/time.h>
1614 int Vsh_x = X[1]*X[2]*X[3]/2;
1615 int Vsh_y = X[0]*X[2]*X[3]/2;
1616 int Vsh_z = X[0]*X[1]*X[3]/2;
1618 int Vsh_t = X[0]*X[1]*X[2]/2;
1621 for (
int i=0; i<4; i++) E[i] = X[i] + 4;
1630 Vh_2d_max =
MAX(Vh_2d_max, X[0]*X[3]/2);
1631 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[2]/2);
1632 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[3]/2);
1633 Vh_2d_max =
MAX(Vh_2d_max, X[2]*X[3]/2);
1663 gettimeofday(&time_array[0], NULL);
1675 static cudaGaugeField* cudaStapleField=NULL, *cudaStapleField1=NULL;
1676 if(cudaStapleField == NULL || cudaStapleField1 == NULL){
1685 gettimeofday(&time_array[1], NULL);
1688 llfat_cuda(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
1690 llfat_cuda_ex(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
1692 gettimeofday(&time_array[2], NULL);
1695 delete cudaStapleField; cudaStapleField = NULL;
1696 delete cudaStapleField1; cudaStapleField1 = NULL;
1698 gettimeofday(&time_array[3], NULL);
1712 #define TDIFF_MS(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))*1000
1715 struct timeval t7, t8, t9, t10, t11;
1717 gettimeofday(&t0, NULL);
1725 memcpy(qudaGaugeParam_ex, qudaGaugeParam,
sizeof(
QudaGaugeParam));
1727 qudaGaugeParam_ex->
X[0] = qudaGaugeParam->
X[0]+4;
1728 qudaGaugeParam_ex->
X[1] = qudaGaugeParam->
X[1]+4;
1729 qudaGaugeParam_ex->
X[2] = qudaGaugeParam->
X[2]+4;
1730 qudaGaugeParam_ex->
X[3] = qudaGaugeParam->
X[3]+4;
1743 if(cpuFatLink == NULL){
1749 if(cpuFatLink == NULL){
1750 errorQuda(
"ERROR: Creating cpuFatLink failed\n");
1753 cpuFatLink->
setGauge((
void**)fatlink);
1757 if(cudaFatLink == NULL){
1769 if(cpuSiteLink == NULL){
1779 if(cpuSiteLink == NULL){
1780 errorQuda(
"ERROR: Creating cpuSiteLink failed\n");
1783 cpuSiteLink->setGauge(sitelink);
1786 if(cudaSiteLink == NULL){
1798 gettimeofday(&t7, NULL);
1802 errorQuda(
"Only QDP-ordered site links are supported in the multi-gpu standard fattening code\n");
1807 gettimeofday(&t8, NULL);
1811 int R[4] = {2, 2, 2, 2};
1813 cpuSiteLink->Order(),qudaGaugeParam->
cpu_prec, 0);
1815 gettimeofday(&t7, NULL);
1817 gettimeofday(&t8, NULL);
1821 struct timeval time_array[4];
1824 computeFatLinkCore(cudaSiteLink, act_path_coeff, qudaGaugeParam, method, cudaFatLink, time_array);
1827 gettimeofday(&t9, NULL);
1831 gettimeofday(&t10, NULL);
1835 delete cpuSiteLink; cpuSiteLink = NULL;
1839 delete cudaSiteLink; cudaSiteLink = NULL;
1842 gettimeofday(&t11, NULL);
1843 #ifdef DSLASH_PROFILING
1844 printfQuda(
"total time: %f ms, init(cuda/cpu gauge field creation,etc)=%f ms,"
1845 " sitelink cpu->gpu=%f ms, computation in gpu =%f ms, fatlink gpu->cpu=%f ms\n",
1846 TDIFF_MS(t0, t11), TDIFF_MS(t0, t7) + TDIFF_MS(time_array[0],time_array[1]), TDIFF_MS(t7, t8), TDIFF_MS(time_array[1], time_array[2]), TDIFF_MS(t9,t10));
1847 printfQuda(
"finally cleanup =%f ms\n", TDIFF_MS(t10, t11) + TDIFF_MS(time_array[2],time_array[3]));
1854 #ifdef GPU_GAUGE_FORCE
1857 void* loop_coeff,
int num_paths,
int max_length,
double eb3,
1860 struct timeval t0, t1, t2, t3;
1862 gettimeofday(&t0,NULL);
1868 memcpy(qudaGaugeParam_ex, qudaGaugeParam,
sizeof(
QudaGaugeParam));
1869 E[0] = qudaGaugeParam_ex->
X[0] = qudaGaugeParam->
X[0] + 4;
1870 E[1] = qudaGaugeParam_ex->
X[1] = qudaGaugeParam->
X[1] + 4;
1871 E[2] = qudaGaugeParam_ex->
X[2] = qudaGaugeParam->
X[2] + 4;
1872 E[3] = qudaGaugeParam_ex->
X[3] = qudaGaugeParam->
X[3] + 4;
1875 int* X = qudaGaugeParam->
X;
1880 int pad = E[2]*E[1]*E[0]/2;
1883 int pad = X[2]*X[1]*X[0]/2;
1889 gParamSL.
gauge = sitelink;
1905 gParamMom.
gauge=mom;
1910 gParamMom.
pad = pad;
1923 int R[4] = {2, 2, 2, 2};
1933 gettimeofday(&t1,NULL);
1935 gauge_force_cuda(*cudaMom, eb3, *cudaSiteLink, qudaGaugeParam, input_path_buf,
1936 path_length, loop_coeff, num_paths, max_length);
1938 gettimeofday(&t2,NULL);
1945 delete cudaSiteLink;
1948 gettimeofday(&t3,NULL);
1951 timeinfo[0] =
TDIFF(t0, t1);
1952 timeinfo[1] =
TDIFF(t1, t2);
1953 timeinfo[2] =
TDIFF(t2, t3);
1979 {
MatQuda(h_out, h_in, inv_param); }
1995 static int bqcd_rank_from_coords(
const int *coords,
void *fdata)
1997 int *dims =
static_cast<int *
>(fdata);
1999 int rank = coords[3];
2000 for (
int i = 2; i >= 0; i--) {
2001 rank = dims[i] * rank + coords[i];