4 typedef std::map<TuneKey, TuneParam>
map;
42 for (
int i=3;
i>=0;
i--) {
45 for (
int j=3; j>
i; j--)
if (
commDim[j]) prev = 2*j;
54 for (
int i=3;
i>=0;
i--) {
66 for (
int i=0;
i<4; ++
i) {
67 param.threadDimMapLower[
i] = 0;
68 param.threadDimMapUpper[
i] = 0;
69 if (!
dslash.dslashParam.commDim[
i])
continue;
70 param.threadDimMapLower[
i] = (prev >= 0 ?
param.threadDimMapUpper[prev] : 0);
81 #define PROFILE(f, profile, idx) \ 82 profile.TPSTART(idx); \ 86 #define PROFILE(f, profile, idx) f; 104 void *issueMPIReceive(
void* receiveParam)
106 ReceiveParam*
param =
static_cast<ReceiveParam*
>(receiveParam);
107 for(
int i=3;
i>=0;
i--){
108 if(!dslashParam.commDim[
i])
continue;
109 for(
int dir=1; dir>=0; dir--){
118 TimeProfile* profile;
124 void* launchInteriorKernel(
void* interiorParam)
126 InteriorParam*
param =
static_cast<InteriorParam*
>(interiorParam);
127 cudaSetDevice(
param->current_device);
147 for(
int i=3;
i>=0;
i--){
148 if (!
dslash.dslashParam.commDim[
i])
continue;
149 for(
int dir=1; dir>=0; dir--) {
167 if ( (location &
Device) &
Host)
errorQuda(
"MemoryLocation cannot be both Device and Host");
172 for (
int i=3;
i>=0;
i--)
174 {
pack =
true;
break; }
178 for (
int dir=0; dir<2; dir++) {
190 pack_dest, location,
dslash.dslashParam.twist_a,
dslash.dslashParam.twist_b),
209 for (
int i = 3;
i >=0;
i--) {
210 if (!
dslash.dslashParam.commDim[
i])
continue;
212 for (
int dir=1; dir>=0; dir--) {
239 template <
typename T>
243 for (
int i = 3;
i >=0;
i--) {
244 if (!dslashParam.commDim[
i])
continue;
280 bool gdr_send,
bool gdr_recv,
bool zero_copy_recv,
bool async,
int scatterIndex=-1) {
284 cudaStream_t *
stream =
nullptr;
296 if (!gdr_recv && !zero_copy_recv) {
299 #if (CUDA_VERSION >= 8000) 303 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
309 if (scatterIndex == -1) scatterIndex = 2*
dim+dir;
330 template <
typename T>
335 if (!dslashParam.commDim[
dim])
continue;
336 for (
int dir=0; dir<2; dir++) {
357 static char aux_copy[TuneKey::aux_n];
358 static bool set_mapped =
false;
361 if (set_mapped)
errorQuda(
"set_mapped already set");
365 dslash.augmentAux(
dslash.dslashParam.kernel_type,
",zero_copy");
368 if (!set_mapped)
errorQuda(
"set_mapped not set");
370 dslash.setAux(
dslash.dslashParam.kernel_type, aux_copy);
379 const int volume,
const int *faceVolumeCB, TimeProfile &profile) = 0;
394 auto &dslashParam =
dslash.dslashParam;
396 dslashParam.threads = volume;
405 const int packIndex =
Nstream-1;
415 for (
int i=3;
i>=0;
i--) {
416 if (!dslashParam.commDim[
i])
continue;
418 for (
int dir=1; dir>=0; dir--) {
425 if (cudaSuccess == event_test) {
446 for (
int dir=1; dir>=0; dir--) {
455 dslashParam.kernel_type =
static_cast<KernelType>(
i);
456 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
467 in->bufferIndex = (1 -
in->bufferIndex);
480 auto &dslashParam =
dslash.dslashParam;
482 dslashParam.threads = volume;
493 const int packIndex =
Nstream-2;
495 pthread_t receiveThread, interiorThread;
496 ReceiveParam receiveParam;
497 receiveParam.profile = &profile;
498 receiveParam.nFace = (
dslash.Nface() >> 1);
499 receiveParam.dagger =
dslash.Dagger();
501 if(pthread_create(&receiveThread, NULL, issueMPIReceive, &receiveParam)){
505 InteriorParam interiorParam;
506 interiorParam.dslash = &
dslash;
507 interiorParam.profile = &profile;
509 cudaGetDevice(&(interiorParam.current_device));
510 if(pthread_create(&interiorThread, NULL, launchInteriorKernel, &interiorParam)){
515 for (
int i=3;
i>=0;
i--)
517 {
pack =
true;
break; }
534 for(
int i = 3;
i >=0;
i--){
535 if (!dslashParam.commDim[
i])
continue;
537 for (
int dir=1; dir>=0; dir--) {
554 #if (!defined MULTI_GPU) 560 if(pthread_join(receiveThread, NULL))
errorQuda(
"pthread_join failed");
561 bool interiorLaunched =
false;
564 for (
int i=3;
i>=0;
i--) {
565 if (!dslashParam.commDim[
i])
continue;
567 for (
int dir=1; dir>=0; dir--) {
574 if (cudaSuccess == event_test) {
605 if(!interiorLaunched){
606 if(pthread_join(interiorThread, NULL))
errorQuda(
"pthread_join failed");
607 interiorLaunched =
true;
614 dslashParam.kernel_type =
static_cast<KernelType>(
i);
615 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
627 in->bufferIndex = (1 -
in->bufferIndex);
631 errorQuda(
"Pthreads has not been built\n");
646 auto &dslashParam =
dslash.dslashParam;
648 dslashParam.threads = volume;
657 const int packIndex =
Nstream-1;
668 for (
int i=3;
i>=0;
i--) {
669 if (!dslashParam.commDim[
i])
continue;
671 for (
int dir=1; dir>=0; dir--) {
677 if (cudaSuccess == event_test) {
697 for (
int i=3;
i>=0;
i--) {
712 in->bufferIndex = (1 -
in->bufferIndex);
728 auto &dslashParam =
dslash.dslashParam;
730 dslashParam.threads = volume;
734 const int packIndex =
Nstream-1;
740 bool pack_event =
false;
741 for (
int p2p=0; p2p<2; p2p++) {
742 for (
int i=3;
i>=0;
i--) {
743 if (!dslashParam.commDim[
i])
continue;
746 cudaEventSynchronize(
packEnd[
in->bufferIndex]);
750 for (
int dir=1; dir>=0; dir--) {
762 for (
int i=3;
i>=0;
i--) {
763 if (!dslashParam.commDim[
i])
continue;
765 for (
int dir=1; dir>=0; dir--) {
778 dslashParam.kernel_type =
static_cast<KernelType>(
i);
779 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
790 in->bufferIndex = (1 -
in->bufferIndex);
806 auto &dslashParam =
dslash.dslashParam;
808 dslashParam.threads = volume;
812 const int packIndex =
Nstream-1;
818 bool pack_event =
false;
819 for (
int p2p=0; p2p<2; p2p++) {
820 for (
int i=3;
i>=0;
i--) {
821 if (!dslashParam.commDim[
i])
continue;
824 cudaEventSynchronize(
packEnd[
in->bufferIndex]);
828 for (
int dir=1; dir>=0; dir--) {
840 for (
int i=3;
i>=0;
i--) {
841 if (!dslashParam.commDim[
i])
continue;
843 for (
int dir=1; dir>=0; dir--) {
863 in->bufferIndex = (1 -
in->bufferIndex);
878 auto &dslashParam =
dslash.dslashParam;
880 dslashParam.threads = volume;
889 const int packIndex =
Nstream-1;
899 for (
int i=3;
i>=0;
i--) {
900 if (!dslashParam.commDim[
i])
continue;
902 for (
int dir=1; dir>=0; dir--) {
908 if (cudaSuccess == event_test) {
928 dslashParam.kernel_type =
static_cast<KernelType>(
i);
929 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
940 in->bufferIndex = (1 -
in->bufferIndex);
956 auto &dslashParam =
dslash.dslashParam;
958 dslashParam.threads = volume;
967 const int packIndex =
Nstream-1;
977 for (
int i=3;
i>=0;
i--) {
978 if (!dslashParam.commDim[
i])
continue;
980 for (
int dir=1; dir>=0; dir--) {
986 if (cudaSuccess == event_test) {
1013 in->bufferIndex = (1 -
in->bufferIndex);
1020 #define CUDA_CALL( call ) \ 1022 CUresult cudaStatus = call; \ 1023 if ( CUDA_SUCCESS != cudaStatus ) { \ 1024 const char *err_str = nullptr; \ 1025 cuGetErrorString(cudaStatus, &err_str); \ 1026 fprintf(stderr, "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", #call, __LINE__, __FILE__, err_str, cudaStatus); \ 1030 #define CUDA_CALL( call ) call 1038 #if (CUDA_VERSION >= 8000) 1040 void operator()(
DslashCuda &
dslash, cudaColorSpinorField*
in,
const int volume,
const int *faceVolumeCB, TimeProfile &profile) {
1045 auto &dslashParam =
dslash.dslashParam;
1047 dslashParam.threads = volume;
1056 const int packIndex =
Nstream-1;
1066 for (
int i=3;
i>=0;
i--) {
1067 if (!dslashParam.commDim[
i])
continue;
1069 for (
int dir=1; dir>=0; dir--) {
1075 if (cudaSuccess == event_test) {
1084 *((
volatile cuuint32_t*)(
commsEnd_h+2*
i+1-dir)) = 0;
1103 for (
int dir=1; dir>=0; dir--) {
1112 dslashParam.kernel_type =
static_cast<KernelType>(
i);
1113 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
1126 in->bufferIndex = (1 -
in->bufferIndex);
1132 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
1135 #endif // CUDA_VERSION >= 8000 1146 #if (CUDA_VERSION >= 8000) 1148 void operator()(
DslashCuda &
dslash, cudaColorSpinorField*
in,
const int volume,
const int *faceVolumeCB, TimeProfile &profile) {
1153 auto &dslashParam =
dslash.dslashParam;
1155 dslashParam.threads = volume;
1164 const int packIndex =
Nstream-1;
1175 for (
int i=3;
i>=0;
i--) {
1176 if (!dslashParam.commDim[
i])
continue;
1178 for (
int dir=1; dir>=0; dir--) {
1185 if (cudaSuccess == event_test) {
1194 *((
volatile cuuint32_t*)(
commsEnd_h+2*
i+1-dir)) = 0;
1214 for (
int i=3;
i>=0;
i--) {
1229 in->bufferIndex = (1 -
in->bufferIndex);
1236 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
1239 #endif // CUDA_VERSION >= 8000 1255 auto &dslashParam =
dslash.dslashParam;
1257 dslashParam.threads = volume;
1271 for (
int i=3;
i>=0;
i--) {
1278 for (
int p2p=0; p2p<2; p2p++) {
1279 for (
int i=3;
i>=0;
i--) {
1280 if (!dslashParam.commDim[
i])
continue;
1282 for (
int dir=1; dir>=0; dir--) {
1295 for (
int i=3;
i>=0;
i--) {
1296 if (!dslashParam.commDim[
i])
continue;
1298 for (
int dir=1; dir>=0; dir--) {
1311 for (
int dir=1; dir>=0; dir--) {
1320 dslashParam.kernel_type =
static_cast<KernelType>(
i);
1321 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
1332 in->bufferIndex = (1 -
in->bufferIndex);
1349 auto &dslashParam =
dslash.dslashParam;
1351 dslashParam.threads = volume;
1358 issuePack(*
in,
dslash, 1-dslashParam.parity, static_cast<MemoryLocation>(
Host | (
Remote*dslashParam.remote_write) ), packScatterIndex);
1365 for (
int i=3;
i>=0;
i--) {
1372 for (
int p2p=0; p2p<2; p2p++) {
1373 for (
int i=3;
i>=0;
i--) {
1374 if (!dslashParam.commDim[
i])
continue;
1376 for (
int dir=1; dir>=0; dir--) {
1389 for (
int i=3;
i>=0;
i--) {
1390 if (!dslashParam.commDim[
i])
continue;
1392 for (
int dir=1; dir>=0; dir--) {
1406 for (
int i=3;
i>=0;
i--) {
1422 in->bufferIndex = (1 -
in->bufferIndex);
1437 auto &dslashParam =
dslash.dslashParam;
1439 dslashParam.threads = volume;
1453 for (
int i=3;
i>=0;
i--) {
1460 for (
int p2p=0; p2p<2; p2p++) {
1461 for (
int i=3;
i>=0;
i--) {
1462 if (!dslashParam.commDim[
i])
continue;
1464 for (
int dir=1; dir>=0; dir--) {
1477 for (
int i=3;
i>=0;
i--) {
1478 if (!dslashParam.commDim[
i])
continue;
1480 for (
int dir=1; dir>=0; dir--) {
1493 dslashParam.kernel_type =
static_cast<KernelType>(
i);
1494 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
1505 in->bufferIndex = (1 -
in->bufferIndex);
1522 auto &dslashParam =
dslash.dslashParam;
1524 dslashParam.threads = volume;
1538 for (
int i=3;
i>=0;
i--) {
1545 for (
int p2p=0; p2p<2; p2p++) {
1546 for (
int i=3;
i>=0;
i--) {
1547 if (!dslashParam.commDim[
i])
continue;
1549 for (
int dir=1; dir>=0; dir--) {
1562 for (
int i=3;
i>=0;
i--) {
1563 if (!dslashParam.commDim[
i])
continue;
1565 for (
int dir=1; dir>=0; dir--) {
1585 in->bufferIndex = (1 -
in->bufferIndex);
1602 auto &dslashParam =
dslash.dslashParam;
1604 dslashParam.threads = volume;
1618 for (
int i=3;
i>=0;
i--) {
1625 for (
int p2p=0; p2p<2; p2p++) {
1626 for (
int i=3;
i>=0;
i--) {
1627 if (!dslashParam.commDim[
i])
continue;
1629 for (
int dir=1; dir>=0; dir--) {
1642 for (
int i=3;
i>=0;
i--) {
1643 if (!dslashParam.commDim[
i])
continue;
1645 for (
int dir=1; dir>=0; dir--) {
1660 dslashParam.kernel_type =
static_cast<KernelType>(
i);
1661 dslashParam.threads =
dslash.Nface()*faceVolumeCB[
i];
1672 in->bufferIndex = (1 -
in->bufferIndex);
1689 auto &dslashParam =
dslash.dslashParam;
1691 dslashParam.threads = volume;
1705 for (
int i=3;
i>=0;
i--) {
1712 for (
int p2p=0; p2p<2; p2p++) {
1713 for (
int i=3;
i>=0;
i--) {
1714 if (!dslashParam.commDim[
i])
continue;
1716 for (
int dir=1; dir>=0; dir--) {
1729 for (
int i=3;
i>=0;
i--) {
1730 if (!dslashParam.commDim[
i])
continue;
1732 for (
int dir=1; dir>=0; dir--) {
1756 in->bufferIndex = (1 -
in->bufferIndex);
1768 auto &dslashParam =
dslash.dslashParam;
1770 dslashParam.threads = volume;
1799 static std::vector<QudaDslashPolicy>
policies(static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED), QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED);
1808 static std::vector<QudaP2PPolicy>
p2p_policies(static_cast<int>(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED), QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED);
1816 switch(dslashPolicy){
1817 case QudaDslashPolicy::QUDA_DSLASH:
1820 case QudaDslashPolicy::QUDA_DSLASH_ASYNC:
1823 case QudaDslashPolicy::QUDA_PTHREADS_DSLASH:
1826 case QudaDslashPolicy::QUDA_FUSED_DSLASH:
1829 case QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC:
1832 case QudaDslashPolicy::QUDA_GDR_DSLASH:
1836 case QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH:
1840 case QudaDslashPolicy::QUDA_GDR_RECV_DSLASH:
1844 case QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH:
1848 case QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH:
1851 case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH:
1854 case QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1858 case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH:
1862 case QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH:
1865 case QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH:
1868 case QudaDslashPolicy::QUDA_DSLASH_NC:
1872 errorQuda(
"Dslash policy %d not recognized",static_cast<int>(dslashPolicy));
1899 cudaColorSpinorField *
in;
1911 const int volume,
const int *ghostFace, TimeProfile &profile)
1912 :
dslash(
dslash), dslashParam(
dslash.dslashParam),
in(
in), volume(volume), ghostFace(ghostFace), profile(profile)
1922 p2p_policies[
static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE)] = QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE;
1925 p2p_policies[
static_cast<std::size_t>(QudaP2PPolicy::QUDA_P2P_COPY_ENGINE)] = QudaP2PPolicy::QUDA_P2P_COPY_ENGINE;
1930 static char *dslash_policy_env =
getenv(
"QUDA_ENABLE_DSLASH_POLICY");
1931 if (dslash_policy_env) {
1932 std::stringstream policy_list(dslash_policy_env);
1935 while (policy_list >> policy_) {
1939 if ( ( dslash_policy == QudaDslashPolicy::QUDA_GDR_DSLASH ||
1940 dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
1941 dslash_policy == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
1942 dslash_policy == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH)
1944 errorQuda(
"Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", static_cast<int>(dslash_policy));
1949 if (policy_list.peek() ==
',') policy_list.ignore();
1951 if (
first_active_policy == static_cast<int>(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED))
errorQuda(
"No valid policy found in QUDA_ENABLE_DSLASH_POLICY");
1963 enable_policy(QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH);
1966 enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH);
1967 enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH);
1970 enable_policy(QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1971 enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH);
1974 #ifdef USE_TEXTURE_OBJECTS 1977 enable_policy(QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH);
1981 #if (CUDA_VERSION >= 8000) && 0 1982 #if (CUDA_VERSION >= 9000) 1985 int can_use_stream_mem_ops;
1986 cuDeviceGetAttribute(&can_use_stream_mem_ops, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS,
device);
1988 int can_use_stream_mem_ops = 1;
1990 if (can_use_stream_mem_ops) {
1997 static char *dslash_pack_env =
getenv(
"QUDA_ENABLE_DSLASH_PACK");
1998 if (dslash_pack_env &&
strcmp(dslash_pack_env,
"0") == 0) {
2003 static char *dslash_interior_env =
getenv(
"QUDA_ENABLE_DSLASH_INTERIOR");
2004 if (dslash_interior_env &&
strcmp(dslash_interior_env,
"0") == 0) {
2009 static char *dslash_exterior_env =
getenv(
"QUDA_ENABLE_DSLASH_EXTERIOR");
2010 if (dslash_exterior_env &&
strcmp(dslash_exterior_env,
"0") == 0) {
2015 static char *dslash_copy_env =
getenv(
"QUDA_ENABLE_DSLASH_COPY");
2016 if (dslash_copy_env &&
strcmp(dslash_copy_env,
"0") == 0) {
2021 static char *dslash_comms_env =
getenv(
"QUDA_ENABLE_DSLASH_COMMS");
2022 if (dslash_comms_env &&
strcmp(dslash_comms_env,
"0") == 0) {
2035 if (p2p == QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED)
continue;
2039 dslashParam.
remote_write = (p2p == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0);
2043 if ( (
i == QudaDslashPolicy::QUDA_DSLASH ||
2044 i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
2045 i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
2046 i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) &&
2050 (*dslashImp)(
dslash,
in, volume, ghostFace, profile);
2053 }
else if ( (
i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2054 i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2055 i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
2056 i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH ||
2057 i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
2058 i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
2059 i == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2060 i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2061 i == QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH ||
2062 i == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH) ||
2063 ((
i == QudaDslashPolicy::QUDA_DSLASH ||
2064 i == QudaDslashPolicy::QUDA_FUSED_DSLASH ||
2065 i == QudaDslashPolicy::QUDA_DSLASH_ASYNC ||
2066 i == QudaDslashPolicy::QUDA_FUSED_DSLASH_ASYNC) && dslashParam.
remote_write) ) {
2077 if ( (
i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2078 i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2079 i == QudaDslashPolicy::QUDA_GDR_RECV_DSLASH ||
2080 i == QudaDslashPolicy::QUDA_FUSED_GDR_RECV_DSLASH) && !dslashParam.
remote_write ) {
2082 QudaDslashPolicy::QUDA_DSLASH : QudaDslashPolicy::QUDA_FUSED_DSLASH;
2085 (*dslashImp)(
dslash,
in, volume, ghostFace, profile);
2087 (*dslashImp)(
dslash,
in, volume, ghostFace, profile);
2094 (*dslashImp)(
dslash,
in, volume, ghostFace, profile);
2100 }
else if (
i != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED){
2101 errorQuda(
"Unsupported dslash policy %d\n", static_cast<int>(
i));
2119 if (
config != tp.aux.w) {
2120 errorQuda(
"Machine configuration (P2P/GDR=%d) changed since tunecache was created (P2P/GDR=%d). Please delete " 2121 "this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.",
2125 if (tp.aux.x >= static_cast<int>(
policies.size()))
errorQuda(
"Requested policy that is outside of range");
2126 if (static_cast<QudaDslashPolicy>(tp.aux.x) == QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED)
errorQuda(
"Requested policy is disabled");
2135 if (
p == QudaDslashPolicy::QUDA_GDR_DSLASH ||
2136 p == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
2137 p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_DSLASH ||
2138 p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_DSLASH ||
2139 p == QudaDslashPolicy::QUDA_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2140 p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_PACK_GDR_RECV_DSLASH ||
2141 p == QudaDslashPolicy::QUDA_ZERO_COPY_DSLASH ||
2142 p == QudaDslashPolicy::QUDA_FUSED_ZERO_COPY_DSLASH ||
2148 DslashPolicyImp* dslashImp = DslashFactory::create(static_cast<QudaDslashPolicy>(tp.aux.x));
2149 (*dslashImp)(
dslash,
in, volume, ghostFace, profile);
2166 if (
policies[
param.aux.x] != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED)
return true;
2172 if (
p2p_policies[
param.aux.y] != QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED)
return true;
2182 Tunable::initTuneParam(
param);
2187 Tunable::defaultTuneParam(
param);
2194 TuneKey key =
dslash.tuneKey();
2203 long long flops_ =
dslash.flops();
2211 long long bytes_ =
dslash.bytes();
void disable_policy(QudaDslashPolicy p)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
std::map< TuneKey, TuneParam > map
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
static int first_active_policy
QudaVerbosity getVerbosity()
cudaEvent_t scatterEnd[Nstream]
int dslashCompleted[Nstream]
#define PROFILE(f, profile, idx)
void setFusedParam(DslashParam &param, DslashCuda &dslash, const int *faceVolumeCB)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
int commsCompleted[Nstream]
bool commsComplete(cudaColorSpinorField &in, const DslashCuda &dslash, int dim, int dir, bool gdr_send, bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex=-1)
Wrapper for querying if communication is finished in the dslash, and if it is take the appropriate ac...
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
void disableProfileCount()
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void initTuneParam(TuneParam &param) const
int getStreamIndex(const T &dslashParam)
Returns a stream index for posting the pack/scatters to. We desire a stream index that is not being u...
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
char * index(const char *, int)
int strcmp(const char *__s1, const char *__s2)
static bool dslash_pack_compute
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
virtual ~DslashPolicyImp()
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
const map & getTuneCache()
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
cudaEvent_t gatherEnd[Nstream]
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
static std::vector< DslashCoarsePolicy > policy
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void setPolicyTuning(bool)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
bool advanceTuneParam(TuneParam &param) const
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void setMappedGhost(DslashCuda &dslash, cudaColorSpinorField &in, bool to_mapped)
Set the ghosts to the mapped CPU ghost buffer, or unsets if already set. Note this must not be called...
void apply(const cudaStream_t &stream)
virtual ~DslashPolicyTune()
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
unsigned int sharedBytesPerBlock(const TuneParam &param) const
static std::vector< QudaP2PPolicy > p2p_policies(static_cast< int >(QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED), QudaP2PPolicy::QUDA_P2P_POLICY_DISABLED)
bool comm_peer2peer_enabled(int dir, int dim)
DslashPolicyTune(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *ghostFace, TimeProfile &profile)
cudaEvent_t dslashStart[2]
static cudaColorSpinorField * inSpinor
void enableProfileCount()
cudaColorSpinorField * in
static bool dslash_exterior_compute
DslashCommsPattern(const int commDim[], bool gdr_send=false)
static bool dslash_interior_compute
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
void setKernelPackT(bool pack)
void issueRecv(cudaColorSpinorField &input, const DslashCuda &dslash, cudaStream_t *stream, bool gdr)
This helper function simply posts all receives in all directions.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
void enable_policy(QudaDslashPolicy p)
void defaultTuneParam(TuneParam &param) const
bool advanceAux(TuneParam &param) const
unsigned int sharedBytesPerThread() const
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
DslashParam & dslashParam
static std::vector< QudaDslashPolicy > policies(static_cast< int >(QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED), QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void issueGather(cudaColorSpinorField &in, const DslashCuda &dslash)
This helper function simply posts the device-host memory copies of all halos in all dimensions and di...
void completeDslash(const ColorSpinorField &in, const T &dslashParam)
Ensure that the dslash is complete. By construction, the dslash will have completed (or is in flight)...
int comm_peer2peer_enabled_global()
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
char * getenv(const char *)
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
static DslashPolicyImp * create(const QudaDslashPolicy &dslashPolicy)
static int first_active_p2p_policy
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
int gatherCompleted[Nstream]
cudaEvent_t cudaEvent_t end
void issuePack(cudaColorSpinorField &in, const DslashCuda &dslash, int parity, MemoryLocation location, int packIndex)
This helper function simply posts the packing kernel needed for halo exchange.
void operator()(DslashCuda &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
CUdeviceptr commsEnd_d[Nstream]
enum cudaDeviceAttr attr int device