22 #if CUDA_VERSION >= 8000 23 extern cuuint32_t *commsEnd_h;
24 extern CUdeviceptr commsEnd_d[
Nstream];
53 for (
int i = 0; i <
Nstream - 1; i++) gatherCompleted[i] = gdr_send ? 1 : 0;
54 gatherCompleted[Nstream - 1] = 1;
55 commsCompleted[Nstream - 1] = 1;
56 dslashCompleted[Nstream - 1] = 1;
62 for (
int i = 3; i >= 0; i--) {
64 int prev = Nstream - 1;
65 for (
int j = 3; j > i; j--)
67 previousDir[2 * i + 1] = prev;
68 previousDir[2 * i + 0] = 2 * i + 1;
75 for (
int i = 3; i >= 0; i--) { commDimTotal +=
commDim[i]; }
76 commDimTotal *= gdr_send ? 2 : 4;
80 template <
typename Arg,
typename Dslash>
86 for (
int i = 0; i < 4; ++i) {
87 param.threadDimMapLower[i] = 0;
88 param.threadDimMapUpper[i] = 0;
90 param.threadDimMapLower[i] = (prev >= 0 ? param.threadDimMapUpper[prev] : 0);
91 param.threadDimMapUpper[i] = param.threadDimMapLower[i] + dslash.
Nface() * faceVolumeCB[i];
92 param.threads = param.threadDimMapUpper[i];
100 #ifdef DSLASH_PROFILE 101 #define PROFILE(f, profile, idx) \ 102 profile.TPSTART(idx); \ 106 #define PROFILE(f, profile, idx) f; 117 template <
typename Dslash>
120 for(
int i=3; i>=0; i--){
122 for(
int dir=1; dir>=0; dir--) {
138 template <
typename Dslash>
143 if ( (location &
Device) &
Host)
errorQuda(
"MemoryLocation cannot be both Device and Host");
146 for (
int i=3; i>=0; i--)
153 for (
int dim=0; dim<4; dim++) {
154 for (
int dir=0; dir<2; dir++) {
156 pack_dest[2*dim+dir] =
Remote;
158 pack_dest[2*dim+dir] =
Host;
160 pack_dest[2*dim+dir] =
Device;
166 location,
arg.spin_project,
arg.twist_a,
arg.twist_b,
arg.twist_c),
183 for (
int i = 3; i >=0; i--) {
186 for (
int dir=1; dir>=0; dir--) {
212 template <
typename T>
216 for (
int i = 3; i >=0; i--) {
217 if (!dslashParam.commDim[i])
continue;
222 if (index == -1) index = 0;
252 template <
typename Dslash>
254 bool gdr_recv,
bool zero_copy_recv,
bool async,
int scatterIndex = -1)
257 cudaStream_t *
stream =
nullptr;
269 if (!gdr_recv && !zero_copy_recv) {
272 #if (CUDA_VERSION >= 8000) && 0 274 *((
volatile cuuint32_t*)(commsEnd_h+2*dim+dir2)) = 1;
276 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
282 if (scatterIndex == -1) scatterIndex = 2*dim+dir;
303 template <
typename T>
307 for (
int dim=3; dim>=0; dim--) {
308 if (!dslashParam.commDim[dim])
continue;
309 for (
int dir=0; dir<2; dir++) {
332 static bool set_mapped =
false;
335 if (set_mapped)
errorQuda(
"set_mapped already set");
345 if (!set_mapped)
errorQuda(
"set_mapped not set");
355 virtual void operator()(
375 dslashParam.threads = volume;
384 const int packIndex =
Nstream - 1;
386 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
396 for (
int i = 3; i >= 0; i--) {
397 if (!dslashParam.commDim[i])
continue;
399 for (
int dir = 1; dir >= 0; dir--) {
404 if (event_test != cudaSuccess)
407 if (cudaSuccess == event_test) {
411 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
418 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
false)) {
429 for (
int dir = 1; dir >= 0; dir--) {
441 dslashParam.kernel_type =
static_cast<KernelType>(i);
442 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
471 dslashParam.threads = volume;
480 const int packIndex =
Nstream - 1;
482 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
493 for (
int i = 3; i >= 0; i--) {
494 if (!dslashParam.commDim[i])
continue;
496 for (
int dir = 1; dir >= 0; dir--) {
500 if (event_test != cudaSuccess)
503 if (cudaSuccess == event_test) {
507 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
514 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
false, scatterIndex)) {
523 for (
int i = 3; i >= 0; i--) {
524 if (dslashParam.commDim[i]
559 dslashParam.threads = volume;
563 const int packIndex =
Nstream - 1;
565 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
571 bool pack_event =
false;
572 for (
int p2p = 0; p2p < 2; p2p++) {
573 for (
int i = 3; i >= 0; i--) {
574 if (!dslashParam.commDim[i])
continue;
581 for (
int dir = 1; dir >= 0; dir--) {
584 dslashParam.remote_write ?
streams + packIndex :
nullptr,
true, dslashParam.remote_write),
593 for (
int i = 3; i >= 0; i--) {
594 if (!dslashParam.commDim[i])
continue;
596 for (
int dir = 1; dir >= 0; dir--) {
600 if (
commsComplete(*in, dslash, i, dir,
true,
true,
false,
false)) {
611 dslashParam.kernel_type =
static_cast<KernelType>(i);
612 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
641 dslashParam.threads = volume;
645 const int packIndex =
Nstream - 1;
647 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
653 bool pack_event =
false;
654 for (
int p2p = 0; p2p < 2; p2p++) {
655 for (
int i = 3; i >= 0; i--) {
656 if (!dslashParam.commDim[i])
continue;
663 for (
int dir = 1; dir >= 0; dir--) {
666 dslashParam.remote_write ?
streams + packIndex :
nullptr,
true, dslashParam.remote_write),
675 for (
int i = 3; i >= 0; i--) {
676 if (!dslashParam.commDim[i])
continue;
678 for (
int dir = 1; dir >= 0; dir--) {
682 if (
commsComplete(*in, dslash, i, dir,
true,
true,
false,
false)) {
716 dslashParam.threads = volume;
725 const int packIndex =
Nstream - 1;
727 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
737 for (
int i = 3; i >= 0; i--) {
738 if (!dslashParam.commDim[i])
continue;
740 for (
int dir = 1; dir >= 0; dir--) {
744 if (event_test != cudaSuccess)
747 if (cudaSuccess == event_test) {
751 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
758 if (
commsComplete(*in, dslash, i, dir,
false,
true,
false,
false)) {
768 dslashParam.kernel_type =
static_cast<KernelType>(i);
769 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
798 dslashParam.threads = volume;
807 const int packIndex =
Nstream - 1;
809 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
819 for (
int i = 3; i >= 0; i--) {
820 if (!dslashParam.commDim[i])
continue;
822 for (
int dir = 1; dir >= 0; dir--) {
826 if (event_test != cudaSuccess)
829 if (cudaSuccess == event_test) {
833 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
840 if (
commsComplete(*in, dslash, i, dir,
false,
true,
false,
false)) {
862 #define CUDA_CALL( call ) \ 864 CUresult cudaStatus = call; \ 865 if ( CUDA_SUCCESS != cudaStatus ) { \ 866 const char *err_str = nullptr; \ 867 cuGetErrorString(cudaStatus, &err_str); \ 868 fprintf(stderr, "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", #call, __LINE__, __FILE__, err_str, cudaStatus); \ 872 #define CUDA_CALL( call ) call 880 #if (CUDA_VERSION >= 8000) && 0 890 dslashParam.threads = volume;
899 const int packIndex =
Nstream - 1;
901 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
911 for (
int i = 3; i >= 0; i--) {
912 if (!dslashParam.commDim[i])
continue;
914 for (
int dir = 1; dir >= 0; dir--) {
918 if (event_test != cudaSuccess)
921 if (cudaSuccess == event_test) {
925 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
930 *((
volatile cuuint32_t *)(commsEnd_h + 2 * i + 1 - dir)) = 0;
932 streams[2 * i + dir], commsEnd_d[2 * i + 1 - dir], 1, CU_STREAM_WAIT_VALUE_EQ));
942 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
true)) {
953 for (
int dir = 1; dir >= 0; dir--) {
965 dslashParam.kernel_type =
static_cast<KernelType>(i);
966 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
985 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
988 #endif // CUDA_VERSION >= 8000 997 #if (CUDA_VERSION >= 8000) && 0 1007 dslashParam.threads = volume;
1016 const int packIndex =
Nstream - 1;
1018 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Device | (
Remote * dslashParam.remote_write)),
1029 for (
int i = 3; i >= 0; i--) {
1030 if (!dslashParam.commDim[i])
continue;
1032 for (
int dir = 1; dir >= 0; dir--) {
1037 if (event_test != cudaSuccess)
1040 if (cudaSuccess == event_test) {
1044 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1049 *((
volatile cuuint32_t *)(commsEnd_h + 2 * i + 1 - dir)) = 0;
1051 streams[scatterIndex], commsEnd_d[2 * i + 1 - dir], 1, CU_STREAM_WAIT_VALUE_EQ));
1061 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
true, scatterIndex)) {
1071 for (
int i = 3; i >= 0; i--) {
1095 errorQuda(
"Async dslash policy variants require CUDA 8.0 and above");
1098 #endif // CUDA_VERSION >= 8000 1115 dslashParam.threads = volume;
1126 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1132 for (
int i = 3; i >= 0; i--) {
1133 if (!dslashParam.remote_write
1140 for (
int p2p = 0; p2p < 2; p2p++) {
1141 for (
int i = 3; i >= 0; i--) {
1142 if (!dslashParam.commDim[i])
continue;
1144 for (
int dir = 1; dir >= 0; dir--) {
1147 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1157 for (
int i = 3; i >= 0; i--) {
1158 if (!dslashParam.commDim[i])
continue;
1160 for (
int dir = 1; dir >= 0; dir--) {
1164 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
false)) {
1173 for (
int dir = 1; dir >= 0; dir--) {
1185 dslashParam.kernel_type =
static_cast<KernelType>(i);
1186 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
1216 dslashParam.threads = volume;
1225 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1233 for (
int i = 3; i >= 0; i--) {
1234 if (!dslashParam.remote_write
1241 for (
int p2p = 0; p2p < 2; p2p++) {
1242 for (
int i = 3; i >= 0; i--) {
1243 if (!dslashParam.commDim[i])
continue;
1245 for (
int dir = 1; dir >= 0; dir--) {
1249 dslashParam.remote_write ?
streams + packScatterIndex :
nullptr,
false, dslashParam.remote_write),
1259 for (
int i = 3; i >= 0; i--) {
1260 if (!dslashParam.commDim[i])
continue;
1262 for (
int dir = 1; dir >= 0; dir--) {
1266 if (
commsComplete(*in, dslash, i, dir,
false,
false,
false,
false, packScatterIndex)) {
1276 for (
int i = 3; i >= 0; i--) {
1310 dslashParam.threads = volume;
1321 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1327 for (
int i = 3; i >= 0; i--) {
1328 if (!dslashParam.remote_write
1335 for (
int p2p = 0; p2p < 2; p2p++) {
1336 for (
int i = 3; i >= 0; i--) {
1337 if (!dslashParam.commDim[i])
continue;
1339 for (
int dir = 1; dir >= 0; dir--) {
1342 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1352 for (
int i = 3; i >= 0; i--) {
1353 if (!dslashParam.commDim[i])
continue;
1355 for (
int dir = 1; dir >= 0; dir--) {
1359 if (
commsComplete(*in, dslash, i, dir,
false,
true,
false,
false)) {
1369 dslashParam.kernel_type =
static_cast<KernelType>(i);
1370 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
1400 dslashParam.threads = volume;
1409 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1417 for (
int i = 3; i >= 0; i--) {
1418 if (!dslashParam.remote_write
1425 for (
int p2p = 0; p2p < 2; p2p++) {
1426 for (
int i = 3; i >= 0; i--) {
1427 if (!dslashParam.commDim[i])
continue;
1429 for (
int dir = 1; dir >= 0; dir--) {
1432 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1442 for (
int i = 3; i >= 0; i--) {
1443 if (!dslashParam.commDim[i])
continue;
1445 for (
int dir = 1; dir >= 0; dir--) {
1449 if (
commsComplete(*in, dslash, i, dir,
false,
true,
false,
false)) {
1484 dslashParam.threads = volume;
1495 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1501 for (
int i = 3; i >= 0; i--) {
1502 if (!dslashParam.remote_write
1509 for (
int p2p = 0; p2p < 2; p2p++) {
1510 for (
int i = 3; i >= 0; i--) {
1511 if (!dslashParam.commDim[i])
continue;
1513 for (
int dir = 1; dir >= 0; dir--) {
1516 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1526 for (
int i = 3; i >= 0; i--) {
1527 if (!dslashParam.commDim[i])
continue;
1529 for (
int dir = 1; dir >= 0; dir--) {
1533 if (
commsComplete(*in, dslash, i, dir,
false,
false,
true,
false)) {
1543 dslashParam.kernel_type =
static_cast<KernelType>(i);
1544 dslashParam.threads = dslash.
Nface() * faceVolumeCB[i];
1574 dslashParam.threads = volume;
1585 issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(
Host | (
Remote * dslashParam.remote_write)),
1591 for (
int i = 3; i >= 0; i--) {
1592 if (!dslashParam.remote_write
1599 for (
int p2p = 0; p2p < 2; p2p++) {
1600 for (
int i = 3; i >= 0; i--) {
1601 if (!dslashParam.commDim[i])
continue;
1603 for (
int dir = 1; dir >= 0; dir--) {
1606 dslashParam.remote_write ?
streams + packIndex :
nullptr,
false, dslashParam.remote_write),
1616 for (
int i = 3; i >= 0; i--) {
1617 if (!dslashParam.commDim[i])
continue;
1619 for (
int dir = 1; dir >= 0; dir--) {
1623 if (
commsComplete(*in, dslash, i, dir,
false,
false,
true,
false)) {
1655 dslashParam.threads = volume;
1690 extern std::vector<QudaDslashPolicy>
policies;
1711 switch (dslashPolicy) {
1757 default:
errorQuda(
"Dslash policy %d not recognized", static_cast<int>(dslashPolicy));
break;
1789 dslashParam(dslash.dslashParam),
1792 ghostFace(ghostFace),
1797 if (!dslash_policy_init) {
1819 static char *dslash_policy_env = getenv(
"QUDA_ENABLE_DSLASH_POLICY");
1820 if (dslash_policy_env) {
1821 std::stringstream policy_list(dslash_policy_env);
1824 while (policy_list >> policy_) {
1833 errorQuda(
"Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", static_cast<int>(dslash_policy));
1838 if (policy_list.peek() ==
',') policy_list.ignore();
1841 errorQuda(
"No valid policy found in QUDA_ENABLE_DSLASH_POLICY");
1844 first_active_policy = 0;
1868 #if (CUDA_VERSION >= 8000) && 0 1869 #if (CUDA_VERSION >= 9000) 1872 int can_use_stream_mem_ops;
1873 cuDeviceGetAttribute(&can_use_stream_mem_ops, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, device);
1875 int can_use_stream_mem_ops = 1;
1877 if (can_use_stream_mem_ops) {
1886 strcat(policy_string, (
int)policies[i] == i ?
"1" :
"0");
1889 static char *dslash_pack_env = getenv(
"QUDA_ENABLE_DSLASH_PACK");
1890 if (dslash_pack_env && strcmp(dslash_pack_env,
"0") == 0) {
1892 dslash_pack_compute =
false;
1895 static char *dslash_interior_env = getenv(
"QUDA_ENABLE_DSLASH_INTERIOR");
1896 if (dslash_interior_env && strcmp(dslash_interior_env,
"0") == 0) {
1898 dslash_interior_compute =
false;
1901 static char *dslash_exterior_env = getenv(
"QUDA_ENABLE_DSLASH_EXTERIOR");
1902 if (dslash_exterior_env && strcmp(dslash_exterior_env,
"0") == 0) {
1904 dslash_exterior_compute =
false;
1907 static char *dslash_copy_env = getenv(
"QUDA_ENABLE_DSLASH_COPY");
1908 if (dslash_copy_env && strcmp(dslash_copy_env,
"0") == 0) {
1910 dslash_copy =
false;
1913 static char *dslash_comms_env = getenv(
"QUDA_ENABLE_DSLASH_COMMS");
1914 if (dslash_comms_env && strcmp(dslash_comms_env,
"0") == 0) {
1916 dslash_comms =
false;
1925 for (
auto &p2p : p2p_policies) {
1933 for (
auto &i : policies) {
1939 !dslashParam.remote_write) {
1942 (*dslashImp)(dslash,
in, volume, ghostFace, profile);
1980 (*dslashImp)(dslash,
in, volume, ghostFace, profile);
1982 (*dslashImp)(dslash,
in, volume, ghostFace, profile);
1989 (*dslashImp)(dslash,
in, volume, ghostFace, profile);
1996 errorQuda(
"Unsupported dslash policy %d\n", static_cast<int>(i));
2006 dslash_policy_init =
true;
2014 if (tp.
aux.x >= static_cast<int>(policies.size()))
errorQuda(
"Requested policy that is outside of range");
2032 dslashParam.remote_write
2038 (*dslashImp)(dslash,
in, volume, ghostFace, profile);
2053 while ((
unsigned)param.
aux.x < policies.size()-1) {
2059 while ((
unsigned)param.
aux.y < p2p_policies.size()-1) {
2085 KernelType kernel_type = dslashParam.kernel_type;
2090 strcat(key.
aux, policy_string);
2091 dslashParam.kernel_type = kernel_type;
2096 KernelType kernel_type = dslashParam.kernel_type;
2098 long long flops_ = dslash.
flops();
2099 dslashParam.kernel_type = kernel_type;
2104 KernelType kernel_type = dslashParam.kernel_type;
2106 long long bytes_ = dslash.
bytes();
2107 dslashParam.kernel_type = kernel_type;
virtual void apply(const cudaStream_t &stream)=0
bool dslash_exterior_compute
DslashPolicyTune(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *ghostFace, TimeProfile &profile)
virtual void postTune()
Restore the output field if doing exterior kernel.
bool dslash_interior_compute
cudaEvent_t scatterStart[Nstream]
cudaEvent_t gatherStart[Nstream]
static DslashPolicyImp< Dslash > * create(const QudaDslashPolicy &dslashPolicy)
void streamInit(cudaStream_t *stream_p)
void apply(const cudaStream_t &stream)
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaEventRecord or cuEventRecord.
void disableProfileCount()
Disable the profile kernel counting.
int gatherCompleted[Nstream]
QudaVerbosity getVerbosity()
void issueRecv(cudaColorSpinorField &input, const Dslash &dslash, cudaStream_t *stream, bool gdr)
This helper function simply posts all receives in all directions.
void gather(int nFace, int dagger, int dir, cudaStream_t *stream_p=NULL)
#define PROFILE(f, profile, idx)
int getStreamIndex(const T &dslashParam)
Returns a stream index for posting the pack/scatters to. We desire a stream index that is not being u...
void augmentAux(KernelType type, const char *extra)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
virtual TuneKey tuneKey() const =0
cudaEvent_t dslashStart[2]
int commsQuery(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Non-blocking query if the halo communication has completed.
void enableProfileCount()
Enable the profile kernel counting.
void completeDslash(const ColorSpinorField &in, const T &dslashParam)
Ensure that the dslash is complete. By construction, the dslash will have completed (or is in flight)...
void comm_enable_peer2peer(bool enable)
Enable / disable peer-to-peer communication: used for dslash policies that do not presently support p...
std::vector< QudaP2PPolicy > p2p_policies
void setAux(KernelType type, const char *aux_)
char policy_string[TuneKey::aux_n]
virtual long long bytes() const
void defaultTuneParam(TuneParam ¶m) const
std::vector< QudaDslashPolicy > policies
const cudaEvent_t & getIPCCopyEvent(int dir, int dim) const
unsigned int sharedBytesPerThread() const
int commsCompleted[Nstream]
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
const cudaEvent_t & getIPCRemoteCopyEvent(int dir, int dim) const
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
DslashCommsPattern(const int commDim[], bool gdr_send=false)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void setMappedGhost(Dslash &dslash, ColorSpinorField &in, bool to_mapped)
Set the ghosts to the mapped CPU ghost buffer, or unsets if already set. Note this must not be called...
DslashArg< Float > & dslashParam
void sendStart(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false, bool remote_write=false)
Initiate halo communication sending.
void issueGather(cudaColorSpinorField &in, const Dslash &dslash)
This helper function simply posts the device-host memory copies of all halos in all dimensions and di...
void setPolicyTuning(bool)
Enable / disable whether are tuning a policy.
int first_active_p2p_policy
bool comm_peer2peer_enabled(int dir, int dim)
void issuePack(cudaColorSpinorField &in, const Dslash &dslash, int parity, MemoryLocation location, int packIndex)
This helper function simply posts the packing kernel needed for halo exchange.
static int index(int ndim, const int *dims, const int *x)
virtual ~DslashPolicyImp()
void enable_policy(QudaDslashPolicy p)
void initTuneParam(TuneParam ¶m) const
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
static int commDim[QUDA_MAX_DIM]
__device__ __host__ void pack(Arg &arg, int ghost_idx, int s, int parity)
void pack(int nFace, int parity, int dagger, int stream_idx, MemoryLocation location[], MemoryLocation location_label, bool spin_project=true, double a=0, double b=0, double c=0)
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
cudaEvent_t scatterEnd[Nstream]
bool commsComplete(cudaColorSpinorField &in, const Dslash &dslash, int dim, int dir, bool gdr_send, bool gdr_recv, bool zero_copy_recv, bool async, int scatterIndex=-1)
Wrapper for querying if communication is finished in the dslash, and if it is take the appropriate ac...
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
void setKernelPackT(bool pack)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
static cudaColorSpinorField * inSpinor
virtual void initTuneParam(TuneParam ¶m) const
int dslashCompleted[Nstream]
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void setFusedParam(Arg ¶m, Dslash &dslash, const int *faceVolumeCB)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
void pushKernelPackT(bool pack)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
bool advanceTuneParam(TuneParam ¶m) const
decltype(dslash.dslashParam) & dslashParam
const std::map< TuneKey, TuneParam > & getTuneCache()
Returns a reference to the tunecache map.
virtual long long flops() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void scatter(int nFace, int dagger, int dir, cudaStream_t *stream_p)
virtual void preTune()
Save the output field since the output field is both read from and written to in the exterior kernels...
cudaColorSpinorField * in
void recvStart(int nFace, int dir, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false)
Initiate halo communication receive.
const char * getAux(KernelType type) const
int comm_peer2peer_enabled_global()
bool comm_gdr_blacklist()
Query if GPU Direct RDMA communication is blacklisted for this GPU.
virtual ~DslashPolicyTune()
const char * comm_config_string()
Return a string that defines the P2P/GDR environment variable configuration (for use as a tuneKey to ...
cudaEvent_t gatherEnd[Nstream]
virtual void apply(const cudaStream_t &stream)=0
virtual void defaultTuneParam(TuneParam ¶m) const
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
bool advanceAux(TuneParam ¶m) const
void disable_policy(QudaDslashPolicy p)
void operator()(Dslash &dslash, cudaColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)