1 #include <unordered_set>
15 #define PROFILE(f, idx) \
16 apiTimer.TPSTART(idx); \
20 #define PROFILE(f, idx) f;
34 void qudaFuncSetAttribute_(
const void *kernel, cudaFuncAttribute attr,
int value,
const char *func,
const char *file,
42 void qudaFuncGetAttributes_(cudaFuncAttributes &attr,
const void *kernel,
const char *func,
const char *file,
45 #define qudaFuncSetAttribute(kernel, attr, value) \
46 ::quda::qudaFuncSetAttribute_(kernel, attr, value, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
48 #define qudaFuncGetAttributes(attr, kernel) \
49 ::quda::qudaFuncGetAttributes_(attr, kernel, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
52 static TimeProfile apiTimer(
"CUDA API calls (driver)");
54 static TimeProfile apiTimer(
"CUDA API calls (runtime)");
60 static std::unordered_set<const void *> cache;
61 auto search = cache.find(func);
62 if (search == cache.end()) {
64 qudaFuncSetAttribute(func, cudaFuncAttributePreferredSharedMemoryCarveout, (
int)cudaSharedmemCarveoutMaxShared);
65 cudaFuncAttributes attributes;
86 const cudaMemcpyKind kind;
89 const bool active_tuning;
91 unsigned int sharedBytesPerThread()
const {
return 0; }
92 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
95 inline QudaMem(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
const cudaStream_t &
stream,
96 bool async,
const char *func,
const char *file,
const char *line) :
108 case cudaMemcpyDeviceToHost: name =
"cudaMemcpyDeviceToHost";
break;
109 case cudaMemcpyHostToDevice: name =
"cudaMemcpyHostToDevice";
break;
110 case cudaMemcpyHostToHost: name =
"cudaMemcpyHostToHost";
break;
111 case cudaMemcpyDeviceToDevice: name =
"cudaMemcpyDeviceToDevice";
break;
112 case cudaMemcpyDefault: name =
"cudaMemcpyDefault";
break;
113 default:
errorQuda(
"Unsupported cudaMemcpyKind %d", kind);
117 case cudaMemcpyDeviceToHost: name =
"cudaMemcpyAsyncDeviceToHost";
break;
118 case cudaMemcpyHostToDevice: name =
"cudaMemcpyAsyncHostToDevice";
break;
119 case cudaMemcpyHostToHost: name =
"cudaMemcpyAsyncHostToHost";
break;
120 case cudaMemcpyDeviceToDevice: name =
"cudaMemcpyAsyncDeviceToDevice";
break;
121 case cudaMemcpyDefault: name =
"cudaMemcpyAsyncDefault";
break;
122 default:
errorQuda(
"Unsupported cudaMemcpyKind %d", kind);
134 inline QudaMem(
void *dst,
int value,
size_t count,
const cudaStream_t &
stream,
bool async,
const char *func,
135 const char *file,
const char *line) :
141 kind(cudaMemcpyDefault),
145 name = !async ?
"cudaMemset" :
"cudaMemsetAsync";
161 #ifdef USE_DRIVER_API
163 case cudaMemcpyDeviceToHost:
166 case cudaMemcpyHostToDevice:
169 case cudaMemcpyDeviceToDevice:
172 case cudaMemcpyDefault:
175 default:
errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
184 default:
errorQuda(
"Unsupported cudaMemcpyTypeAsync %d", kind);
187 PROFILE(cudaMemcpyAsync(dst, src, count, kind,
stream), type);
190 #ifdef USE_DRIVER_API
192 case cudaMemcpyDeviceToHost: cuMemcpyDtoH(dst, (CUdeviceptr)src, count);
break;
193 case cudaMemcpyHostToDevice: cuMemcpyHtoD((CUdeviceptr)dst, src, count);
break;
194 case cudaMemcpyHostToHost: memcpy(dst, src, count);
break;
195 case cudaMemcpyDeviceToDevice: cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count);
break;
196 case cudaMemcpyDefault: cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count);
break;
197 default:
errorQuda(
"Unsupported cudaMemcpyType %d", kind);
200 cudaMemcpy(dst, src, count, kind);
204 #ifdef USE_DRIVER_API
206 cuMemsetD32Async((CUdeviceptr)dst, value, count / 4,
stream);
208 cuMemsetD32((CUdeviceptr)dst, value, count / 4);
211 cudaMemsetAsync(dst, value, count,
stream);
213 cudaMemset(dst, value, count);
223 strcpy(vol,
"bytes=");
224 u64toa(vol + 6, (uint64_t)count);
228 long long flops()
const {
return 0; }
229 long long bytes()
const {
return kind == cudaMemcpyDeviceToDevice ? 2 * count : count; }
232 void qudaMemcpy_(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
const char *func,
const char *file,
235 if (count == 0)
return;
236 QudaMem copy(dst, src, count, kind, 0,
false, func, file, line);
237 cudaError_t error = cudaGetLastError();
238 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
242 const char *func,
const char *file,
const char *line)
244 if (count == 0)
return;
246 if (kind == cudaMemcpyDeviceToDevice) {
249 #ifdef USE_DRIVER_API
251 case cudaMemcpyDeviceToHost:
254 case cudaMemcpyHostToDevice:
257 case cudaMemcpyDeviceToDevice:
260 case cudaMemcpyDefault:
263 default:
errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
272 void qudaMemcpy2D_(
void *dst,
size_t dpitch,
const void *src,
size_t spitch,
size_t width,
size_t height,
273 cudaMemcpyKind kind,
const char *func,
const char *file,
const char *line)
275 #ifdef USE_DRIVER_API
277 param.srcPitch = spitch;
279 param.srcXInBytes = 0;
280 param.dstPitch = dpitch;
282 param.dstXInBytes = 0;
283 param.WidthInBytes = width;
284 param.Height = height;
287 case cudaMemcpyDeviceToHost:
288 param.srcDevice = (CUdeviceptr)src;
289 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
291 param.dstMemoryType = CU_MEMORYTYPE_HOST;
293 default:
errorQuda(
"Unsupported cuMemcpyType2DAsync %d", kind);
301 void qudaMemcpy2DAsync_(
void *dst,
size_t dpitch,
const void *src,
size_t spitch,
size_t width,
size_t height,
305 #ifdef USE_DRIVER_API
307 param.srcPitch = spitch;
309 param.srcXInBytes = 0;
310 param.dstPitch = dpitch;
312 param.dstXInBytes = 0;
313 param.WidthInBytes = width;
314 param.Height = height;
317 case cudaMemcpyDeviceToHost:
318 param.srcDevice = (CUdeviceptr)src;
319 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
321 param.dstMemoryType = CU_MEMORYTYPE_HOST;
323 default:
errorQuda(
"Unsupported cuMemcpyType2DAsync %d", kind);
331 void qudaMemset_(
void *ptr,
int value,
size_t count,
const char *func,
const char *file,
const char *line)
333 if (count == 0)
return;
334 QudaMem set(ptr, value, count, 0,
false, func, file, line);
335 cudaError_t error = cudaGetLastError();
337 errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
341 const char *file,
const char *line)
343 if (count == 0)
return;
345 cudaError_t error = cudaGetLastError();
346 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
349 void qudaMemset2D_(
void *ptr,
size_t pitch,
int value,
size_t width,
size_t height,
const char *func,
350 const char *file,
const char *line)
352 cudaError_t error = cudaMemset2D(ptr, pitch, value, width, height);
353 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
357 const char *func,
const char *file,
const char *line)
359 cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height,
stream);
360 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
364 const char *func,
const char *file,
const char *line)
370 dev_id = cudaCpuDeviceId;
374 cudaError_t error = cudaMemPrefetchAsync(ptr, count, dev_id,
stream);
375 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
378 bool qudaEventQuery_(cudaEvent_t &event,
const char *func,
const char *file,
const char *line)
380 #ifdef USE_DRIVER_API
383 case CUDA_SUCCESS:
return true;
384 case CUDA_ERROR_NOT_READY:
return false;
387 cuGetErrorName(error, &str);
388 errorQuda(
"cuEventQuery returned error %s\n (%s:%s in %s())", str, file, line, func);
394 case cudaSuccess:
return true;
395 case cudaErrorNotReady:
return false;
396 default:
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
404 #ifdef USE_DRIVER_API
406 if (error != CUDA_SUCCESS) {
408 cuGetErrorName(error, &str);
409 errorQuda(
"cuEventRecord returned error %s\n (%s:%s in %s())", str, file, line, func);
413 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
418 const char *file,
const char *line)
420 #ifdef USE_DRIVER_API
422 if (error != CUDA_SUCCESS) {
424 cuGetErrorName(error, &str);
425 errorQuda(
"cuStreamWaitEvent returned error %s\n (%s:%s in %s())", str, file, line, func);
429 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
435 #ifdef USE_DRIVER_API
437 if (error != CUDA_SUCCESS) {
439 cuGetErrorName(error, &str);
440 errorQuda(
"cuEventSynchronize returned error %s\n (%s:%s in %s())", str, file, line, func);
444 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
450 #ifdef USE_DRIVER_API
452 if (error != CUDA_SUCCESS) {
454 cuGetErrorName(error, &str);
455 errorQuda(
"(CUDA) cuStreamSynchronize returned error %s\n (%s:%s in %s())\n", str, file, line, func);
460 errorQuda(
"(CUDA) %s\n (%s:%s in %s())", cudaGetErrorString(error), file, line, func);
466 #ifdef USE_DRIVER_API
468 if (error != CUDA_SUCCESS) {
470 cuGetErrorName(error, &str);
471 errorQuda(
"cuCtxSynchronize returned error %s (%s:%s in %s())\n", str, file, line, func);
475 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
479 void qudaFuncSetAttribute_(
const void *kernel, cudaFuncAttribute attr,
int value,
const char *func,
const char *file,
484 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
492 if (error != cudaSuccess)
errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
QudaMem(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
void apply(const qudaStream_t &stream)
QudaMem(void *dst, int value, size_t count, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
bool advanceTuneParam(TuneParam ¶m) const
bool set_max_shared_bytes
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
size_t max_dynamic_shared_memory()
Returns the maximum dynamic shared memory per block.
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
bool qudaEventQuery_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventQuery or cuEventQuery with built-in error checking.
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line)
Wrapper around cudaMemset2D or driver API equivalent.
void printAPIProfile()
Print out the timer profile for CUDA API calls.
void qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize with built-in error checking.
void qudaFuncSetAttribute_(const void *kernel, cudaFuncAttribute attr, int value, const char *func, const char *file, const char *line)
Wrapper around cudaFuncSetAttribute with built-in error checking.
void qudaStreamSynchronize_(qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize with built-in error checking.
@ QUDA_PROFILE_MEMCPY_H2D_ASYNC
@ QUDA_PROFILE_EVENT_SYNCHRONIZE
@ QUDA_PROFILE_FUNC_SET_ATTRIBUTE
@ QUDA_PROFILE_MEMCPY_D2D_ASYNC
@ QUDA_PROFILE_DEVICE_SYNCHRONIZE
@ QUDA_PROFILE_STREAM_SYNCHRONIZE
@ QUDA_PROFILE_EVENT_QUERY
@ QUDA_PROFILE_STREAM_WAIT_EVENT
@ QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC
@ QUDA_PROFILE_MEMCPY_D2H_ASYNC
@ QUDA_PROFILE_LAUNCH_KERNEL
@ QUDA_PROFILE_EVENT_RECORD
@ QUDA_PROFILE_MEMCPY2D_D2H_ASYNC
bool activeTuning()
query if tuning is in progress
void qudaEventSynchronize_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventSynchronize or cuEventSynchronize with built-in error checking.
void qudaMemcpy2D_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemPrefetchAsync or driver API equivalent.
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trival between floating point types. When converting to an integer type,...
void u64toa(char *buffer, uint64_t value)
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy or driver API equivalent.
void qudaEventRecord_(cudaEvent_t &event, qudaStream_t stream, const char *func, const char *file, const char *line)
Wrapper around cudaEventRecord or cuEventRecord with built-in error checking.
void qudaStreamWaitEvent_(qudaStream_t stream, cudaEvent_t event, unsigned int flags, const char *func, const char *file, const char *line)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent with built-in error checking.
void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line)
Wrapper around cudaMemset or driver API equivalent.
void qudaFuncGetAttributes_(cudaFuncAttributes &attr, const void *kernel, const char *func, const char *file, const char *line)
Wrapper around cudaFuncGetAttributes with built-in error checking.
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent.
#define qudaFuncGetAttributes(attr, kernel)
#define qudaFuncSetAttribute(kernel, attr, value)
cudaStream_t qudaStream_t
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaVerbosity getVerbosity()