5 #include <cuda_runtime.h> 26 void qudaMemcpy_(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
27 const char *func,
const char *file,
const char *line);
31 #define STRINGIFY__(x) #x 32 #define __STRINGIFY__(x) STRINGIFY__(x) 33 #define qudaMemcpy(dst, src, count, kind) \ 34 ::quda::qudaMemcpy_(dst, src, count, kind, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 36 #define STRINGIFY__(x) #x 37 #define __STRINGIFY__(x) STRINGIFY__(x) 38 #define qudaMemcpyAsync(dst, src, count, kind, stream) \ 39 ::quda::qudaMemcpyAsync_(dst, src, count, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 41 #define STRINGIFY__(x) #x 42 #define __STRINGIFY__(x) STRINGIFY__(x) 43 #define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream) \ 44 ::quda::qudaMemcpy2DAsync_(dst, dpitch, src, spitch, width, height, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 58 const char *func,
const char *file,
const char *line);
73 size_t width,
size_t hieght, cudaMemcpyKind kind,
const cudaStream_t &stream,
74 const char *func,
const char *file,
const char *line);
85 cudaError_t
qudaLaunchKernel(
const void* func, dim3 gridDim, dim3 blockDim,
void** args,
size_t sharedMem, cudaStream_t stream);
126 #if CUDA_VERSION >= 9000 133 cudaError_t qudaFuncSetAttribute(
const void* func, cudaFuncAttribute attr,
int value);
143 #define STRINGIFY__(x) #x 144 #define __STRINGIFY__(x) STRINGIFY__(x) 145 #define qudaDeviceSynchronize() \ 146 ::quda::qudaDeviceSynchronize_(__func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent. Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
cudaError_t qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent. Potentially add auto-profiling support...
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly; rather, call the qudaMemcpy macro, which passes the calling function, file name, and line number.