4 #include <cuda_runtime.h> 26 const char *
func,
const char *file,
const char *line);
30 #define STRINGIFY__(x) #x 31 #define __STRINGIFY__(x) STRINGIFY__(x) 32 #define qudaMemcpy(dst, src, count, kind) \ 33 ::quda::qudaMemcpy_(dst, src, count, kind, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 35 #define STRINGIFY__(x) #x 36 #define __STRINGIFY__(x) STRINGIFY__(x) 37 #define qudaMemcpyAsync(dst, src, count, kind, stream) \ 38 ::quda::qudaMemcpyAsync_(dst, src, count, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 40 #define STRINGIFY__(x) #x 41 #define __STRINGIFY__(x) STRINGIFY__(x) 42 #define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream) \ 43 ::quda::qudaMemcpy2DAsync_(dst, dpitch, src, spitch, width, height, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)); 57 const char *
func,
const char *file,
const char *line);
72 size_t width,
size_t hieght, cudaMemcpyKind
kind,
const cudaStream_t &
stream,
73 const char *
func,
const char *file,
const char *line);
125 #if (CUDA_VERSION >= 9000) 132 cudaError_t qudaFuncSetAttribute(
const void*
func, cudaFuncAttribute attr,
int value);
size_t const void size_t spitch
size_t const void size_t size_t width
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent. Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
dim3 dim3 void size_t sharedMem
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
const void size_t enum cudaMemcpyKind kind
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent. Potentially add auto-profiling support...
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
const void int size_t unsigned int flags
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly; rather call the qudaMemcpy macro, which passes the calling function name, file name, and line number.