QUDA  0.9.0
quda_cuda_api.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cuda.h>
4 #include <cuda_runtime.h>
5 #include <quda_cuda_api.h>
6 
14 namespace quda {
15 
// Wrapper around cudaMemcpy used for auto-profiling.
// Not meant to be called directly: use the qudaMemcpy macro defined further
// down in this header, which supplies the trailing func/file/line arguments
// (calling function name, source file name, and the line number as a string).
//   dst, src, count, kind - same roles as the cudaMemcpy arguments
//                           (presumably forwarded unchanged; confirm in the
//                           implementation, which is not part of this header)
//   func, file, line      - call-site location strings
25  void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
26  const char *func, const char *file, const char *line);
27 
28 }
29 
// Two-level stringification: __STRINGIFY__(x) expands x first and then
// STRINGIFY__ turns the result into a string literal, so
// __STRINGIFY__(__LINE__) yields the line number as text, not "__LINE__".
//
// qudaMemcpy(dst, src, count, kind) forwards its four arguments to
// ::quda::qudaMemcpy_ and appends the call site's __func__,
// quda::file_name(__FILE__) and the stringified __LINE__.
// NOTE(review): the expansion already ends in ';', so `qudaMemcpy(...);`
// produces an extra empty statement and the macro cannot be used as the
// body of an unbraced if/else.
// NOTE(review): quda::file_name is not declared in the visible part of this
// header; presumably another header provides it; confirm.
30 #define STRINGIFY__(x) #x
31 #define __STRINGIFY__(x) STRINGIFY__(x)
32 #define qudaMemcpy(dst, src, count, kind) \
33  ::quda::qudaMemcpy_(dst, src, count, kind, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__));
34 
// The two stringify helpers repeat, token for token, the definitions that
// precede the qudaMemcpy macro earlier in this header.
//
// qudaMemcpyAsync(dst, src, count, kind, stream) forwards its five arguments
// to ::quda::qudaMemcpyAsync_ (declared further down in this header) and
// appends the call site's __func__, quda::file_name(__FILE__) and the
// stringified __LINE__.
// NOTE(review): the expansion already ends in ';'.
35 #define STRINGIFY__(x) #x
36 #define __STRINGIFY__(x) STRINGIFY__(x)
37 #define qudaMemcpyAsync(dst, src, count, kind, stream) \
38  ::quda::qudaMemcpyAsync_(dst, src, count, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__));
39 
// The two stringify helpers repeat, token for token, the definitions that
// precede the qudaMemcpy macro earlier in this header.
//
// qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream)
// forwards its eight arguments to ::quda::qudaMemcpy2DAsync_ (declared
// further down in this header) and appends the call site's __func__,
// quda::file_name(__FILE__) and the stringified __LINE__.
// NOTE(review): the expansion already ends in ';'.
40 #define STRINGIFY__(x) #x
41 #define __STRINGIFY__(x) STRINGIFY__(x)
42 #define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream) \
43  ::quda::qudaMemcpy2DAsync_(dst, dpitch, src, spitch, width, height, kind, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__));
44 
45 namespace quda {
46 
// Wrapper around cudaMemcpyAsync or the driver API equivalent.
// Normally reached through the qudaMemcpyAsync macro, which fills in the
// trailing func/file/line call-site strings.
// The stream is taken by const reference.
56  void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream,
57  const char *func, const char *file, const char *line);
58 
// Wrapper around cudaMemcpy2DAsync or the driver API equivalent.
// Normally reached through the qudaMemcpy2DAsync macro, which fills in the
// trailing func/file/line call-site strings.
// NOTE(review): the parameter name `hieght` is a misspelling of `height`;
// it is left unchanged here.
71  void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch,
72  size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream,
73  const char *func, const char *file, const char *line);
74 
// Wrapper around cudaLaunchKernel. Returns a cudaError_t status.
84  cudaError_t qudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream);
85 
// Wrapper around cudaEventQuery or cuEventQuery. The event is passed by
// non-const reference.
91  cudaError_t qudaEventQuery(cudaEvent_t &event);
92 
// Wrapper around cudaEventRecord or cuEventRecord. `stream` defaults to 0.
98  cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0);
99 
// Judging by its name and signature, presumably a wrapper around
// cudaStreamWaitEvent (or the driver API equivalent); confirm against the
// implementation. Unlike the neighbouring wrappers, stream and event are
// passed by value here.
106  cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
107 
// Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
112  cudaError_t qudaStreamSynchronize(cudaStream_t &stream);
113 
// Wrapper around cudaEventSynchronize or cuEventSynchronize.
118  cudaError_t qudaEventSynchronize(cudaEvent_t &event);
119 
// Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
123  cudaError_t qudaDeviceSynchronize();
124 
// The declaration below is only visible when CUDA_VERSION is at least 9000.
125 #if (CUDA_VERSION >= 9000)
126 
// Presumably a wrapper around cudaFuncSetAttribute, judging by the name and
// the cudaFuncAttribute parameter; confirm against the implementation.
132  cudaError_t qudaFuncSetAttribute(const void* func, cudaFuncAttribute attr, int value);
133 #endif
134 
// Print out the timer profile for CUDA API calls.
138  void printAPIProfile();
139 
140 } // namespace quda
size_t const void size_t spitch
cudaEvent_t event
dim3 dim3 blockDim
size_t const void size_t size_t width
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent. Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
const void * func
const void * src
cudaStream_t * stream
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
dim3 dim3 void size_t sharedMem
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
dim3 dim3 void ** args
const void size_t enum cudaMemcpyKind kind
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent. Potentially add auto-profiling support...
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
const void int size_t unsigned int flags
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly, rather call macro below whic...