13 #define PROFILE(f, idx) \ 14 apiTimer.TPSTART(idx); \ 18 #define PROFILE(f, idx) f; 24 static TimeProfile
apiTimer(
"CUDA API calls (driver)");
26 static TimeProfile
apiTimer(
"CUDA API calls (runtime)");
42 const char *
func,
const char *file,
const char *line)
46 case cudaMemcpyDeviceToHost:
47 name =
"cudaMemcpyDeviceToHost";
49 case cudaMemcpyHostToDevice:
50 name =
"cudaMemcpyHostToDevice";
52 case cudaMemcpyHostToHost:
53 name =
"cudaMemcpyHostToHost";
55 case cudaMemcpyDeviceToDevice:
56 name =
"cudaMemcpyDeviceToDevice";
58 case cudaMemcpyDefault:
59 name =
"cudaMemcpyDefault";
77 case cudaMemcpyDeviceToHost:
80 case cudaMemcpyHostToDevice:
83 case cudaMemcpyHostToHost:
86 case cudaMemcpyDeviceToDevice:
87 cuMemcpyDtoD((CUdeviceptr)
dst, (CUdeviceptr)
src,
count);
89 case cudaMemcpyDefault:
90 cuMemcpy((CUdeviceptr)
dst, (CUdeviceptr)
src,
count);
108 long long flops()
const {
return 0; }
114 const char *
func,
const char *file,
const char *line) {
116 printfQuda(
"%s bytes = %llu\n", __func__, (
long long unsigned int)
count);
118 if (
count == 0)
return;
129 const char *
func,
const char *file,
const char *line)
131 #ifdef USE_DRIVER_API 133 case cudaMemcpyDeviceToHost:
136 case cudaMemcpyHostToDevice:
139 case cudaMemcpyDeviceToDevice:
153 const char *
func,
const char *file,
const char *line)
155 #ifdef USE_DRIVER_API 159 param.srcXInBytes = 0;
162 param.dstXInBytes = 0;
167 case cudaMemcpyDeviceToHost:
169 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
171 param.dstMemoryType = CU_MEMORYTYPE_HOST;
192 #ifdef USE_DRIVER_API 197 case CUDA_ERROR_NOT_READY:
198 return cudaErrorNotReady;
200 errorQuda(
"cuEventQuery return error code %d", error);
211 #ifdef USE_DRIVER_API 217 errorQuda(
"cuEventRecord return error code %d", error);
228 #ifdef USE_DRIVER_API 234 errorQuda(
"cuStreamWaitEvent return error code %d", error);
245 #ifdef USE_DRIVER_API 251 errorQuda(
"cuStreamSynchronize return error code %d", error);
262 #ifdef USE_DRIVER_API 268 errorQuda(
"cuEventSynchronize return error code %d", error);
279 #ifdef USE_DRIVER_API 285 errorQuda(
"cuCtxSynchronize return error code %d", error);
294 #if (CUDA_VERSION >= 9000) 295 cudaError_t qudaFuncSetAttribute(
const void*
func, cudaFuncAttribute attr,
int value)
size_t const void size_t spitch
size_t const void size_t size_t width
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or the driver API equivalent. Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
QudaVerbosity getVerbosity()
char * strcpy(char *__dst, const char *__src)
__host__ __device__ void copy(T1 &a, const T2 &b)
static TimeProfile apiTimer("CUDA API calls (driver)")
char * strcat(char *__s1, const char *__s2)
const cudaMemcpyKind kind
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
unsigned long long uint64_t
dim3 dim3 void size_t sharedMem
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuCtxSynchronize.
void * memcpy(void *__dst, const void *__src, size_t __n)
bool activeTuning()
query if tuning is in progress
QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
bool advanceTuneParam(TuneParam &param) const
const void size_t enum cudaMemcpyKind kind
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or the driver API equivalent. Potentially add auto-profiling support...
unsigned int sharedBytesPerBlock(const TuneParam &param) const
unsigned int sharedBytesPerThread() const
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
const void int size_t unsigned int flags
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void u64toa(char *buffer, uint64_t value)
size_t const void size_t size_t size_t height
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
void apply(const cudaStream_t &stream)
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly, rather call macro below whic...