13 #define PROFILE(f, idx) \ 14 apiTimer.TPSTART(idx); \ 18 #define PROFILE(f, idx) f; 24 static TimeProfile
apiTimer(
"CUDA API calls (driver)");
26 static TimeProfile
apiTimer(
"CUDA API calls (runtime)");
42 inline QudaMemCopy(
void *dst,
const void *src,
size_t count, cudaMemcpyKind kind,
43 bool async,
const char *func,
const char *file,
const char *line)
44 : dst(dst), src(src), count(count), kind(kind), async(async) {
48 case cudaMemcpyDeviceToHost: name =
"cudaMemcpyDeviceToHost";
break;
49 case cudaMemcpyHostToDevice: name =
"cudaMemcpyHostToDevice";
break;
50 case cudaMemcpyHostToHost: name =
"cudaMemcpyHostToHost";
break;
51 case cudaMemcpyDeviceToDevice: name =
"cudaMemcpyDeviceToDevice";
break;
52 case cudaMemcpyDefault: name =
"cudaMemcpyDefault";
break;
53 default:
errorQuda(
"Unsupported cudaMemcpyType %d", kind);
57 case cudaMemcpyDeviceToHost: name =
"cudaMemcpyAsyncDeviceToHost";
break;
58 case cudaMemcpyHostToDevice: name =
"cudaMemcpyAsyncHostToDevice";
break;
59 case cudaMemcpyHostToHost: name =
"cudaMemcpyAsyncHostToHost";
break;
60 case cudaMemcpyDeviceToDevice: name =
"cudaMemcpyAsyncDeviceToDevice";
break;
61 case cudaMemcpyDefault: name =
"cudaMemcpyAsyncDefault";
break;
62 default:
errorQuda(
"Unsupported cudaMemcpyType %d", kind);
79 case cudaMemcpyDeviceToHost:
82 case cudaMemcpyHostToDevice:
85 case cudaMemcpyDeviceToDevice:
89 errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
92 PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
98 case cudaMemcpyDeviceToHost: cuMemcpyDtoH(dst, (CUdeviceptr)src, count);
break;
99 case cudaMemcpyHostToDevice: cuMemcpyHtoD((CUdeviceptr)dst, src, count);
break;
100 case cudaMemcpyHostToHost: memcpy(dst, src, count);
break;
101 case cudaMemcpyDeviceToDevice: cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count);
break;
102 case cudaMemcpyDefault: cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count);
break;
104 errorQuda(
"Unsupported cudaMemcpyType %d", kind);
107 cudaMemcpy(dst, src, count, kind);
116 strcpy(vol,
"bytes=");
117 u64toa(vol+6, (uint64_t)count);
121 long long flops()
const {
return 0; }
122 long long bytes()
const {
return kind == cudaMemcpyDeviceToDevice ? 2*
count :
count; }
127 const char *func,
const char *file,
const char *line) {
128 if (count == 0)
return;
133 cudaMemcpy(dst, src, count, kind);
135 cudaError_t error = cudaGetLastError();
136 if (error != cudaSuccess)
137 errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
141 const char *func,
const char *file,
const char *line)
143 if (count == 0)
return;
145 if (kind == cudaMemcpyDeviceToDevice) {
149 #ifdef USE_DRIVER_API 151 case cudaMemcpyDeviceToHost:
154 case cudaMemcpyHostToDevice:
157 case cudaMemcpyDeviceToDevice:
161 errorQuda(
"Unsupported cuMemcpyTypeAsync %d", kind);
164 PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
171 size_t width,
size_t height, cudaMemcpyKind
kind,
const cudaStream_t &
stream,
172 const char *func,
const char *file,
const char *line)
174 #ifdef USE_DRIVER_API 176 param.srcPitch = spitch;
178 param.srcXInBytes = 0;
179 param.dstPitch = dpitch;
181 param.dstXInBytes = 0;
182 param.WidthInBytes = width;
183 param.Height = height;
186 case cudaMemcpyDeviceToHost:
187 param.srcDevice = (CUdeviceptr)src;
188 param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
190 param.dstMemoryType = CU_MEMORYTYPE_HOST;
193 errorQuda(
"Unsupported cuMemcpyType2DAsync %d", kind);
201 cudaError_t
qudaLaunchKernel(
const void* func, dim3 gridDim, dim3 blockDim,
void** args,
size_t sharedMem, cudaStream_t
stream)
211 #ifdef USE_DRIVER_API 216 case CUDA_ERROR_NOT_READY:
217 return cudaErrorNotReady;
220 cuGetErrorName(error, &str);
221 errorQuda(
"cuEventQuery returned error %s", str);
223 return cudaErrorUnknown;
232 #ifdef USE_DRIVER_API 239 cuGetErrorName(error, &str);
240 errorQuda(
"cuEventrecord returned error %s", str);
242 return cudaErrorUnknown;
251 #ifdef USE_DRIVER_API 258 cuGetErrorName(error, &str);
259 errorQuda(
"cuStreamWaitEvent returned error %s", str);
261 return cudaErrorUnknown;
270 #ifdef USE_DRIVER_API 277 cuGetErrorName(error, &str);
278 errorQuda(
"cuStreamSynchronize returned error %s", str);
280 return cudaErrorUnknown;
289 #ifdef USE_DRIVER_API 296 cuGetErrorName(error, &str);
297 errorQuda(
"cuEventSynchronize returned error %s", str);
299 return cudaErrorUnknown;
308 #ifdef USE_DRIVER_API 315 cuGetErrorName(error, &str);
316 errorQuda(
"cuCtxSynchronize returned error %s (%s:%s in %s())\n", str, file, line, func);
318 return cudaErrorUnknown;
321 if (error != cudaSuccess)
322 errorQuda(
"(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
327 #if (CUDA_VERSION >= 9000) 328 cudaError_t qudaFuncSetAttribute(
const void* func, cudaFuncAttribute attr,
int value)
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaEventRecord or cuEventRecord.
QudaVerbosity getVerbosity()
__host__ __device__ void copy(T1 &a, const T2 &b)
static TimeProfile apiTimer("CUDA API calls (driver)")
const cudaMemcpyKind kind
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
cudaError_t qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
bool activeTuning()
query if tuning is in progress
bool advanceTuneParam(TuneParam ¶m) const
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent Potentially add auto-profiling support...
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
unsigned int sharedBytesPerThread() const
QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind, bool async, const char *func, const char *file, const char *line)
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void u64toa(char *buffer, uint64_t value)
void apply(const cudaStream_t &stream)
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly, rather call macro below whic...