QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
quda_cuda_api.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <uint_to_char.h>
3 #include <quda_internal.h>
4 
5 // if this macro is defined then we use the driver API, else use the
6 // runtime API. Typically the driver API has 10-20% less overhead
7 #define USE_DRIVER_API
8 
9 // if this macro is defined then we profile the CUDA API calls
10 //#define API_PROFILE
11 
12 #ifdef API_PROFILE
13 #define PROFILE(f, idx) \
14  apiTimer.TPSTART(idx); \
15  f; \
16  apiTimer.TPSTOP(idx);
17 #else
18 #define PROFILE(f, idx) f;
19 #endif
20 
21 namespace quda {
22 
23 #ifdef USE_DRIVER_API
24  static TimeProfile apiTimer("CUDA API calls (driver)");
25 #else
26  static TimeProfile apiTimer("CUDA API calls (runtime)");
27 #endif
28 
29  class QudaMemCopy : public Tunable {
30 
31  void *dst;
32  const void *src;
33  const size_t count;
34  const cudaMemcpyKind kind;
35  const bool async;
36  const char *name;
37 
38  unsigned int sharedBytesPerThread() const { return 0; }
39  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
40 
41  public:
42  inline QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
43  bool async, const char *func, const char *file, const char *line)
44  : dst(dst), src(src), count(count), kind(kind), async(async) {
45 
46  if (!async) {
47  switch (kind) {
48  case cudaMemcpyDeviceToHost: name = "cudaMemcpyDeviceToHost"; break;
49  case cudaMemcpyHostToDevice: name = "cudaMemcpyHostToDevice"; break;
50  case cudaMemcpyHostToHost: name = "cudaMemcpyHostToHost"; break;
51  case cudaMemcpyDeviceToDevice: name = "cudaMemcpyDeviceToDevice"; break;
52  case cudaMemcpyDefault: name = "cudaMemcpyDefault"; break;
53  default: errorQuda("Unsupported cudaMemcpyType %d", kind);
54  }
55  } else {
56  switch(kind) {
57  case cudaMemcpyDeviceToHost: name = "cudaMemcpyAsyncDeviceToHost"; break;
58  case cudaMemcpyHostToDevice: name = "cudaMemcpyAsyncHostToDevice"; break;
59  case cudaMemcpyHostToHost: name = "cudaMemcpyAsyncHostToHost"; break;
60  case cudaMemcpyDeviceToDevice: name = "cudaMemcpyAsyncDeviceToDevice"; break;
61  case cudaMemcpyDefault: name = "cudaMemcpyAsyncDefault"; break;
62  default: errorQuda("Unsupported cudaMemcpyType %d", kind);
63  }
64  }
65  strcpy(aux, func);
66  strcat(aux, ",");
67  strcat(aux, file);
68  strcat(aux, ",");
69  strcat(aux, line);
70  }
71 
72  virtual ~QudaMemCopy() { }
73 
74  inline void apply(const cudaStream_t &stream) {
75  tuneLaunch(*this, getTuning(), getVerbosity());
76  if (async) {
77 #ifdef USE_DRIVER_API
78  switch (kind) {
79  case cudaMemcpyDeviceToHost:
80  PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
81  break;
82  case cudaMemcpyHostToDevice:
83  PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
84  break;
85  case cudaMemcpyDeviceToDevice:
86  PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
87  break;
88  default:
89  errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
90  }
91 #else
92  PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
93  kind == cudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC);
94 #endif
95  } else {
96 #ifdef USE_DRIVER_API
97  switch(kind) {
98  case cudaMemcpyDeviceToHost: cuMemcpyDtoH(dst, (CUdeviceptr)src, count); break;
99  case cudaMemcpyHostToDevice: cuMemcpyHtoD((CUdeviceptr)dst, src, count); break;
100  case cudaMemcpyHostToHost: memcpy(dst, src, count); break;
101  case cudaMemcpyDeviceToDevice: cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
102  case cudaMemcpyDefault: cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
103  default:
104  errorQuda("Unsupported cudaMemcpyType %d", kind);
105  }
106 #else
107  cudaMemcpy(dst, src, count, kind);
108 #endif
109  }
110  }
111 
112  bool advanceTuneParam(TuneParam &param) const { return false; }
113 
114  TuneKey tuneKey() const {
115  char vol[128];
116  strcpy(vol,"bytes=");
117  u64toa(vol+6, (uint64_t)count);
118  return TuneKey(vol, name, aux);
119  }
120 
121  long long flops() const { return 0; }
122  long long bytes() const { return kind == cudaMemcpyDeviceToDevice ? 2*count : count; }
123 
124  };
125 
126  void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
127  const char *func, const char *file, const char *line) {
128  if (count == 0) return;
129 #if 1
130  QudaMemCopy copy(dst, src, count, kind, false, func, file, line);
131  copy.apply(0);
132 #else
133  cudaMemcpy(dst, src, count, kind);
134 #endif
135  cudaError_t error = cudaGetLastError();
136  if (error != cudaSuccess)
137  errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
138  }
139 
140  void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream,
141  const char *func, const char *file, const char *line)
142  {
143  if (count == 0) return;
144 
145  if (kind == cudaMemcpyDeviceToDevice) {
146  QudaMemCopy copy(dst, src, count, kind, true, func, file, line);
147  copy.apply(stream);
148  } else {
149 #ifdef USE_DRIVER_API
150  switch (kind) {
151  case cudaMemcpyDeviceToHost:
152  PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
153  break;
154  case cudaMemcpyHostToDevice:
155  PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
156  break;
157  case cudaMemcpyDeviceToDevice:
158  PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
159  break;
160  default:
161  errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
162  }
163 #else
164  PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
165  kind == cudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC);
166 #endif
167  }
168  }
169 
170  void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch,
171  size_t width, size_t height, cudaMemcpyKind kind, const cudaStream_t &stream,
172  const char *func, const char *file, const char *line)
173  {
174 #ifdef USE_DRIVER_API
175  CUDA_MEMCPY2D param;
176  param.srcPitch = spitch;
177  param.srcY = 0;
178  param.srcXInBytes = 0;
179  param.dstPitch = dpitch;
180  param.dstY = 0;
181  param.dstXInBytes = 0;
182  param.WidthInBytes = width;
183  param.Height = height;
184 
185  switch (kind) {
186  case cudaMemcpyDeviceToHost:
187  param.srcDevice = (CUdeviceptr)src;
188  param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
189  param.dstHost = dst;
190  param.dstMemoryType = CU_MEMORYTYPE_HOST;
191  break;
192  default:
193  errorQuda("Unsupported cuMemcpyType2DAsync %d", kind);
194  }
195  PROFILE(cuMemcpy2DAsync(&param, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
196 #else
197  PROFILE(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
198 #endif
199  }
200 
201  cudaError_t qudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
202  {
203  // no driver API variant here since we have C++ functions
204  PROFILE(cudaError_t error = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream), QUDA_PROFILE_LAUNCH_KERNEL);
205  if (error != cudaSuccess && !activeTuning()) errorQuda("(CUDA) %s", cudaGetErrorString(error));
206  return error;
207  }
208 
209  cudaError_t qudaEventQuery(cudaEvent_t &event)
210  {
211 #ifdef USE_DRIVER_API
212  PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
213  switch (error) {
214  case CUDA_SUCCESS:
215  return cudaSuccess;
216  case CUDA_ERROR_NOT_READY: // this is the only return value care about
217  return cudaErrorNotReady;
218  default:
219  const char *str;
220  cuGetErrorName(error, &str);
221  errorQuda("cuEventQuery returned error %s", str);
222  }
223  return cudaErrorUnknown;
224 #else
225  PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
226  return error;
227 #endif
228  }
229 
230  cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream)
231  {
232 #ifdef USE_DRIVER_API
233  PROFILE(CUresult error = cuEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
234  switch (error) {
235  case CUDA_SUCCESS:
236  return cudaSuccess;
237  default: // should always return successful
238  const char *str;
239  cuGetErrorName(error, &str);
240  errorQuda("cuEventrecord returned error %s", str);
241  }
242  return cudaErrorUnknown;
243 #else
244  PROFILE(cudaError_t error = cudaEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
245  return error;
246 #endif
247  }
248 
249  cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
250  {
251 #ifdef USE_DRIVER_API
252  PROFILE(CUresult error = cuStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
253  switch (error) {
254  case CUDA_SUCCESS:
255  return cudaSuccess;
256  default: // should always return successful
257  const char *str;
258  cuGetErrorName(error, &str);
259  errorQuda("cuStreamWaitEvent returned error %s", str);
260  }
261  return cudaErrorUnknown;
262 #else
263  PROFILE(cudaError_t error = cudaStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
264  return error;
265 #endif
266  }
267 
268  cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
269  {
270 #ifdef USE_DRIVER_API
271  PROFILE(CUresult error = cuStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
272  switch (error) {
273  case CUDA_SUCCESS:
274  return cudaSuccess;
275  default: // should always return successful
276  const char *str;
277  cuGetErrorName(error, &str);
278  errorQuda("cuStreamSynchronize returned error %s", str);
279  }
280  return cudaErrorUnknown;
281 #else
282  PROFILE(cudaError_t error = cudaStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
283  return error;
284 #endif
285  }
286 
287  cudaError_t qudaEventSynchronize(cudaEvent_t &event)
288  {
289 #ifdef USE_DRIVER_API
290  PROFILE(CUresult error = cuEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
291  switch (error) {
292  case CUDA_SUCCESS:
293  return cudaSuccess;
294  default: // should always return successful
295  const char *str;
296  cuGetErrorName(error, &str);
297  errorQuda("cuEventSynchronize returned error %s", str);
298  }
299  return cudaErrorUnknown;
300 #else
301  PROFILE(cudaError_t error = cudaEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
302  return error;
303 #endif
304  }
305 
306  cudaError_t qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
307  {
308 #ifdef USE_DRIVER_API
309  PROFILE(CUresult error = cuCtxSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
310  switch (error) {
311  case CUDA_SUCCESS:
312  return cudaSuccess;
313  default: // should always return successful
314  const char *str;
315  cuGetErrorName(error, &str);
316  errorQuda("cuCtxSynchronize returned error %s (%s:%s in %s())\n", str, file, line, func);
317  }
318  return cudaErrorUnknown;
319 #else
320  PROFILE(cudaError_t error = cudaDeviceSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
321  if (error != cudaSuccess)
322  errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
323  return error;
324 #endif
325  }
326 
327 #if (CUDA_VERSION >= 9000)
328  cudaError_t qudaFuncSetAttribute(const void* func, cudaFuncAttribute attr, int value)
329  {
330  // no driver API variant here since we have C++ functions
331  PROFILE(cudaError_t error = cudaFuncSetAttribute(func, attr, value), QUDA_PROFILE_FUNC_SET_ATTRIBUTE);
332  return error;
333  }
334 #endif
335 
337 #ifdef API_PROFILE
338  apiTimer.Print();
339 #endif
340  }
341 
342 } // namespace quda
const size_t count
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
long long bytes() const
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaEventRecord or cuEventRecord.
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
#define PROFILE(f, idx)
TuneKey tuneKey() const
__host__ __device__ void copy(T1 &a, const T2 &b)
static TimeProfile apiTimer("CUDA API calls (driver)")
const cudaMemcpyKind kind
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
cudaError_t qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
virtual ~QudaMemCopy()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
const void * src
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:121
bool advanceTuneParam(TuneParam &param) const
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent Potentially add auto-profiling support...
unsigned int sharedBytesPerBlock(const TuneParam &param) const
const char * name
unsigned int sharedBytesPerThread() const
QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind, bool async, const char *func, const char *file, const char *line)
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
long long flops() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
bool flags
void u64toa(char *buffer, uint64_t value)
Definition: uint_to_char.h:127
void apply(const cudaStream_t &stream)
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
char aux[TuneKey::aux_n]
Definition: tune_quda.h:265
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly, rather call macro below whic...