QUDA  0.9.0
quda_cuda_api.cpp
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <uint_to_char.h>
3 #include <quda_internal.h>
4 
5 // if this macro is defined then we use the driver API, else use the
6 // runtime API. Typically the driver API has 10-20% less overhead
7 #define USE_DRIVER_API
8 
9 // if this macro is defined then we profile the CUDA API calls
10 //#define API_PROFILE
11 
12 #ifdef API_PROFILE
13 #define PROFILE(f, idx) \
14  apiTimer.TPSTART(idx); \
15  f; \
16  apiTimer.TPSTOP(idx);
17 #else
18 #define PROFILE(f, idx) f;
19 #endif
20 
21 namespace quda {
22 
23 #ifdef USE_DRIVER_API
24  static TimeProfile apiTimer("CUDA API calls (driver)");
25 #else
26  static TimeProfile apiTimer("CUDA API calls (runtime)");
27 #endif
28 
29  class QudaMemCopy : public Tunable {
30 
31  void *dst;
32  const void *src;
33  const size_t count;
34  const cudaMemcpyKind kind;
35  const char *name;
36 
37  unsigned int sharedBytesPerThread() const { return 0; }
38  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
39 
40  public:
41  inline QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
42  const char *func, const char *file, const char *line)
43  : dst(dst), src(src), count(count), kind(kind) {
44 
45  switch(kind) {
46  case cudaMemcpyDeviceToHost:
47  name = "cudaMemcpyDeviceToHost";
48  break;
49  case cudaMemcpyHostToDevice:
50  name = "cudaMemcpyHostToDevice";
51  break;
52  case cudaMemcpyHostToHost:
53  name = "cudaMemcpyHostToHost";
54  break;
55  case cudaMemcpyDeviceToDevice:
56  name = "cudaMemcpyDeviceToDevice";
57  break;
58  case cudaMemcpyDefault:
59  name = "cudaMemcpyDefault";
60  break;
61  default:
62  errorQuda("Unsupported cudaMemcpyType %d", kind);
63  }
64  strcpy(aux, func);
65  strcat(aux, ",");
66  strcat(aux, file);
67  strcat(aux, ",");
68  strcat(aux, line);
69  }
70 
71  virtual ~QudaMemCopy() { }
72 
73  inline void apply(const cudaStream_t &stream) {
74  tuneLaunch(*this, getTuning(), getVerbosity());
75 #ifdef USE_DRIVER_API
76  switch(kind) {
77  case cudaMemcpyDeviceToHost:
78  cuMemcpyDtoH(dst, (CUdeviceptr)src, count);
79  break;
80  case cudaMemcpyHostToDevice:
81  cuMemcpyHtoD((CUdeviceptr)dst, src, count);
82  break;
83  case cudaMemcpyHostToHost:
84  memcpy(dst, src, count);
85  break;
86  case cudaMemcpyDeviceToDevice:
87  cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count);
88  break;
89  case cudaMemcpyDefault:
90  cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count);
91  default:
92  errorQuda("Unsupported cudaMemcpyType %d", kind);
93  }
94 #else
95  cudaMemcpy(dst, src, count, kind);
96 #endif
97  }
98 
99  bool advanceTuneParam(TuneParam &param) const { return false; }
100 
101  TuneKey tuneKey() const {
102  char vol[128];
103  strcpy(vol,"bytes=");
104  u64toa(vol+6, (uint64_t)count);
105  return TuneKey(vol, name, aux);
106  }
107 
108  long long flops() const { return 0; }
109  long long bytes() const { return kind == cudaMemcpyDeviceToDevice ? 2*count : count; }
110 
111  };
112 
113  void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
114  const char *func, const char *file, const char *line) {
116  printfQuda("%s bytes = %llu\n", __func__, (long long unsigned int)count);
117 
118  if (count == 0) return;
119 #if 1
120  QudaMemCopy copy(dst, src, count, kind, func, file, line);
121  copy.apply(0);
122 #else
123  cudaMemcpy(dst, src, count, kind);
124 #endif
125  checkCudaError();
126  }
127 
128  void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream,
129  const char *func, const char *file, const char *line)
130  {
131 #ifdef USE_DRIVER_API
132  switch (kind) {
133  case cudaMemcpyDeviceToHost:
134  PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
135  break;
136  case cudaMemcpyHostToDevice:
137  PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
138  break;
139  case cudaMemcpyDeviceToDevice:
140  PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
141  break;
142  default:
143  errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
144  }
145 #else
146  PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
148 #endif
149  }
150 
151  void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch,
152  size_t width, size_t height, cudaMemcpyKind kind, const cudaStream_t &stream,
153  const char *func, const char *file, const char *line)
154  {
155 #ifdef USE_DRIVER_API
156  CUDA_MEMCPY2D param;
157  param.srcPitch = spitch;
158  param.srcY = 0;
159  param.srcXInBytes = 0;
160  param.dstPitch = dpitch;
161  param.dstY = 0;
162  param.dstXInBytes = 0;
163  param.WidthInBytes = width;
164  param.Height = height;
165 
166  switch (kind) {
167  case cudaMemcpyDeviceToHost:
168  param.srcDevice = (CUdeviceptr)src;
169  param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
170  param.dstHost = dst;
171  param.dstMemoryType = CU_MEMORYTYPE_HOST;
172  break;
173  default:
174  errorQuda("Unsupported cuMemcpyType2DAsync %d", kind);
175  }
177 #else
179 #endif
180  }
181 
182  cudaError_t qudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
183  {
184  // no driver API variant here since we have C++ functions
185  PROFILE(cudaError_t error = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream), QUDA_PROFILE_LAUNCH_KERNEL);
186  if (error != cudaSuccess && !activeTuning()) errorQuda("(CUDA) %s", cudaGetErrorString(error));
187  return error;
188  }
189 
190  cudaError_t qudaEventQuery(cudaEvent_t &event)
191  {
192 #ifdef USE_DRIVER_API
193  PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
194  switch (error) {
195  case CUDA_SUCCESS:
196  return cudaSuccess;
197  case CUDA_ERROR_NOT_READY: // this is the only return value care about
198  return cudaErrorNotReady;
199  default:
200  errorQuda("cuEventQuery return error code %d", error);
201  }
202  return cudaErrorUnknown;
203 #else
204  PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
205  return error;
206 #endif
207  }
208 
209  cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream)
210  {
211 #ifdef USE_DRIVER_API
212  PROFILE(CUresult error = cuEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
213  switch (error) {
214  case CUDA_SUCCESS:
215  return cudaSuccess;
216  default: // should always return successful
217  errorQuda("cuEventRecord return error code %d", error);
218  }
219  return cudaErrorUnknown;
220 #else
221  PROFILE(cudaError_t error = cudaEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
222  return error;
223 #endif
224  }
225 
226  cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
227  {
228 #ifdef USE_DRIVER_API
229  PROFILE(CUresult error = cuStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
230  switch (error) {
231  case CUDA_SUCCESS:
232  return cudaSuccess;
233  default: // should always return successful
234  errorQuda("cuStreamWaitEvent return error code %d", error);
235  }
236  return cudaErrorUnknown;
237 #else
238  PROFILE(cudaError_t error = cudaStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
239  return error;
240 #endif
241  }
242 
243  cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
244  {
245 #ifdef USE_DRIVER_API
246  PROFILE(CUresult error = cuStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
247  switch (error) {
248  case CUDA_SUCCESS:
249  return cudaSuccess;
250  default: // should always return successful
251  errorQuda("cuStreamSynchronize return error code %d", error);
252  }
253  return cudaErrorUnknown;
254 #else
255  PROFILE(cudaError_t error = cudaStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
256  return error;
257 #endif
258  }
259 
260  cudaError_t qudaEventSynchronize(cudaEvent_t &event)
261  {
262 #ifdef USE_DRIVER_API
263  PROFILE(CUresult error = cuEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
264  switch (error) {
265  case CUDA_SUCCESS:
266  return cudaSuccess;
267  default: // should always return successful
268  errorQuda("cuEventSynchronize return error code %d", error);
269  }
270  return cudaErrorUnknown;
271 #else
272  PROFILE(cudaError_t error = cudaEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
273  return error;
274 #endif
275  }
276 
277  cudaError_t qudaDeviceSynchronize()
278  {
279 #ifdef USE_DRIVER_API
280  PROFILE(CUresult error = cuCtxSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
281  switch (error) {
282  case CUDA_SUCCESS:
283  return cudaSuccess;
284  default: // should always return successful
285  errorQuda("cuCtxSynchronize return error code %d", error);
286  }
287  return cudaErrorUnknown;
288 #else
289  PROFILE(cudaError_t error = cudaDeviceSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
290  return error;
291 #endif
292  }
293 
294 #if (CUDA_VERSION >= 9000)
295  cudaError_t qudaFuncSetAttribute(const void* func, cudaFuncAttribute attr, int value)
296  {
297  // no driver API variant here since we have C++ functions
298  PROFILE(cudaError_t error = cudaFuncSetAttribute(func, attr, value), QUDA_PROFILE_FUNC_SET_ATTRIBUTE);
299  return error;
300  }
301 #endif
302 
304 #ifdef API_PROFILE
305  apiTimer.Print();
306 #endif
307  }
308 
309 } // namespace quda
size_t const void size_t spitch
cudaEvent_t event
const size_t count
dim3 dim3 blockDim
size_t const void size_t size_t width
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t hieght, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent Potentially add auto-profiling support...
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
long long bytes() const
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaEventRecord or cuEventRecord.
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * func
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
#define PROFILE(f, idx)
char * strcpy(char *__dst, const char *__src)
TuneKey tuneKey() const
__host__ __device__ void copy(T1 &a, const T2 &b)
static TimeProfile apiTimer("CUDA API calls (driver)")
char * strcat(char *__s1, const char *__s2)
const cudaMemcpyKind kind
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
void Print()
Definition: timer.cpp:6
virtual ~QudaMemCopy()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
unsigned long long uint64_t
dim3 dim3 void size_t sharedMem
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
const void * src
void * memcpy(void *__dst, const void *__src, size_t __n)
return cudaErrorUnknown
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:103
dim3 dim3 void ** args
QudaMemCopy(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
bool advanceTuneParam(TuneParam &param) const
const void size_t enum cudaMemcpyKind kind
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent Potentially add auto-profiling support...
unsigned int sharedBytesPerBlock(const TuneParam &param) const
#define printfQuda(...)
Definition: util_quda.h:84
const char * name
unsigned int sharedBytesPerThread() const
void printAPIProfile()
Print out the timer profile for CUDA API calls.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
long long flops() const
#define checkCudaError()
Definition: util_quda.h:129
const void int size_t unsigned int flags
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
void u64toa(char *buffer, uint64_t value)
Definition: uint_to_char.h:127
size_t const void size_t size_t size_t height
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
void apply(const cudaStream_t &stream)
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy used for auto-profiling. Do not call directly, rather call macro below whic...