QUDA  v1.1.0
A library for QCD on GPUs
quda_api.cpp
Go to the documentation of this file.
1 #include <unordered_set>
2 #include <tune_quda.h>
3 #include <uint_to_char.h>
4 #include <quda_internal.h>
5 #include <device.h>
6 
7 // if this macro is defined then we use the driver API, else use the
8 // runtime API. Typically the driver API has 10-20% less overhead
9 #define USE_DRIVER_API
10 
11 // if this macro is defined then we profile the CUDA API calls
12 //#define API_PROFILE
13 
14 #ifdef API_PROFILE
15 #define PROFILE(f, idx) \
16  apiTimer.TPSTART(idx); \
17  f; \
18  apiTimer.TPSTOP(idx);
19 #else
20 #define PROFILE(f, idx) f;
21 #endif
22 
23 namespace quda
24 {
25 
26  // No need to abstract these across the library so keep these definitions local to CUDA target
27 
34  void qudaFuncSetAttribute_(const void *kernel, cudaFuncAttribute attr, int value, const char *func, const char *file,
35  const char *line);
36 
42  void qudaFuncGetAttributes_(cudaFuncAttributes &attr, const void *kernel, const char *func, const char *file,
43  const char *line);
44 
45 #define qudaFuncSetAttribute(kernel, attr, value) \
46  ::quda::qudaFuncSetAttribute_(kernel, attr, value, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
47 
48 #define qudaFuncGetAttributes(attr, kernel) \
49  ::quda::qudaFuncGetAttributes_(attr, kernel, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
50 
51 #ifdef USE_DRIVER_API
52  static TimeProfile apiTimer("CUDA API calls (driver)");
53 #else
54  static TimeProfile apiTimer("CUDA API calls (runtime)");
55 #endif
56 
57  qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream)
58  {
59  if (tp.set_max_shared_bytes) {
60  static std::unordered_set<const void *> cache;
61  auto search = cache.find(func);
62  if (search == cache.end()) {
63  cache.insert(func);
64  qudaFuncSetAttribute(func, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared);
65  cudaFuncAttributes attributes;
66  qudaFuncGetAttributes(attributes, func);
67  qudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize,
68  device::max_dynamic_shared_memory() - attributes.sharedSizeBytes);
69  }
70  }
71 
72  // no driver API variant here since we have C++ functions
73  PROFILE(cudaError_t error = cudaLaunchKernel(func, tp.grid, tp.block, args, tp.shared_bytes, stream),
75  if (error != cudaSuccess && !activeTuning()) errorQuda("(CUDA) %s", cudaGetErrorString(error));
76  return error == cudaSuccess ? QUDA_SUCCESS : QUDA_ERROR;
77  }
78 
79  class QudaMem : public Tunable
80  {
81  void *dst;
82  const void *src;
83  const size_t count;
84  const int value;
85  const bool copy;
86  const cudaMemcpyKind kind;
87  const bool async;
88  const char *name;
89  const bool active_tuning;
90 
91  unsigned int sharedBytesPerThread() const { return 0; }
92  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
93 
94  public:
95  inline QudaMem(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream,
96  bool async, const char *func, const char *file, const char *line) :
97  dst(dst),
98  src(src),
99  count(count),
100  value(0),
101  copy(true),
102  kind(kind),
103  async(async),
104  active_tuning(activeTuning())
105  {
106  if (!async) {
107  switch (kind) {
108  case cudaMemcpyDeviceToHost: name = "cudaMemcpyDeviceToHost"; break;
109  case cudaMemcpyHostToDevice: name = "cudaMemcpyHostToDevice"; break;
110  case cudaMemcpyHostToHost: name = "cudaMemcpyHostToHost"; break;
111  case cudaMemcpyDeviceToDevice: name = "cudaMemcpyDeviceToDevice"; break;
112  case cudaMemcpyDefault: name = "cudaMemcpyDefault"; break;
113  default: errorQuda("Unsupported cudaMemcpyKind %d", kind);
114  }
115  } else {
116  switch (kind) {
117  case cudaMemcpyDeviceToHost: name = "cudaMemcpyAsyncDeviceToHost"; break;
118  case cudaMemcpyHostToDevice: name = "cudaMemcpyAsyncHostToDevice"; break;
119  case cudaMemcpyHostToHost: name = "cudaMemcpyAsyncHostToHost"; break;
120  case cudaMemcpyDeviceToDevice: name = "cudaMemcpyAsyncDeviceToDevice"; break;
121  case cudaMemcpyDefault: name = "cudaMemcpyAsyncDefault"; break;
122  default: errorQuda("Unsupported cudaMemcpyKind %d", kind);
123  }
124  }
125  strcpy(aux, func);
126  strcat(aux, ",");
127  strcat(aux, file);
128  strcat(aux, ",");
129  strcat(aux, line);
130 
131  apply(stream);
132  }
133 
134  inline QudaMem(void *dst, int value, size_t count, const cudaStream_t &stream, bool async, const char *func,
135  const char *file, const char *line) :
136  dst(dst),
137  src(nullptr),
138  count(count),
139  value(value),
140  copy(false),
141  kind(cudaMemcpyDefault),
142  async(async),
143  active_tuning(activeTuning())
144  {
145  name = !async ? "cudaMemset" : "cudaMemsetAsync";
146  strcpy(aux, func);
147  strcat(aux, ",");
148  strcat(aux, file);
149  strcat(aux, ",");
150  strcat(aux, line);
151 
152  apply(stream);
153  }
154 
155  inline void apply(const qudaStream_t &stream)
156  {
157  if (!active_tuning) tuneLaunch(*this, getTuning(), getVerbosity());
158 
159  if (copy) {
160  if (async) {
161 #ifdef USE_DRIVER_API
162  switch (kind) {
163  case cudaMemcpyDeviceToHost:
164  PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
165  break;
166  case cudaMemcpyHostToDevice:
167  PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
168  break;
169  case cudaMemcpyDeviceToDevice:
170  PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
171  break;
172  case cudaMemcpyDefault:
173  PROFILE(cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC);
174  break;
175  default: errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
176  }
177 #else
178  QudaProfileType type;
179  switch (kind) {
180  case cudaMemcpyDeviceToHost: type = QUDA_PROFILE_MEMCPY_D2H_ASYNC; break;
181  case cudaMemcpyHostToDevice: type = QUDA_PROFILE_MEMCPY_H2D_ASYNC; break;
182  case cudaMemcpyDeviceToDevice: type = QUDA_PROFILE_MEMCPY_D2D_ASYNC; break;
183  case cudaMemcpyDefault: type = QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC; break;
184  default: errorQuda("Unsupported cudaMemcpyTypeAsync %d", kind);
185  }
186 
187  PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream), type);
188 #endif
189  } else {
190 #ifdef USE_DRIVER_API
191  switch (kind) {
192  case cudaMemcpyDeviceToHost: cuMemcpyDtoH(dst, (CUdeviceptr)src, count); break;
193  case cudaMemcpyHostToDevice: cuMemcpyHtoD((CUdeviceptr)dst, src, count); break;
194  case cudaMemcpyHostToHost: memcpy(dst, src, count); break;
195  case cudaMemcpyDeviceToDevice: cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
196  case cudaMemcpyDefault: cuMemcpy((CUdeviceptr)dst, (CUdeviceptr)src, count); break;
197  default: errorQuda("Unsupported cudaMemcpyType %d", kind);
198  }
199 #else
200  cudaMemcpy(dst, src, count, kind);
201 #endif
202  }
203  } else {
204 #ifdef USE_DRIVER_API
205  if (async)
206  cuMemsetD32Async((CUdeviceptr)dst, value, count / 4, stream);
207  else
208  cuMemsetD32((CUdeviceptr)dst, value, count / 4);
209 #else
210  if (async)
211  cudaMemsetAsync(dst, value, count, stream);
212  else
213  cudaMemset(dst, value, count);
214 #endif
215  }
216  }
217 
218  bool advanceTuneParam(TuneParam &param) const { return false; }
219 
220  TuneKey tuneKey() const
221  {
222  char vol[128];
223  strcpy(vol, "bytes=");
224  u64toa(vol + 6, (uint64_t)count);
225  return TuneKey(vol, name, aux);
226  }
227 
228  long long flops() const { return 0; }
229  long long bytes() const { return kind == cudaMemcpyDeviceToDevice ? 2 * count : count; }
230  };
231 
232  void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file,
233  const char *line)
234  {
235  if (count == 0) return;
236  QudaMem copy(dst, src, count, kind, 0, false, func, file, line);
237  cudaError_t error = cudaGetLastError();
238  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
239  }
240 
241  void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const qudaStream_t &stream,
242  const char *func, const char *file, const char *line)
243  {
244  if (count == 0) return;
245 
246  if (kind == cudaMemcpyDeviceToDevice) {
247  QudaMem copy(dst, src, count, kind, stream, true, func, file, line);
248  } else {
249 #ifdef USE_DRIVER_API
250  switch (kind) {
251  case cudaMemcpyDeviceToHost:
252  PROFILE(cuMemcpyDtoHAsync(dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
253  break;
254  case cudaMemcpyHostToDevice:
255  PROFILE(cuMemcpyHtoDAsync((CUdeviceptr)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
256  break;
257  case cudaMemcpyDeviceToDevice:
258  PROFILE(cuMemcpyDtoDAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
259  break;
260  case cudaMemcpyDefault:
261  PROFILE(cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, count, stream), QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC);
262  break;
263  default: errorQuda("Unsupported cuMemcpyTypeAsync %d", kind);
264  }
265 #else
266  PROFILE(cudaMemcpyAsync(dst, src, count, kind, stream),
267  kind == cudaMemcpyDeviceToHost ? QUDA_PROFILE_MEMCPY_D2H_ASYNC : QUDA_PROFILE_MEMCPY_H2D_ASYNC);
268 #endif
269  }
270  }
271 
272  void qudaMemcpy2D_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height,
273  cudaMemcpyKind kind, const char *func, const char *file, const char *line)
274  {
275 #ifdef USE_DRIVER_API
276  CUDA_MEMCPY2D param;
277  param.srcPitch = spitch;
278  param.srcY = 0;
279  param.srcXInBytes = 0;
280  param.dstPitch = dpitch;
281  param.dstY = 0;
282  param.dstXInBytes = 0;
283  param.WidthInBytes = width;
284  param.Height = height;
285 
286  switch (kind) {
287  case cudaMemcpyDeviceToHost:
288  param.srcDevice = (CUdeviceptr)src;
289  param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
290  param.dstHost = dst;
291  param.dstMemoryType = CU_MEMORYTYPE_HOST;
292  break;
293  default: errorQuda("Unsupported cuMemcpyType2DAsync %d", kind);
294  }
296 #else
297  PROFILE(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
298 #endif
299  }
300 
301  void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height,
302  cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file,
303  const char *line)
304  {
305 #ifdef USE_DRIVER_API
306  CUDA_MEMCPY2D param;
307  param.srcPitch = spitch;
308  param.srcY = 0;
309  param.srcXInBytes = 0;
310  param.dstPitch = dpitch;
311  param.dstY = 0;
312  param.dstXInBytes = 0;
313  param.WidthInBytes = width;
314  param.Height = height;
315 
316  switch (kind) {
317  case cudaMemcpyDeviceToHost:
318  param.srcDevice = (CUdeviceptr)src;
319  param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
320  param.dstHost = dst;
321  param.dstMemoryType = CU_MEMORYTYPE_HOST;
322  break;
323  default: errorQuda("Unsupported cuMemcpyType2DAsync %d", kind);
324  }
326 #else
327  PROFILE(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
328 #endif
329  }
330 
331  void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line)
332  {
333  if (count == 0) return;
334  QudaMem set(ptr, value, count, 0, false, func, file, line);
335  cudaError_t error = cudaGetLastError();
336  if (error != cudaSuccess && !activeTuning())
337  errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
338  }
339 
340  void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
341  const char *file, const char *line)
342  {
343  if (count == 0) return;
344  QudaMem copy(ptr, value, count, stream, true, func, file, line);
345  cudaError_t error = cudaGetLastError();
346  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
347  }
348 
349  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
350  const char *file, const char *line)
351  {
352  cudaError_t error = cudaMemset2D(ptr, pitch, value, width, height);
353  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
354  }
355 
356  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
357  const char *func, const char *file, const char *line)
358  {
359  cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height, stream);
360  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
361  }
362 
363  void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream,
364  const char *func, const char *file, const char *line)
365  {
366  int dev_id = 0;
367  if (mem_space == QUDA_CUDA_FIELD_LOCATION)
368  dev_id = comm_gpuid();
369  else if (mem_space == QUDA_CPU_FIELD_LOCATION)
370  dev_id = cudaCpuDeviceId;
371  else
372  errorQuda("Invalid QudaFieldLocation.");
373 
374  cudaError_t error = cudaMemPrefetchAsync(ptr, count, dev_id, stream);
375  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
376  }
377 
378  bool qudaEventQuery_(cudaEvent_t &event, const char *func, const char *file, const char *line)
379  {
380 #ifdef USE_DRIVER_API
381  PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
382  switch (error) {
383  case CUDA_SUCCESS: return true;
384  case CUDA_ERROR_NOT_READY: return false;
385  default: {
386  const char *str;
387  cuGetErrorName(error, &str);
388  errorQuda("cuEventQuery returned error %s\n (%s:%s in %s())", str, file, line, func);
389  }
390  }
391 #else
392  PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
393  switch (error) {
394  case cudaSuccess: return true;
395  case cudaErrorNotReady: return false;
396  default: errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
397  }
398 #endif
399  return false;
400  }
401 
402  void qudaEventRecord_(cudaEvent_t &event, qudaStream_t stream, const char *func, const char *file, const char *line)
403  {
404 #ifdef USE_DRIVER_API
405  PROFILE(CUresult error = cuEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
406  if (error != CUDA_SUCCESS) {
407  const char *str;
408  cuGetErrorName(error, &str);
409  errorQuda("cuEventRecord returned error %s\n (%s:%s in %s())", str, file, line, func);
410  }
411 #else
412  PROFILE(cudaError_t error = cudaEventRecord(event, stream), QUDA_PROFILE_EVENT_RECORD);
413  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
414 #endif
415  }
416 
417  void qudaStreamWaitEvent_(qudaStream_t stream, cudaEvent_t event, unsigned int flags, const char *func,
418  const char *file, const char *line)
419  {
420 #ifdef USE_DRIVER_API
421  PROFILE(CUresult error = cuStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
422  if (error != CUDA_SUCCESS) {
423  const char *str;
424  cuGetErrorName(error, &str);
425  errorQuda("cuStreamWaitEvent returned error %s\n (%s:%s in %s())", str, file, line, func);
426  }
427 #else
428  PROFILE(cudaError_t error = cudaStreamWaitEvent(stream, event, flags), QUDA_PROFILE_STREAM_WAIT_EVENT);
429  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
430 #endif
431  }
432 
433  void qudaEventSynchronize_(cudaEvent_t &event, const char *func, const char *file, const char *line)
434  {
435 #ifdef USE_DRIVER_API
436  PROFILE(CUresult error = cuEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
437  if (error != CUDA_SUCCESS) {
438  const char *str;
439  cuGetErrorName(error, &str);
440  errorQuda("cuEventSynchronize returned error %s\n (%s:%s in %s())", str, file, line, func);
441  }
442 #else
443  PROFILE(cudaError_t error = cudaEventSynchronize(event), QUDA_PROFILE_EVENT_SYNCHRONIZE);
444  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
445 #endif
446  }
447 
448  void qudaStreamSynchronize_(qudaStream_t &stream, const char *func, const char *file, const char *line)
449  {
450 #ifdef USE_DRIVER_API
451  PROFILE(CUresult error = cuStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
452  if (error != CUDA_SUCCESS) {
453  const char *str;
454  cuGetErrorName(error, &str);
455  errorQuda("(CUDA) cuStreamSynchronize returned error %s\n (%s:%s in %s())\n", str, file, line, func);
456  }
457 #else
458  PROFILE(cudaError_t error = cudaStreamSynchronize(stream), QUDA_PROFILE_STREAM_SYNCHRONIZE);
459  if (error != cudaSuccess && !activeTuning())
460  errorQuda("(CUDA) %s\n (%s:%s in %s())", cudaGetErrorString(error), file, line, func);
461 #endif
462  }
463 
464  void qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
465  {
466 #ifdef USE_DRIVER_API
467  PROFILE(CUresult error = cuCtxSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
468  if (error != CUDA_SUCCESS) {
469  const char *str;
470  cuGetErrorName(error, &str);
471  errorQuda("cuCtxSynchronize returned error %s (%s:%s in %s())\n", str, file, line, func);
472  }
473 #else
474  PROFILE(cudaError_t error = cudaDeviceSynchronize(), QUDA_PROFILE_DEVICE_SYNCHRONIZE);
475  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
476 #endif
477  }
478 
479  void qudaFuncSetAttribute_(const void *kernel, cudaFuncAttribute attr, int value, const char *func, const char *file,
480  const char *line)
481  {
482  // no driver API variant here since we have C++ functions
483  PROFILE(cudaError_t error = cudaFuncSetAttribute(kernel, attr, value), QUDA_PROFILE_FUNC_SET_ATTRIBUTE);
484  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
485  }
486 
487  void qudaFuncGetAttributes_(cudaFuncAttributes &attr, const void *kernel, const char *func, const char *file,
488  const char *line)
489  {
490  // no driver API variant here since we have C++ functions
491  PROFILE(cudaError_t error = cudaFuncGetAttributes(&attr, kernel), QUDA_PROFILE_FUNC_SET_ATTRIBUTE);
492  if (error != cudaSuccess) errorQuda("(CUDA) %s\n (%s:%s in %s())\n", cudaGetErrorString(error), file, line, func);
493  }
494 
496  {
497 #ifdef API_PROFILE
498  apiTimer.Print();
499 #endif
500  }
501 
502 } // namespace quda
QudaMem(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
Definition: quda_api.cpp:95
void apply(const qudaStream_t &stream)
Definition: quda_api.cpp:155
long long flops() const
Definition: quda_api.cpp:228
long long bytes() const
Definition: quda_api.cpp:229
QudaMem(void *dst, int value, size_t count, const cudaStream_t &stream, bool async, const char *func, const char *file, const char *line)
Definition: quda_api.cpp:134
bool advanceTuneParam(TuneParam &param) const
Definition: quda_api.cpp:218
TuneKey tuneKey() const
Definition: quda_api.cpp:220
void Print()
Definition: timer.cpp:7
char aux[TuneKey::aux_n]
Definition: tune_quda.h:269
bool set_max_shared_bytes
Definition: tune_quda.h:31
int comm_gpuid(void)
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
enum QudaFieldLocation_s QudaFieldLocation
qudaError_t
Definition: enum_quda.h:10
@ QUDA_SUCCESS
Definition: enum_quda.h:10
@ QUDA_ERROR
Definition: enum_quda.h:10
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
size_t max_dynamic_shared_memory()
Returns the maximum dynamic shared memory per block.
Definition: device.cpp:215
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:677
void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
Definition: quda_api.cpp:356
bool qudaEventQuery_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventQuery or cuEventQuery with built-in error checking.
Definition: quda_api.cpp:378
void qudaMemcpy2DAsync_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
Definition: quda_api.cpp:301
void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line)
Wrapper around cudaMemset2D or driver API equivalent.
Definition: quda_api.cpp:349
void printAPIProfile()
Print out the timer profile for CUDA API calls.
Definition: quda_api.cpp:495
void qudaDeviceSynchronize_(const char *func, const char *file, const char *line)
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize with built-in error checking.
Definition: quda_api.cpp:464
void qudaFuncSetAttribute_(const void *kernel, cudaFuncAttribute attr, int value, const char *func, const char *file, const char *line)
Wrapper around cudaFuncSetAttribute with built-in error checking.
Definition: quda_api.cpp:479
qudaStream_t * stream
void qudaStreamSynchronize_(qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize with built-in error checking.
Definition: quda_api.cpp:448
QudaProfileType
Definition: timer.h:103
@ QUDA_PROFILE_MEMCPY_H2D_ASYNC
Definition: timer.h:141
@ QUDA_PROFILE_EVENT_SYNCHRONIZE
Definition: timer.h:134
@ QUDA_PROFILE_FUNC_SET_ATTRIBUTE
Definition: timer.h:132
@ QUDA_PROFILE_MEMCPY_D2D_ASYNC
Definition: timer.h:138
@ QUDA_PROFILE_DEVICE_SYNCHRONIZE
Definition: timer.h:136
@ QUDA_PROFILE_STREAM_SYNCHRONIZE
Definition: timer.h:135
@ QUDA_PROFILE_EVENT_QUERY
Definition: timer.h:130
@ QUDA_PROFILE_STREAM_WAIT_EVENT
Definition: timer.h:131
@ QUDA_PROFILE_MEMCPY_DEFAULT_ASYNC
Definition: timer.h:142
@ QUDA_PROFILE_MEMCPY_D2H_ASYNC
Definition: timer.h:139
@ QUDA_PROFILE_LAUNCH_KERNEL
Definition: timer.h:128
@ QUDA_PROFILE_EVENT_RECORD
Definition: timer.h:129
@ QUDA_PROFILE_MEMCPY2D_D2H_ASYNC
Definition: timer.h:140
bool activeTuning()
query if tuning is in progress
Definition: tune.cpp:137
void qudaEventSynchronize_(cudaEvent_t &event, const char *func, const char *file, const char *line)
Wrapper around cudaEventSynchronize or cuEventSynchronize with built-in error checking.
Definition: quda_api.cpp:433
void qudaMemcpy2D_(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy2DAsync or driver API equivalent.
Definition: quda_api.cpp:272
qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream)
Wrapper around cudaLaunchKernel.
Definition: quda_api.cpp:57
void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemsetAsync or driver API equivalent.
Definition: quda_api.cpp:340
void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemPrefetchAsync or driver API equivalent.
Definition: quda_api.cpp:363
__host__ __device__ std::enable_if<!isFixed< T1 >::value &&!isFixed< T2 >::value, void >::type copy(T1 &a, const T2 &b)
Copy function which is trival between floating point types. When converting to an integer type,...
Definition: convert.h:64
void u64toa(char *buffer, uint64_t value)
Definition: uint_to_char.h:127
void qudaMemcpy_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpy or driver API equivalent.
Definition: quda_api.cpp:232
void qudaEventRecord_(cudaEvent_t &event, qudaStream_t stream, const char *func, const char *file, const char *line)
Wrapper around cudaEventRecord or cuEventRecord with built-in error checking.
Definition: quda_api.cpp:402
void qudaStreamWaitEvent_(qudaStream_t stream, cudaEvent_t event, unsigned int flags, const char *func, const char *file, const char *line)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent with built-in error checking.
Definition: quda_api.cpp:417
void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line)
Wrapper around cudaMemset or driver API equivalent.
Definition: quda_api.cpp:331
void qudaFuncGetAttributes_(cudaFuncAttributes &attr, const void *kernel, const char *func, const char *file, const char *line)
Wrapper around cudaFuncGetAttributes with built-in error checking.
Definition: quda_api.cpp:487
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, cudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line)
Wrapper around cudaMemcpyAsync or driver API equivalent.
Definition: quda_api.cpp:241
QudaGaugeParam param
Definition: pack_test.cpp:18
#define qudaFuncGetAttributes(attr, kernel)
Definition: quda_api.cpp:48
#define qudaFuncSetAttribute(kernel, attr, value)
Definition: quda_api.cpp:45
#define PROFILE(f, idx)
Definition: quda_api.cpp:20
cudaStream_t qudaStream_t
Definition: quda_api.h:9
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:120