QUDA  v1.1.0
A library for QCD on GPUs
device.cpp
Go to the documentation of this file.
1 #include <cuda_runtime.h>
2 #include <cuda_profiler_api.h>
3 #include <util_quda.h>
4 #include <quda_internal.h>
5 
6 #ifdef QUDA_NVML
7 #include <nvml.h>
8 #endif
9 
10 #ifdef NUMA_NVML
11 #include <numa_affinity.h>
12 #endif
13 
14 cudaDeviceProp deviceProp;
16 
17 namespace quda
18 {
19 
20  namespace device
21  {
22 
23  static bool initialized = false;
24 
25  void init(int dev)
26  {
27  if (initialized) return;
28  initialized = true;
29  printfQuda("*** HIP BACKEND ***\n");
30  int driver_version;
31  cudaDriverGetVersion(&driver_version);
32  printfQuda("CUDA Driver version = %d\n", driver_version);
33 
34  int runtime_version;
35  cudaRuntimeGetVersion(&runtime_version);
36  printfQuda("CUDA Runtime version = %d\n", runtime_version);
37 
38 #ifdef QUDA_NVML
39  nvmlReturn_t result = nvmlInit();
40  if (NVML_SUCCESS != result) errorQuda("NVML Init failed with error %d", result);
41  const int length = 80;
42  char graphics_version[length];
43  result = nvmlSystemGetDriverVersion(graphics_version, length);
44  if (NVML_SUCCESS != result) errorQuda("nvmlSystemGetDriverVersion failed with error %d", result);
45  printfQuda("Graphic driver version = %s\n", graphics_version);
46  result = nvmlShutdown();
47  if (NVML_SUCCESS != result) errorQuda("NVML Shutdown failed with error %d", result);
48 #endif
49 
50  int deviceCount;
51  cudaGetDeviceCount(&deviceCount);
52  if (deviceCount == 0) { errorQuda("No CUDA devices found"); }
53 
54  for (int i = 0; i < deviceCount; i++) {
55  cudaGetDeviceProperties(&deviceProp, i);
56  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
57  if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("Found device %d: %s\n", i, deviceProp.name); }
58  }
59 
60  cudaGetDeviceProperties(&deviceProp, dev);
61  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
62  if (deviceProp.major < 1) { errorQuda("Device %d does not support CUDA", dev); }
63 
64  // Check GPU and QUDA build compatibiliy
65  // 4 cases:
66  // a) QUDA and GPU match: great
67  // b) QUDA built for higher compute capability: error
68  // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
69  // d) QUDA built for same major compute capability but lower minor: warn
70 
71  const int my_major = __COMPUTE_CAPABILITY__ / 100;
72  const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
73  // b) UDA was compiled for a higher compute capability
74  if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
75  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- "
76  "Please set the correct QUDA_GPU_ARCH when running cmake.\n",
77  deviceProp.major, deviceProp.minor, my_major, my_minor);
78 
79  // c) QUDA was compiled for a lower compute capability
80  if (deviceProp.major < my_major) {
81  char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
82  if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
83  if (getVerbosity() > QUDA_SILENT)
84  warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- "
85  "Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n",
86  deviceProp.major, deviceProp.minor, my_major, my_minor);
87  } else {
88  errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- "
89  "Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for "
90  "your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.",
91  deviceProp.major, deviceProp.minor, my_major, my_minor);
92  }
93  }
94  // d) QUDA built for same major compute capability but lower minor
95  if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
97  "** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might "
98  "result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
99  deviceProp.major, deviceProp.minor, my_major, my_minor);
100  }
101 
102  if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("Using device %d: %s\n", dev, deviceProp.name); }
103 #ifndef USE_QDPJIT
104  cudaSetDevice(dev);
105  checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
106 #endif
107 
108 #ifdef NUMA_NVML
109  char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
110  if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
111  if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
112  } else {
113  setNumaAffinityNVML(dev);
114  }
115 #endif
116 
117  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
118  // cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
119  // cudaGetDeviceProperties(&deviceProp, dev);
120  }
121 
122  void create_context()
123  {
125 
126  int greatestPriority;
127  int leastPriority;
128  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
129  for (int i = 0; i < Nstream - 1; i++) {
130  cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
131  }
132  cudaStreamCreateWithPriority(&streams[Nstream - 1], cudaStreamDefault, leastPriority);
133 
134  checkCudaError();
135  }
136 
137  void destroy()
138  {
139  if (streams) {
140  for (int i = 0; i < Nstream; i++) cudaStreamDestroy(streams[i]);
141  delete[] streams;
142  streams = nullptr;
143  }
144 
145  char *device_reset_env = getenv("QUDA_DEVICE_RESET");
146  if (device_reset_env && strcmp(device_reset_env, "1") == 0) {
147  // end this CUDA context
148  cudaDeviceReset();
149  }
150  }
151 
152  namespace profile
153  {
154 
155  void start() { cudaProfilerStart(); }
156 
157  void stop() { cudaProfilerStop(); }
158 
159  } // namespace profile
160 
161  } // namespace device
162 } // namespace quda
qudaStream_t * streams
Definition: device.cpp:15
cudaDeviceProp deviceProp
Definition: device.cpp:14
@ QUDA_SILENT
Definition: enum_quda.h:265
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
int length[]
void stop()
Stop profiling.
Definition: device.cpp:228
void start()
Start profiling.
Definition: device.cpp:226
void create_context()
Create the streams associated with parallel execution.
Definition: device.cpp:185
void init(int dev)
Create the device context. Called by initQuda when initializing the library.
Definition: device.cpp:25
void destroy()
Free any persistent context state. Called by endQuda when tearing down the library.
Definition: device.cpp:200
const int Nstream
int setNumaAffinityNVML(int deviceid)
cudaStream_t qudaStream_t
Definition: quda_api.h:9
#define printfQuda(...)
Definition: util_quda.h:114
#define checkCudaErrorNoSync()
Definition: util_quda.h:143
#define checkCudaError()
Definition: util_quda.h:158
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
#define errorQuda(...)
Definition: util_quda.h:120