QUDA v1.1.0
A library for QCD on GPUs
device.cpp
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <util_quda.h>
#include <quda_internal.h>

#ifdef QUDA_NVML
#include <nvml.h>
#endif

#ifdef NUMA_NVML
#include <numa_affinity.h>
#endif

cudaDeviceProp deviceProp;
qudaStream_t *streams;

namespace quda
{

  namespace device
  {

    static bool initialized = false;

    // Create the device context.  Called by initQuda when initializing the library.
    void init(int dev)
    {
      if (initialized) return;
      initialized = true;

      int driver_version;
      cudaDriverGetVersion(&driver_version);
      printfQuda("CUDA Driver version = %d\n", driver_version);

      int runtime_version;
      cudaRuntimeGetVersion(&runtime_version);
      printfQuda("CUDA Runtime version = %d\n", runtime_version);

#ifdef QUDA_NVML
      nvmlReturn_t result = nvmlInit();
      if (NVML_SUCCESS != result) errorQuda("NVML Init failed with error %d", result);
      const int length = 80;
      char graphics_version[length];
      result = nvmlSystemGetDriverVersion(graphics_version, length);
      if (NVML_SUCCESS != result) errorQuda("nvmlSystemGetDriverVersion failed with error %d", result);
      printfQuda("Graphic driver version = %s\n", graphics_version);
      result = nvmlShutdown();
      if (NVML_SUCCESS != result) errorQuda("NVML Shutdown failed with error %d", result);
#endif

      int deviceCount;
      cudaGetDeviceCount(&deviceCount);
      if (deviceCount == 0) { errorQuda("No CUDA devices found"); }

      for (int i = 0; i < deviceCount; i++) {
        cudaGetDeviceProperties(&deviceProp, i);
        checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
        if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("Found device %d: %s\n", i, deviceProp.name); }
      }

      cudaGetDeviceProperties(&deviceProp, dev);
      checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
      if (deviceProp.major < 1) { errorQuda("Device %d does not support CUDA", dev); }

      // Check GPU and QUDA build compatibility
      // 4 cases:
      // a) QUDA and GPU match: great
      // b) QUDA built for higher compute capability: error
      // c) QUDA built for lower major compute capability: warn if QUDA_ALLOW_JIT, else error
      // d) QUDA built for same major compute capability but lower minor: warn

      const int my_major = __COMPUTE_CAPABILITY__ / 100;
      const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
      // b) QUDA was compiled for a higher compute capability
      if (deviceProp.major * 100 + deviceProp.minor * 10 < __COMPUTE_CAPABILITY__)
        errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- "
                  "Please set the correct QUDA_GPU_ARCH when running cmake.\n",
                  deviceProp.major, deviceProp.minor, my_major, my_minor);

      // c) QUDA was compiled for a lower compute capability
      if (deviceProp.major > my_major) {
        char *allow_jit_env = getenv("QUDA_ALLOW_JIT");
        if (allow_jit_env && strcmp(allow_jit_env, "1") == 0) {
          if (getVerbosity() > QUDA_SILENT)
            warningQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- "
                        "Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n",
                        deviceProp.major, deviceProp.minor, my_major, my_minor);
        } else {
          errorQuda("** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- "
                    "Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for "
                    "your current GPU arch please set the environment variable QUDA_ALLOW_JIT=1.",
                    deviceProp.major, deviceProp.minor, my_major, my_minor);
        }
      }
      // d) QUDA built for same major compute capability but lower minor
      if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
        warningQuda(
          "** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might "
          "result in lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
          deviceProp.major, deviceProp.minor, my_major, my_minor);
      }

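      // Illustrative example (not part of the original source): a build configured with
      // QUDA_GPU_ARCH=sm_70 has __COMPUTE_CAPABILITY__ == 700, so my_major = 700 / 100 = 7
      // and my_minor = (700 - 7 * 100) / 10 = 0.  Running that build on a 6.1 device trips
      // case b) above (610 < 700), while running it on an 8.0 device falls under case c)
      // (QUDA built for a lower major capability) and needs QUDA_ALLOW_JIT=1 to proceed.
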
      if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("Using device %d: %s\n", dev, deviceProp.name); }
#ifndef USE_QDPJIT
      cudaSetDevice(dev);
      checkCudaErrorNoSync(); // "NoSync" for correctness in HOST_DEBUG mode
#endif

#ifdef NUMA_NVML
      char *enable_numa_env = getenv("QUDA_ENABLE_NUMA");
      if (enable_numa_env && strcmp(enable_numa_env, "0") == 0) {
        if (getVerbosity() > QUDA_SILENT) printfQuda("Disabling numa_affinity\n");
      } else {
        setNumaAffinityNVML(dev);
      }
#endif

      cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
      // cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
      // cudaGetDeviceProperties(&deviceProp, dev);
    }

    // Query and print to stdout device properties of all GPUs.
    void print_device_properties()
    {

      int dev_count;
      cudaGetDeviceCount(&dev_count);
      int device;
      for (device = 0; device < dev_count; device++) {

        // cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, device);
        printfQuda("%d - name: %s\n", device, deviceProp.name);
        printfQuda("%d - totalGlobalMem: %lu bytes ( %.2f Gbytes)\n", device, deviceProp.totalGlobalMem,
                   deviceProp.totalGlobalMem / (float)(1024 * 1024 * 1024));
        printfQuda("%d - sharedMemPerBlock: %lu bytes ( %.2f Kbytes)\n", device, deviceProp.sharedMemPerBlock,
                   deviceProp.sharedMemPerBlock / (float)1024);
        printfQuda("%d - regsPerBlock: %d\n", device, deviceProp.regsPerBlock);
        printfQuda("%d - warpSize: %d\n", device, deviceProp.warpSize);
        printfQuda("%d - memPitch: %lu\n", device, deviceProp.memPitch);
        printfQuda("%d - maxThreadsPerBlock: %d\n", device, deviceProp.maxThreadsPerBlock);
        printfQuda("%d - maxThreadsDim[0]: %d\n", device, deviceProp.maxThreadsDim[0]);
        printfQuda("%d - maxThreadsDim[1]: %d\n", device, deviceProp.maxThreadsDim[1]);
        printfQuda("%d - maxThreadsDim[2]: %d\n", device, deviceProp.maxThreadsDim[2]);
        printfQuda("%d - maxGridSize[0]: %d\n", device, deviceProp.maxGridSize[0]);
        printfQuda("%d - maxGridSize[1]: %d\n", device, deviceProp.maxGridSize[1]);
        printfQuda("%d - maxGridSize[2]: %d\n", device, deviceProp.maxGridSize[2]);
        printfQuda("%d - totalConstMem: %lu bytes ( %.2f Kbytes)\n", device, deviceProp.totalConstMem,
                   deviceProp.totalConstMem / (float)1024);
        printfQuda("%d - compute capability: %d.%d\n", device, deviceProp.major, deviceProp.minor);
        printfQuda("%d - deviceOverlap %s\n", device, (deviceProp.deviceOverlap ? "true" : "false"));
        printfQuda("%d - multiProcessorCount %d\n", device, deviceProp.multiProcessorCount);
        printfQuda("%d - kernelExecTimeoutEnabled %s\n", device,
                   (deviceProp.kernelExecTimeoutEnabled ? "true" : "false"));
        printfQuda("%d - integrated %s\n", device, (deviceProp.integrated ? "true" : "false"));
        printfQuda("%d - canMapHostMemory %s\n", device, (deviceProp.canMapHostMemory ? "true" : "false"));
        switch (deviceProp.computeMode) {
        case 0: printfQuda("%d - computeMode 0: cudaComputeModeDefault\n", device); break;
        case 1: printfQuda("%d - computeMode 1: cudaComputeModeExclusive\n", device); break;
        case 2: printfQuda("%d - computeMode 2: cudaComputeModeProhibited\n", device); break;
        case 3: printfQuda("%d - computeMode 3: cudaComputeModeExclusiveProcess\n", device); break;
        default: errorQuda("Unknown deviceProp.computeMode.");
        }

        printfQuda("%d - surfaceAlignment %lu\n", device, deviceProp.surfaceAlignment);
        printfQuda("%d - concurrentKernels %s\n", device, (deviceProp.concurrentKernels ? "true" : "false"));
        printfQuda("%d - ECCEnabled %s\n", device, (deviceProp.ECCEnabled ? "true" : "false"));
        printfQuda("%d - pciBusID %d\n", device, deviceProp.pciBusID);
        printfQuda("%d - pciDeviceID %d\n", device, deviceProp.pciDeviceID);
        printfQuda("%d - pciDomainID %d\n", device, deviceProp.pciDomainID);
        printfQuda("%d - tccDriver %s\n", device, (deviceProp.tccDriver ? "true" : "false"));
        switch (deviceProp.asyncEngineCount) {
        case 0: printfQuda("%d - asyncEngineCount 0: not supported\n", device); break;
        case 1: printfQuda("%d - asyncEngineCount 1: host -> device only\n", device); break;
        case 2: printfQuda("%d - asyncEngineCount 2: host <-> device\n", device); break;
        default: errorQuda("Unknown deviceProp.asyncEngineCount.");
        }
        printfQuda("%d - unifiedAddressing %s\n", device, (deviceProp.unifiedAddressing ? "true" : "false"));
        printfQuda("%d - memoryClockRate %d kilohertz\n", device, deviceProp.memoryClockRate);
        printfQuda("%d - memoryBusWidth %d bits\n", device, deviceProp.memoryBusWidth);
        printfQuda("%d - l2CacheSize %d bytes\n", device, deviceProp.l2CacheSize);
        printfQuda("%d - maxThreadsPerMultiProcessor %d\n\n", device, deviceProp.maxThreadsPerMultiProcessor);
      }
    }

    // Create the streams associated with parallel execution.
    void create_context()
    {
      streams = new qudaStream_t[Nstream];

      int greatestPriority;
      int leastPriority;
      cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
      for (int i = 0; i < Nstream - 1; i++) {
        cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
      }
      cudaStreamCreateWithPriority(&streams[Nstream - 1], cudaStreamDefault, leastPriority);

      checkCudaError();
    }

    // Free any persistent context state.  Called by endQuda when tearing down the library.
    void destroy()
    {
      if (streams) {
        for (int i = 0; i < Nstream; i++) cudaStreamDestroy(streams[i]);
        delete[] streams;
        streams = nullptr;
      }

      char *device_reset_env = getenv("QUDA_DEVICE_RESET");
      if (device_reset_env && strcmp(device_reset_env, "1") == 0) {
        // end this CUDA context
        cudaDeviceReset();
      }
    }

    // Returns the maximum dynamic shared memory per block (opt-in limit) of the device
    // assigned to this process.
    size_t max_dynamic_shared_memory()
    {
      static int max_shared_bytes = 0;
      if (!max_shared_bytes)
        cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, comm_gpuid());
      return max_shared_bytes;
    }
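
    // Illustrative note (not from the original source): kernels can only use more than the
    // default dynamic shared memory limit after explicitly opting in, e.g. via
    //   cudaFuncSetAttribute(some_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
    //                        max_dynamic_shared_memory());
    // where some_kernel stands for a hypothetical __global__ function.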

    namespace profile
    {

      void start() { cudaProfilerStart(); }

      void stop() { cudaProfilerStop(); }

    } // namespace profile

  } // namespace device
} // namespace quda
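
For orientation, the following is a minimal usage sketch rather than code from QUDA itself: it shows the lifecycle implied by the definitions above, assuming the public entry points initQuda() and endQuda() declared in quda.h and a single-GPU job that uses device 0.

#include <quda.h>

int main()
{
  // initQuda() performs the device set-up defined in this file: per the notes above it
  // calls quda::device::init(), and initialization also creates the parallel streams
  // via quda::device::create_context().
  initQuda(0); // assumed device index for a single-GPU run

  // ... gauge-field loading and solver calls would go here ...

  // endQuda() tears the library down again, which includes quda::device::destroy().
  endQuda();
  return 0;
}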