1 #include <cuda_runtime.h>
2 #include <cuda_profiler_api.h>
23 static bool initialized =
false;
27 if (initialized)
return;
31 cudaDriverGetVersion(&driver_version);
32 printfQuda(
"CUDA Driver version = %d\n", driver_version);
35 cudaRuntimeGetVersion(&runtime_version);
36 printfQuda(
"CUDA Runtime version = %d\n", runtime_version);
39 nvmlReturn_t result = nvmlInit();
40 if (NVML_SUCCESS != result)
errorQuda(
"NVML Init failed with error %d", result);
42 char graphics_version[
length];
43 result = nvmlSystemGetDriverVersion(graphics_version,
length);
44 if (NVML_SUCCESS != result)
errorQuda(
"nvmlSystemGetDriverVersion failed with error %d", result);
45 printfQuda(
"Graphic driver version = %s\n", graphics_version);
46 result = nvmlShutdown();
47 if (NVML_SUCCESS != result)
errorQuda(
"NVML Shutdown failed with error %d", result);
51 cudaGetDeviceCount(&deviceCount);
52 if (deviceCount == 0) {
errorQuda(
"No CUDA devices found"); }
54 for (
int i = 0; i < deviceCount; i++) {
71 const int my_major = __COMPUTE_CAPABILITY__ / 100;
72 const int my_minor = (__COMPUTE_CAPABILITY__ - my_major * 100) / 10;
75 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. ** \n --- "
76 "Please set the correct QUDA_GPU_ARCH when running cmake.\n",
81 char *allow_jit_env = getenv(
"QUDA_ALLOW_JIT");
82 if (allow_jit_env && strcmp(allow_jit_env,
"1") == 0) {
84 warningQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- "
85 "Jitting the PTX since QUDA_ALLOW_JIT=1 was set. Note that this will take some time.\n",
88 errorQuda(
"** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n --- "
89 "Please set the correct QUDA_GPU_ARCH when running cmake.\n If you want the PTX to be jitted for "
90 "your current GPU arch please set the enviroment variable QUDA_ALLOW_JIT=1.",
97 "** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might "
98 "result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
109 char *enable_numa_env = getenv(
"QUDA_ENABLE_NUMA");
110 if (enable_numa_env && strcmp(enable_numa_env,
"0") == 0) {
117 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
126 cudaGetDeviceCount(&dev_count);
128 for (device = 0; device < dev_count; device++) {
133 printfQuda(
"%d - totalGlobalMem: %lu bytes ( %.2f Gbytes)\n", device,
deviceProp.totalGlobalMem,
134 deviceProp.totalGlobalMem / (
float)(1024 * 1024 * 1024));
135 printfQuda(
"%d - sharedMemPerBlock: %lu bytes ( %.2f Kbytes)\n", device,
deviceProp.sharedMemPerBlock,
152 printfQuda(
"%d - kernelExecTimeoutEnabled %s\n", device,
153 (
deviceProp.kernelExecTimeoutEnabled ?
"true" :
"false"));
155 printfQuda(
"%d - canMapHostMemory %s\n", device, (
deviceProp.canMapHostMemory ?
"true" :
"false"));
157 case 0:
printfQuda(
"%d - computeMode 0: cudaComputeModeDefault\n", device);
break;
158 case 1:
printfQuda(
"%d - computeMode 1: cudaComputeModeExclusive\n", device);
break;
159 case 2:
printfQuda(
"%d - computeMode 2: cudaComputeModeProhibited\n", device);
break;
160 case 3:
printfQuda(
"%d - computeMode 3: cudaComputeModeExclusiveProcess\n", device);
break;
161 default:
errorQuda(
"Unknown deviceProp.computeMode.");
165 printfQuda(
"%d - concurrentKernels %s\n", device, (
deviceProp.concurrentKernels ?
"true" :
"false"));
172 case 0:
printfQuda(
"%d - asyncEngineCount 1: host -> device only\n", device);
break;
173 case 1:
printfQuda(
"%d - asyncEngineCount 2: host <-> device\n", device);
break;
174 case 2:
printfQuda(
"%d - asyncEngineCount 0: not supported\n", device);
break;
175 default:
errorQuda(
"Unknown deviceProp.asyncEngineCount.");
177 printfQuda(
"%d - unifiedAddressing %s\n", device, (
deviceProp.unifiedAddressing ?
"true" :
"false"));
181 printfQuda(
"%d - maxThreadsPerMultiProcessor %d\n\n", device,
deviceProp.maxThreadsPerMultiProcessor);
189 int greatestPriority;
191 cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
192 for (
int i = 0; i <
Nstream - 1; i++) {
193 cudaStreamCreateWithPriority(&
streams[i], cudaStreamDefault, greatestPriority);
195 cudaStreamCreateWithPriority(&
streams[
Nstream - 1], cudaStreamDefault, leastPriority);
208 char *device_reset_env = getenv(
"QUDA_DEVICE_RESET");
209 if (device_reset_env && strcmp(device_reset_env,
"1") == 0) {
217 static int max_shared_bytes = 0;
218 if (!max_shared_bytes)
219 cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin,
comm_gpuid());
220 return max_shared_bytes;
/**
   @brief Start profiling: enables collection of profiling data by the
   CUDA profiler from this point onward (see cudaProfilerStart()).
 */
void start()
{
  cudaProfilerStart();
}
/**
   @brief Stop profiling: disables collection of profiling data by the
   CUDA profiler from this point onward (see cudaProfilerStop()).
 */
void stop()
{
  cudaProfilerStop();
}
cudaDeviceProp deviceProp
void stop()
Stop profiling.
void start()
Start profiling.
void create_context()
Create the streams associated with parallel execution.
void print_device_properties()
Query and print to stdout device properties of all GPUs.
void init(int dev)
Create the device context. Called by initQuda when initializing the library.
size_t max_dynamic_shared_memory()
Returns the maximum dynamic shared memory per block.
void destroy()
Free any persistent context state. Called by endQuda when tearing down the library.
int setNumaAffinityNVML(int deviceid)
cudaStream_t qudaStream_t
#define checkCudaErrorNoSync()
QudaVerbosity getVerbosity()