1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" 17 18 #if !defined(PLATFORM_WINDOWS) 19 #include <dirent.h> 20 #endif 21 22 #include <limits.h> 23 #include <stddef.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #ifdef __APPLE__ 28 #include <IOKit/kext/KextManager.h> 29 #include <mach-o/dyld.h> 30 #else 31 #if !defined(PLATFORM_WINDOWS) 32 #include <link.h> 33 #include <sys/sysmacros.h> 34 #include <unistd.h> 35 #endif 36 #include <sys/stat.h> 37 #endif 38 #include <algorithm> 39 #include <memory> 40 #include <vector> 41 42 #include "tensorflow/stream_executor/lib/process_state.h" 43 #include "tensorflow/stream_executor/lib/error.h" 44 #include "tensorflow/stream_executor/lib/status.h" 45 #include "tensorflow/stream_executor/lib/str_util.h" 46 #include "tensorflow/stream_executor/lib/strcat.h" 47 #include "tensorflow/stream_executor/lib/stringpiece.h" 48 #include "tensorflow/stream_executor/lib/stringprintf.h" 49 #include "tensorflow/stream_executor/platform/logging.h" 50 #include "tensorflow/stream_executor/lib/numbers.h" 51 #include "tensorflow/stream_executor/lib/str_util.h" 52 #include "tensorflow/stream_executor/lib/inlined_vector.h" 53 54 namespace perftools { 55 namespace gputools { 56 namespace cuda { 57 58 #ifdef __APPLE__ 59 static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA"); 60 #elif !defined(PLATFORM_WINDOWS) 61 static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; 62 #endif 63 64 65 string DriverVersionToString(DriverVersion version) { 66 return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version)); 67 } 68 69 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) { 70 if (!version.ok()) { 71 return version.status().ToString(); 72 } 73 74 return DriverVersionToString(version.ValueOrDie()); 75 } 76 77 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) { 78 std::vector<string> pieces = port::Split(value, '.'); 79 if (pieces.size() < 2 || pieces.size() > 4) { 80 return port::Status{ 81 port::error::INVALID_ARGUMENT, 82 port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form for driver version; got \"%s\"", 83 value.c_str())}; 84 } 85 86 int major; 87 int minor; 88 int patch = 0; 89 if (!port::safe_strto32(pieces[0], &major)) { 90 return port::Status{ 91 port::error::INVALID_ARGUMENT, 92 port::Printf("could not parse major version number \"%s\" as an " 93 "integer from string \"%s\"", 94 pieces[0].c_str(), value.c_str())}; 95 } 96 if (!port::safe_strto32(pieces[1], &minor)) { 97 return port::Status{ 98 port::error::INVALID_ARGUMENT, 99 port::Printf("could not parse minor version number \"%s\" as an " 100 "integer from string \"%s\"", 101 pieces[1].c_str(), value.c_str())}; 102 } 103 if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) { 104 return port::Status{ 105 port::error::INVALID_ARGUMENT, 106 port::Printf("could not parse patch version number \"%s\" as an " 107 "integer from string \"%s\"", 108 pieces[2].c_str(), value.c_str())}; 109 } 110 111 DriverVersion result{major, minor, patch}; 112 VLOG(2) << "version string \"" << value << "\" made value " 113 << DriverVersionToString(result); 114 return result; 115 } 116 117 // -- class Diagnostician 118 119 string Diagnostician::GetDevNodePath(int dev_node_ordinal) { 120 return port::StrCat("/dev/nvidia", dev_node_ordinal); 121 } 122 123 void Diagnostician::LogDiagnosticInformation() { 124 #ifdef __APPLE__ 125 CFStringRef kext_ids[1]; 126 kext_ids[0] = kDriverKextIdentifier; 127 CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks); 128 CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr); 129 CFRelease(kext_id_query); 130 131 CFDictionaryRef cuda_driver_info = nullptr; 132 if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) { 133 bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(cuda_driver_info, CFSTR("OSBundleStarted"))); 134 if (!started) { 135 LOG(INFO) << "kernel driver is installed, but does not appear to be running on this host " 136 << "(" << port::Hostname() << ")"; 137 } 138 } else { 139 LOG(INFO) << "kernel driver does not appear to be installed on this host " 140 << "(" << port::Hostname() << ")"; 141 } 142 CFRelease(kext_infos); 143 #elif !defined(PLATFORM_WINDOWS) 144 if (access(kDriverVersionPath, F_OK) != 0) { 145 LOG(INFO) << "kernel driver does not appear to be running on this host " 146 << "(" << port::Hostname() << "): " 147 << "/proc/driver/nvidia/version does not exist"; 148 return; 149 } 150 auto dev0_path = GetDevNodePath(0); 151 if (access(dev0_path.c_str(), F_OK) != 0) { 152 LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path 153 << " does not exist"; 154 return; 155 } 156 #endif 157 158 LOG(INFO) << "retrieving CUDA diagnostic information for host: " 159 << port::Hostname(); 160 161 LogDriverVersionInformation(); 162 } 163 164 /* static */ void Diagnostician::LogDriverVersionInformation() { 165 LOG(INFO) << "hostname: " << port::Hostname(); 166 #ifndef PLATFORM_WINDOWS 167 if (VLOG_IS_ON(1)) { 168 const char *value = getenv("LD_LIBRARY_PATH"); 169 string library_path = value == nullptr ? "" : value; 170 VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\""; 171 172 std::vector<string> pieces = port::Split(library_path, ':'); 173 for (const auto &piece : pieces) { 174 if (piece.empty()) { 175 continue; 176 } 177 DIR *dir = opendir(piece.c_str()); 178 if (dir == nullptr) { 179 VLOG(1) << "could not open \"" << piece << "\""; 180 continue; 181 } 182 while (dirent *entity = readdir(dir)) { 183 VLOG(1) << piece << " :: " << entity->d_name; 184 } 185 closedir(dir); 186 } 187 } 188 port::StatusOr<DriverVersion> dso_version = FindDsoVersion(); 189 LOG(INFO) << "libcuda reported version is: " 190 << DriverVersionStatusToString(dso_version); 191 192 port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion(); 193 LOG(INFO) << "kernel reported version is: " 194 << DriverVersionStatusToString(kernel_version); 195 #endif 196 197 // OS X kernel driver does not report version accurately 198 #if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS) 199 if (kernel_version.ok() && dso_version.ok()) { 200 WarnOnDsoKernelMismatch(dso_version, kernel_version); 201 } 202 #endif 203 } 204 205 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the 206 // driver-interfacing DSO version number. Returns it as a string. 207 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() { 208 port::StatusOr<DriverVersion> result{port::Status{ 209 port::error::NOT_FOUND, 210 "was unable to find libcuda.so DSO loaded into this program"}}; 211 212 #if defined(__APPLE__) 213 // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib 214 const string prefix("libcuda_"); 215 const string suffix("_mercury.dylib"); 216 for (uint32_t image_index = 0; image_index < _dyld_image_count(); ++image_index) { 217 const string path(_dyld_get_image_name(image_index)); 218 const size_t suffix_pos = path.rfind(suffix); 219 const size_t prefix_pos = path.rfind(prefix, suffix_pos); 220 if (prefix_pos == string::npos || 221 suffix_pos == string::npos) { 222 // no match 223 continue; 224 } 225 const size_t start = prefix_pos + prefix.size(); 226 if (start >= suffix_pos) { 227 // version not included 228 continue; 229 } 230 const size_t length = suffix_pos - start; 231 const string version = path.substr(start, length); 232 result = StringToDriverVersion(version); 233 } 234 #else 235 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA) 236 // Callback used when iterating through DSOs. Looks for the driver-interfacing 237 // DSO and yields its version number into the callback data, when found. 238 auto iterate_phdr = 239 [](struct dl_phdr_info *info, size_t size, void *data) -> int { 240 if (strstr(info->dlpi_name, "libcuda.so.1")) { 241 VLOG(1) << "found DLL info with name: " << info->dlpi_name; 242 char resolved_path[PATH_MAX] = {0}; 243 if (realpath(info->dlpi_name, resolved_path) == nullptr) { 244 return 0; 245 } 246 VLOG(1) << "found DLL info with resolved path: " << resolved_path; 247 const char *slash = rindex(resolved_path, '/'); 248 if (slash == nullptr) { 249 return 0; 250 } 251 const char *so_suffix = ".so."; 252 const char *dot = strstr(slash, so_suffix); 253 if (dot == nullptr) { 254 return 0; 255 } 256 string dso_version = dot + strlen(so_suffix); 257 // TODO(b/22689637): Eliminate the explicit namespace if possible. 258 auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64"); 259 auto result = static_cast<port::StatusOr<DriverVersion> *>(data); 260 *result = StringToDriverVersion(stripped_dso_version); 261 return 1; 262 } 263 return 0; 264 }; 265 266 dl_iterate_phdr(iterate_phdr, &result); 267 #endif 268 #endif 269 270 return result; 271 } 272 273 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion( 274 const string &driver_version_file_contents) { 275 static const char *kDriverFilePrelude = "Kernel Module "; 276 size_t offset = driver_version_file_contents.find(kDriverFilePrelude); 277 if (offset == string::npos) { 278 return port::Status{ 279 port::error::NOT_FOUND, 280 port::StrCat("could not find kernel module information in " 281 "driver version file contents: \"", 282 driver_version_file_contents, "\"")}; 283 } 284 285 string version_and_rest = driver_version_file_contents.substr( 286 offset + strlen(kDriverFilePrelude), string::npos); 287 size_t space_index = version_and_rest.find(" "); 288 auto kernel_version = version_and_rest.substr(0, space_index); 289 // TODO(b/22689637): Eliminate the explicit namespace if possible. 290 auto stripped_kernel_version = 291 port::StripSuffixString(kernel_version, ".ld64"); 292 return StringToDriverVersion(stripped_kernel_version); 293 } 294 295 void Diagnostician::WarnOnDsoKernelMismatch( 296 port::StatusOr<DriverVersion> dso_version, 297 port::StatusOr<DriverVersion> kernel_version) { 298 if (kernel_version.ok() && dso_version.ok() && 299 dso_version.ValueOrDie() == kernel_version.ValueOrDie()) { 300 LOG(INFO) << "kernel version seems to match DSO: " 301 << DriverVersionToString(kernel_version.ValueOrDie()); 302 } else { 303 LOG(ERROR) << "kernel version " 304 << DriverVersionStatusToString(kernel_version) 305 << " does not match DSO version " 306 << DriverVersionStatusToString(dso_version) 307 << " -- cannot find working devices in this configuration"; 308 } 309 } 310 311 312 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() { 313 #if defined(__APPLE__) 314 CFStringRef kext_ids[1]; 315 kext_ids[0] = kDriverKextIdentifier; 316 CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks); 317 CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr); 318 CFRelease(kext_id_query); 319 320 CFDictionaryRef cuda_driver_info = nullptr; 321 if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) { 322 // NOTE: OSX CUDA driver does not currently store the same driver version 323 // in kCFBundleVersionKey as is returned by cuDriverGetVersion 324 CFRelease(kext_infos); 325 const CFStringRef str = (CFStringRef)CFDictionaryGetValue( 326 cuda_driver_info, kCFBundleVersionKey); 327 const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8); 328 329 // version can be NULL in which case treat it as empty string 330 // see 331 // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112 332 if (version == NULL) { 333 return StringToDriverVersion(""); 334 } 335 return StringToDriverVersion(version); 336 } 337 CFRelease(kext_infos); 338 auto status = 339 port::Status{port::error::INTERNAL, 340 port::StrCat("failed to read driver bundle version: ", 341 CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)) 342 }; 343 return status; 344 #elif defined(PLATFORM_WINDOWS) 345 auto status = 346 port::Status{port::error::UNIMPLEMENTED, 347 "kernel reported driver version not implemented on Windows" 348 }; 349 return status; 350 #else 351 FILE *driver_version_file = fopen(kDriverVersionPath, "r"); 352 if (driver_version_file == nullptr) { 353 return port::Status{ 354 port::error::PERMISSION_DENIED, 355 port::StrCat("could not open driver version path for reading: ", 356 kDriverVersionPath)}; 357 } 358 359 static const int kContentsSize = 1024; 360 port::InlinedVector<char, 4> contents(kContentsSize); 361 size_t retcode = 362 fread(contents.begin(), 1, kContentsSize - 2, driver_version_file); 363 if (retcode < kContentsSize - 1) { 364 contents[retcode] = '\0'; 365 } 366 contents[kContentsSize - 1] = '\0'; 367 368 if (retcode != 0) { 369 VLOG(1) << "driver version file contents: \"\"\"" << contents.begin() 370 << "\"\"\""; 371 fclose(driver_version_file); 372 return FindKernelModuleVersion(contents.begin()); 373 } 374 375 auto status = 376 port::Status{port::error::INTERNAL, 377 port::StrCat("failed to read driver version file contents: ", 378 kDriverVersionPath, "; ferror: ", 379 ferror(driver_version_file))}; 380 fclose(driver_version_file); 381 return status; 382 #endif 383 } 384 385 386 } // namespace cuda 387 } // namespace gputools 388 } // namespace perftools 389