Home | History | Annotate | Download | only in cuda
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
     17 
     18 #if !defined(PLATFORM_WINDOWS)
     19 #include <dirent.h>
     20 #endif
     21 
     22 #include <limits.h>
     23 #include <stddef.h>
     24 #include <stdio.h>
     25 #include <stdlib.h>
     26 #include <string.h>
     27 #ifdef __APPLE__
     28 #include <IOKit/kext/KextManager.h>
     29 #include <mach-o/dyld.h>
     30 #else
     31 #if !defined(PLATFORM_WINDOWS)
     32 #include <link.h>
     33 #include <sys/sysmacros.h>
     34 #include <unistd.h>
     35 #endif
     36 #include <sys/stat.h>
     37 #endif
     38 #include <algorithm>
     39 #include <memory>
     40 #include <vector>
     41 
     42 #include "tensorflow/stream_executor/lib/process_state.h"
     43 #include "tensorflow/stream_executor/lib/error.h"
     44 #include "tensorflow/stream_executor/lib/status.h"
     45 #include "tensorflow/stream_executor/lib/str_util.h"
     46 #include "tensorflow/stream_executor/lib/strcat.h"
     47 #include "tensorflow/stream_executor/lib/stringpiece.h"
     48 #include "tensorflow/stream_executor/lib/stringprintf.h"
     49 #include "tensorflow/stream_executor/platform/logging.h"
     50 #include "tensorflow/stream_executor/lib/numbers.h"
     51 #include "tensorflow/stream_executor/lib/str_util.h"
     52 #include "tensorflow/stream_executor/lib/inlined_vector.h"
     53 
     54 namespace perftools {
     55 namespace gputools {
     56 namespace cuda {
     57 
     // Platform-specific handle used to locate the NVIDIA kernel driver:
     //  - macOS: the bundle identifier of the CUDA kernel extension.
     //  - other non-Windows platforms: the procfs file describing the driver.
     #if defined(__APPLE__)
     static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
     #elif !defined(PLATFORM_WINDOWS)
     static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
     #endif
     63 
     64 
     65 string DriverVersionToString(DriverVersion version) {
     66   return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
     67 }
     68 
     69 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
     70   if (!version.ok()) {
     71     return version.status().ToString();
     72   }
     73 
     74   return DriverVersionToString(version.ValueOrDie());
     75 }
     76 
     77 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
     78   std::vector<string> pieces = port::Split(value, '.');
     79   if (pieces.size() < 2 || pieces.size() > 4) {
     80     return port::Status{
     81         port::error::INVALID_ARGUMENT,
     82         port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form for driver version; got \"%s\"",
     83                      value.c_str())};
     84   }
     85 
     86   int major;
     87   int minor;
     88   int patch = 0;
     89   if (!port::safe_strto32(pieces[0], &major)) {
     90     return port::Status{
     91         port::error::INVALID_ARGUMENT,
     92         port::Printf("could not parse major version number \"%s\" as an "
     93                      "integer from string \"%s\"",
     94                      pieces[0].c_str(), value.c_str())};
     95   }
     96   if (!port::safe_strto32(pieces[1], &minor)) {
     97     return port::Status{
     98         port::error::INVALID_ARGUMENT,
     99         port::Printf("could not parse minor version number \"%s\" as an "
    100                      "integer from string \"%s\"",
    101                      pieces[1].c_str(), value.c_str())};
    102   }
    103   if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
    104     return port::Status{
    105       port::error::INVALID_ARGUMENT,
    106       port::Printf("could not parse patch version number \"%s\" as an "
    107                      "integer from string \"%s\"",
    108                    pieces[2].c_str(), value.c_str())};
    109   }
    110 
    111   DriverVersion result{major, minor, patch};
    112   VLOG(2) << "version string \"" << value << "\" made value "
    113           << DriverVersionToString(result);
    114   return result;
    115 }
    116 
    117 // -- class Diagnostician
    118 
    119 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
    120   return port::StrCat("/dev/nvidia", dev_node_ordinal);
    121 }
    122 
    123 void Diagnostician::LogDiagnosticInformation() {
    124 #ifdef __APPLE__
    125   CFStringRef kext_ids[1];
    126   kext_ids[0] = kDriverKextIdentifier;
    127   CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
    128   CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
    129   CFRelease(kext_id_query);
    130 
    131   CFDictionaryRef cuda_driver_info = nullptr;
    132   if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
    133     bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(cuda_driver_info, CFSTR("OSBundleStarted")));
    134     if (!started) {
    135       LOG(INFO) << "kernel driver is installed, but does not appear to be running on this host "
    136                 << "(" << port::Hostname() << ")";
    137     }
    138   } else {
    139     LOG(INFO) << "kernel driver does not appear to be installed on this host "
    140               << "(" << port::Hostname() << ")";
    141   }
    142   CFRelease(kext_infos);
    143 #elif !defined(PLATFORM_WINDOWS)
    144   if (access(kDriverVersionPath, F_OK) != 0) {
    145     LOG(INFO) << "kernel driver does not appear to be running on this host "
    146               << "(" << port::Hostname() << "): "
    147               << "/proc/driver/nvidia/version does not exist";
    148     return;
    149   }
    150   auto dev0_path = GetDevNodePath(0);
    151   if (access(dev0_path.c_str(), F_OK) != 0) {
    152     LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path
    153               << " does not exist";
    154     return;
    155   }
    156 #endif
    157 
    158   LOG(INFO) << "retrieving CUDA diagnostic information for host: "
    159             << port::Hostname();
    160 
    161   LogDriverVersionInformation();
    162 }
    163 
    164 /* static */ void Diagnostician::LogDriverVersionInformation() {
    165   LOG(INFO) << "hostname: " << port::Hostname();
    166 #ifndef PLATFORM_WINDOWS
    167   if (VLOG_IS_ON(1)) {
    168     const char *value = getenv("LD_LIBRARY_PATH");
    169     string library_path = value == nullptr ? "" : value;
    170     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
    171 
    172     std::vector<string> pieces = port::Split(library_path, ':');
    173     for (const auto &piece : pieces) {
    174       if (piece.empty()) {
    175         continue;
    176       }
    177       DIR *dir = opendir(piece.c_str());
    178       if (dir == nullptr) {
    179         VLOG(1) << "could not open \"" << piece << "\"";
    180         continue;
    181       }
    182       while (dirent *entity = readdir(dir)) {
    183         VLOG(1) << piece << " :: " << entity->d_name;
    184       }
    185       closedir(dir);
    186     }
    187   }
    188   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
    189   LOG(INFO) << "libcuda reported version is: "
    190             << DriverVersionStatusToString(dso_version);
    191 
    192   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
    193   LOG(INFO) << "kernel reported version is: "
    194 	  << DriverVersionStatusToString(kernel_version);
    195 #endif
    196 
    197   // OS X kernel driver does not report version accurately
    198 #if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
    199   if (kernel_version.ok() && dso_version.ok()) {
    200     WarnOnDsoKernelMismatch(dso_version, kernel_version);
    201   }
    202 #endif
    203 }
    204 
    205 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
    206 // driver-interfacing DSO version number. Returns it as a string.
    207 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
    208   port::StatusOr<DriverVersion> result{port::Status{
    209       port::error::NOT_FOUND,
    210       "was unable to find libcuda.so DSO loaded into this program"}};
    211 
    212 #if defined(__APPLE__)
    213     // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
    214     const string prefix("libcuda_");
    215     const string suffix("_mercury.dylib");
    216     for (uint32_t image_index = 0; image_index < _dyld_image_count(); ++image_index) {
    217       const string path(_dyld_get_image_name(image_index));
    218       const size_t suffix_pos = path.rfind(suffix);
    219       const size_t prefix_pos = path.rfind(prefix, suffix_pos);
    220       if (prefix_pos == string::npos ||
    221           suffix_pos == string::npos) {
    222         // no match
    223         continue;
    224       }
    225       const size_t start = prefix_pos + prefix.size();
    226       if (start >= suffix_pos) {
    227         // version not included
    228         continue;
    229       }
    230       const size_t length = suffix_pos - start;
    231       const string version = path.substr(start, length);
    232       result = StringToDriverVersion(version);
    233     }
    234 #else
    235 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
    236   // Callback used when iterating through DSOs. Looks for the driver-interfacing
    237   // DSO and yields its version number into the callback data, when found.
    238   auto iterate_phdr =
    239       [](struct dl_phdr_info *info, size_t size, void *data) -> int {
    240     if (strstr(info->dlpi_name, "libcuda.so.1")) {
    241       VLOG(1) << "found DLL info with name: " << info->dlpi_name;
    242       char resolved_path[PATH_MAX] = {0};
    243       if (realpath(info->dlpi_name, resolved_path) == nullptr) {
    244         return 0;
    245       }
    246       VLOG(1) << "found DLL info with resolved path: " << resolved_path;
    247       const char *slash = rindex(resolved_path, '/');
    248       if (slash == nullptr) {
    249         return 0;
    250       }
    251       const char *so_suffix = ".so.";
    252       const char *dot = strstr(slash, so_suffix);
    253       if (dot == nullptr) {
    254         return 0;
    255       }
    256       string dso_version = dot + strlen(so_suffix);
    257       // TODO(b/22689637): Eliminate the explicit namespace if possible.
    258       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
    259       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
    260       *result = StringToDriverVersion(stripped_dso_version);
    261       return 1;
    262     }
    263     return 0;
    264   };
    265 
    266   dl_iterate_phdr(iterate_phdr, &result);
    267 #endif
    268 #endif
    269 
    270   return result;
    271 }
    272 
    273 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
    274     const string &driver_version_file_contents) {
    275   static const char *kDriverFilePrelude = "Kernel Module  ";
    276   size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
    277   if (offset == string::npos) {
    278     return port::Status{
    279         port::error::NOT_FOUND,
    280         port::StrCat("could not find kernel module information in "
    281                      "driver version file contents: \"",
    282                      driver_version_file_contents, "\"")};
    283   }
    284 
    285   string version_and_rest = driver_version_file_contents.substr(
    286       offset + strlen(kDriverFilePrelude), string::npos);
    287   size_t space_index = version_and_rest.find(" ");
    288   auto kernel_version = version_and_rest.substr(0, space_index);
    289   // TODO(b/22689637): Eliminate the explicit namespace if possible.
    290   auto stripped_kernel_version =
    291       port::StripSuffixString(kernel_version, ".ld64");
    292   return StringToDriverVersion(stripped_kernel_version);
    293 }
    294 
    295 void Diagnostician::WarnOnDsoKernelMismatch(
    296     port::StatusOr<DriverVersion> dso_version,
    297     port::StatusOr<DriverVersion> kernel_version) {
    298   if (kernel_version.ok() && dso_version.ok() &&
    299       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
    300     LOG(INFO) << "kernel version seems to match DSO: "
    301               << DriverVersionToString(kernel_version.ValueOrDie());
    302   } else {
    303     LOG(ERROR) << "kernel version "
    304                << DriverVersionStatusToString(kernel_version)
    305                << " does not match DSO version "
    306                << DriverVersionStatusToString(dso_version)
    307                << " -- cannot find working devices in this configuration";
    308   }
    309 }
    310 
    311 
    312 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
    313 #if defined(__APPLE__)
    314   CFStringRef kext_ids[1];
    315   kext_ids[0] = kDriverKextIdentifier;
    316   CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
    317   CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
    318   CFRelease(kext_id_query);
    319 
    320   CFDictionaryRef cuda_driver_info = nullptr;
    321   if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
    322     // NOTE: OSX CUDA driver does not currently store the same driver version
    323     // in kCFBundleVersionKey as is returned by cuDriverGetVersion
    324     CFRelease(kext_infos);
    325     const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
    326         cuda_driver_info, kCFBundleVersionKey);
    327     const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
    328 
    329     // version can be NULL in which case treat it as empty string
    330     // see
    331     // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
    332     if (version == NULL) {
    333       return StringToDriverVersion("");
    334     }
    335     return StringToDriverVersion(version);
    336   }
    337   CFRelease(kext_infos);
    338   auto status =
    339     port::Status{port::error::INTERNAL,
    340                  port::StrCat("failed to read driver bundle version: ",
    341                               CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8))
    342     };
    343   return status;
    344 #elif defined(PLATFORM_WINDOWS)
    345   auto status =
    346     port::Status{port::error::UNIMPLEMENTED,
    347                  "kernel reported driver version not implemented on Windows"
    348     };
    349   return status;
    350 #else
    351   FILE *driver_version_file = fopen(kDriverVersionPath, "r");
    352   if (driver_version_file == nullptr) {
    353     return port::Status{
    354         port::error::PERMISSION_DENIED,
    355         port::StrCat("could not open driver version path for reading: ",
    356                      kDriverVersionPath)};
    357   }
    358 
    359   static const int kContentsSize = 1024;
    360   port::InlinedVector<char, 4> contents(kContentsSize);
    361   size_t retcode =
    362       fread(contents.begin(), 1, kContentsSize - 2, driver_version_file);
    363   if (retcode < kContentsSize - 1) {
    364     contents[retcode] = '\0';
    365   }
    366   contents[kContentsSize - 1] = '\0';
    367 
    368   if (retcode != 0) {
    369     VLOG(1) << "driver version file contents: \"\"\"" << contents.begin()
    370             << "\"\"\"";
    371     fclose(driver_version_file);
    372     return FindKernelModuleVersion(contents.begin());
    373   }
    374 
    375   auto status =
    376       port::Status{port::error::INTERNAL,
    377                    port::StrCat("failed to read driver version file contents: ",
    378                                 kDriverVersionPath, "; ferror: ",
    379                                 ferror(driver_version_file))};
    380   fclose(driver_version_file);
    381   return status;
    382 #endif
    383 }
    384 
    385 
    386 }  // namespace cuda
    387 }  // namespace gputools
    388 }  // namespace perftools
    389