Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/stream_executor/device_description.h"
     17 
     18 #include <algorithm>
     19 
     20 #include "absl/strings/str_cat.h"
     21 #include "tensorflow/stream_executor/lib/human_readable.h"
     22 #include "tensorflow/stream_executor/lib/mathutil.h"
     23 
     24 namespace stream_executor {
     25 
// Sentinel that the default constructor below writes into the limit and size
// fields; -1ULL is the all-bits-set unsigned long long value.
static const uint64 kUninitializedUint64 = -1ULL;
// Placeholder text that the default constructor below assigns to every string
// field.
/* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
     28 
// Builds a description in which nothing has been populated yet: every string
// field is kUndefinedString, the limit/size fields are initialized from
// kUninitializedUint64, the remaining numeric fields are -1, and ECC is
// recorded as disabled.
DeviceDescription::DeviceDescription()
    // Identification strings.
    : device_vendor_(kUndefinedString),
      platform_version_(kUndefinedString),
      driver_version_(kUndefinedString),
      runtime_version_(kUndefinedString),
      pci_bus_id_(kUndefinedString),
      name_(kUndefinedString),
      // Per-dimension (x, y, z) launch limits.
      thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
      block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                       kUninitializedUint64),
      // Scalar limits and sizes, all initialized from the same sentinel.
      blocks_per_core_limit_(kUninitializedUint64),
      threads_per_core_limit_(kUninitializedUint64),
      threads_per_block_limit_(kUninitializedUint64),
      threads_per_warp_(kUninitializedUint64),
      registers_per_core_limit_(kUninitializedUint64),
      registers_per_block_limit_(kUninitializedUint64),
      device_address_bits_(kUninitializedUint64),
      device_memory_size_(kUninitializedUint64),
      memory_bandwidth_(kUninitializedUint64),
      shared_memory_per_core_(kUninitializedUint64),
      shared_memory_per_block_(kUninitializedUint64),
      // Remaining numeric fields use -1 as their "not set" value.
      clock_rate_ghz_(-1.0),
      cuda_compute_capability_major_(-1),
      cuda_compute_capability_minor_(-1),
      rocm_amdgpu_isa_version_(-1),
      numa_node_(-1),
      core_count_(-1),
      ecc_enabled_(false) {}
     58 
     59 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
     60   std::unique_ptr<std::map<string, string>> owned_result{
     61       new std::map<string, string>};
     62   std::map<string, string> &result = *owned_result;
     63   result["Device Vendor"] = device_vendor();
     64   result["Platform Version"] = platform_version();
     65   result["Driver Version"] = driver_version();
     66   result["Runtime Version"] = runtime_version();
     67   result["PCI bus ID"] = pci_bus_id_;
     68   result["Device Name"] = name_;
     69 
     70   const ThreadDim &thread_dim = thread_dim_limit();
     71   result["ThreadDim Limit"] =
     72       absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
     73   const BlockDim &block_dim = block_dim_limit();
     74   result["BlockDim Limit"] =
     75       absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
     76 
     77   result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
     78   result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
     79   result["Registers Per Block Limit"] =
     80       absl::StrCat(registers_per_block_limit());
     81 
     82   result["Device Address Bits"] = absl::StrCat(device_address_bits());
     83   result["Device Memory Size"] =
     84       port::HumanReadableNumBytes::ToString(device_memory_size());
     85   result["Memory Bandwidth"] = absl::StrCat(
     86       port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
     87 
     88   result["Shared Memory Per Core"] =
     89       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
     90   result["Shared Memory Per Block"] =
     91       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
     92 
     93   result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
     94 
     95   result["CUDA Compute Capability"] = absl::StrCat(
     96       cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
     97 
     98   result["NUMA Node"] = absl::StrCat(numa_node());
     99   result["Core Count"] = absl::StrCat(core_count());
    100   result["ECC Enabled"] = absl::StrCat(ecc_enabled());
    101   return owned_result;
    102 }
    103 
    104 namespace internal {
    105 
    106 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
    107     : device_description_(new DeviceDescription) {}
    108 
    109 }  // namespace internal
    110 
    111 bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
    112   *major = cuda_compute_capability_major_;
    113   *minor = cuda_compute_capability_minor_;
    114   return cuda_compute_capability_major_ != 0;
    115 }
    116 
    117 bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
    118   bool status = false;
    119   if (rocm_amdgpu_isa_version_ > 0) {
    120     *version = rocm_amdgpu_isa_version_;
    121     status = true;
    122   }
    123   return status;
    124 }
    125 
    126 bool ThreadDimOk(const DeviceDescription &device_description,
    127                  const ThreadDim &thread_dim) {
    128   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
    129   auto threads_per_block_limit = device_description.threads_per_block_limit();
    130   if (total_threads > threads_per_block_limit) {
    131     VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
    132             << " vs limit " << threads_per_block_limit;
    133     return false;
    134   }
    135 
    136   const auto &limit = device_description.thread_dim_limit();
    137   bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
    138             thread_dim.z <= limit.z;
    139   if (!ok) {
    140     VLOG(2) << "thread dim " << thread_dim.ToString()
    141             << " exceeds limit contraints of " << limit.ToString();
    142   }
    143   return ok;
    144 }
    145 
    146 uint64 DivideCeil(uint64 x, uint64 y) {
    147   return port::MathUtil::CeilOfRatio(x, y);
    148 }
    149 
    150 void CalculateDimensionality(const DeviceDescription &device_description,
    151                              int64 element_count, int64 *threads_per_block,
    152                              int64 *block_count) {
    153   *threads_per_block = device_description.threads_per_block_limit();
    154   *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
    155   if (*block_count == 1) {
    156     CHECK_LE(element_count, *threads_per_block);
    157     *threads_per_block = element_count;
    158   }
    159 }
    160 
    161 }  // namespace stream_executor
    162