// Source listing: tensorflow/stream_executor/device_description.cc
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/stream_executor/device_description.h"
     17 
     18 #include <algorithm>
     19 
     20 #include "tensorflow/stream_executor/lib/human_readable.h"
     21 #include "tensorflow/stream_executor/lib/mathutil.h"
     22 #include "tensorflow/stream_executor/lib/strcat.h"
     23 
     24 namespace perftools {
     25 namespace gputools {
     26 
// Sentinel for numeric fields that have not been populated yet: the all-ones
// bit pattern (uint64 max), which no real device limit takes.
static const uint64 kUninitializedUint64 = -1ULL;
// Sentinel for string-valued fields that have not been populated yet.
/* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
     29 
// Default-constructs a description with every field set to its "unknown"
// sentinel: kUndefinedString for strings, kUninitializedUint64 for dimension
// and resource limits, -1 for the signed scalars, and false for ECC.
// The allocation granularities default to 1, i.e. "no rounding" when used
// with RoundUp/RoundDown below.
DeviceDescription::DeviceDescription()
    : device_vendor_(kUndefinedString),
      platform_version_(kUndefinedString),
      driver_version_(kUndefinedString),
      runtime_version_(kUndefinedString),
      pci_bus_id_(kUndefinedString),
      name_(kUndefinedString),
      thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
      block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                       kUninitializedUint64),
      blocks_per_core_limit_(kUninitializedUint64),
      threads_per_core_limit_(kUninitializedUint64),
      threads_per_block_limit_(kUninitializedUint64),
      threads_per_warp_(kUninitializedUint64),
      registers_per_core_limit_(kUninitializedUint64),
      registers_per_block_limit_(kUninitializedUint64),
      registers_per_thread_limit_(kUninitializedUint64),
      // Granularity of 1 means resource allocation is not rounded.
      warp_alloc_granularity_(1),
      register_alloc_granularity_(1),
      shared_memory_alloc_granularity_(1),
      device_address_bits_(kUninitializedUint64),
      device_memory_size_(kUninitializedUint64),
      shared_memory_per_core_(kUninitializedUint64),
      shared_memory_per_block_(kUninitializedUint64),
      clock_rate_ghz_(-1.0),
      cuda_compute_capability_major_(-1),
      cuda_compute_capability_minor_(-1),
      numa_node_(-1),
      core_count_(-1),
      ecc_enabled_(false) {}
     61 
     62 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
     63   std::unique_ptr<std::map<string, string>> owned_result{
     64       new std::map<string, string>};
     65   std::map<string, string> &result = *owned_result;
     66   result["Device Vendor"] = device_vendor();
     67   result["Platform Version"] = platform_version();
     68   result["Driver Version"] = driver_version();
     69   result["Runtime Version"] = runtime_version();
     70   result["PCI bus ID"] = pci_bus_id_;
     71   result["Device Name"] = name_;
     72 
     73   const ThreadDim &thread_dim = thread_dim_limit();
     74   result["ThreadDim Limit"] =
     75       port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
     76   const BlockDim &block_dim = block_dim_limit();
     77   result["BlockDim Limit"] =
     78       port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
     79 
     80   result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit());
     81   result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit());
     82   result["Registers Per Block Limit"] =
     83       port::StrCat(registers_per_block_limit());
     84 
     85   result["Device Address Bits"] = port::StrCat(device_address_bits());
     86   result["Device Memory Size"] =
     87       port::HumanReadableNumBytes::ToString(device_memory_size());
     88 
     89   result["Shared Memory Per Core"] =
     90       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
     91   result["Shared Memory Per Block"] =
     92       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
     93 
     94   result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz());
     95 
     96   result["CUDA Compute Capability"] = port::StrCat(
     97       cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
     98 
     99   result["NUMA Node"] = port::StrCat(numa_node());
    100   result["Core Count"] = port::StrCat(core_count());
    101   result["ECC Enabled"] = port::StrCat(ecc_enabled());
    102   return owned_result;
    103 }
    104 
namespace internal {

// Allocates a fresh DeviceDescription whose fields all start at their
// "uninitialized" sentinel values (see DeviceDescription's default
// constructor above); the builder's setters are expected to fill them in.
DeviceDescriptionBuilder::DeviceDescriptionBuilder()
    : device_description_(new DeviceDescription) {}

}  // namespace internal
    111 
// Reports the CUDA compute capability through the two out-parameters.
// Returns true unless the stored major version is exactly 0.
// NOTE(review): both members default to -1 when never set (see the default
// constructor), so an uninitialized description still returns true here with
// *major == -1. Callers must tolerate negative values; confirm whether
// `!= 0` was intended to be `> 0`.
bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
  *major = cuda_compute_capability_major_;
  *minor = cuda_compute_capability_minor_;
  return cuda_compute_capability_major_ != 0;
}
    117 
    118 bool ThreadDimOk(const DeviceDescription &device_description,
    119                  const ThreadDim &thread_dim) {
    120   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
    121   auto threads_per_block_limit = device_description.threads_per_block_limit();
    122   if (total_threads > threads_per_block_limit) {
    123     VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
    124             << " vs limit " << threads_per_block_limit;
    125     return false;
    126   }
    127 
    128   const auto &limit = device_description.thread_dim_limit();
    129   bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
    130             thread_dim.z <= limit.z;
    131   if (!ok) {
    132     VLOG(2) << "thread dim " << thread_dim.ToString()
    133             << " exceeds limit contraints of " << limit.ToString();
    134   }
    135   return ok;
    136 }
    137 
// Returns ceil(x / y) as an unsigned integer; delegates to the shared
// MathUtil helper. Used below to compute how many full blocks are needed to
// cover element_count elements.
uint64 DivideCeil(uint64 x, uint64 y) {
  return port::MathUtil::CeilOfRatio(x, y);
}
    141 
    142 void CalculateDimensionality(const DeviceDescription &device_description,
    143                              uint64 element_count, uint64 *threads_per_block,
    144                              uint64 *block_count) {
    145   *threads_per_block = device_description.threads_per_block_limit();
    146   *block_count = DivideCeil(element_count, *threads_per_block);
    147   if (*block_count == 1) {
    148     CHECK_LE(element_count, *threads_per_block);
    149     *threads_per_block = element_count;
    150   }
    151 }
    152 
    153 // Round value up to a multiple of n.
    154 static uint64 RoundUp(uint64 value, uint64 n) {
    155   return port::MathUtil::CeilOfRatio(value, n) * n;
    156 }
    157 
    158 // Round value down to a multiple of n.
    159 static uint64 RoundDown(uint64 value, uint64 n) {
    160   return port::MathUtil::FloorOfRatio(value, n) * n;
    161 }
    162 
// Estimates how many blocks with the given register usage, shared-memory
// usage, and thread shape can be simultaneously resident on one core, as the
// minimum over the device's block, register, shared-memory, and thread
// limits. Returns 0 when any required device field is uninitialized or when
// registers_per_thread exceeds the per-thread limit.
uint64 CalculateOccupancy(const DeviceDescription &device_description,
                          uint64 registers_per_thread,
                          uint64 shared_memory_per_block,
                          const ThreadDim &thread_dims) {
  // Don't try to compute occupancy if necessary values are not initialized.
  uint64 required_fields[] =  { device_description.registers_per_thread_limit(),
                                device_description.threads_per_warp(),
                                device_description.warp_alloc_granularity(),
                                device_description.register_alloc_granularity(),
                                device_description.registers_per_block_limit(),
                                device_description.shared_memory_per_core(),
                                device_description.blocks_per_core_limit() };
  for (auto value : required_fields) {
    if (value == kUninitializedUint64) {
      return 0;
    }
  }

  if (registers_per_thread > device_description.registers_per_thread_limit()) {
    return 0;
  }

  // Number of warps needed to hold the block's threads.
  // NOTE(review): if the thread-dim product is 0 this is 0, and the
  // thread_limit division below divides by zero — confirm callers pass
  // non-empty dims.
  uint64 warps_per_block =
      port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
                                  device_description.threads_per_warp());

  // Warp resources are allocated at a particular granularity.  This value is
  // the effective number of warps for resource allocation purposes.
  uint64 alloc_warps_per_block =
      RoundUp(warps_per_block, device_description.warp_alloc_granularity());

  // Registers are likewise allocated per warp at the device's granularity.
  uint64 alloc_regs_per_warp =
      RoundUp(device_description.threads_per_warp() * registers_per_thread,
              device_description.register_alloc_granularity());
  uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp;
  // Max resident blocks permitted by the register file.
  // NOTE(review): regs_per_block is 0 when registers_per_thread == 0, which
  // divides by zero here — confirm callers always request >= 1 register.
  uint64 reg_limit =
      device_description.registers_per_block_limit() / regs_per_block;

  // Shared memory is allocated at its own granularity; a block using no
  // shared memory is only bounded by the blocks-per-core limit.
  uint64 alloc_smem_per_block = RoundUp(
      shared_memory_per_block,
      device_description.shared_memory_alloc_granularity());
  uint64 smem_limit = alloc_smem_per_block > 0 ?
      device_description.shared_memory_per_core() / alloc_smem_per_block :
      device_description.blocks_per_core_limit();

  // Max resident blocks permitted by the core's thread capacity (uses the
  // raw, un-rounded warp count).
  uint64 thread_limit = device_description.threads_per_core_limit()
      / (warps_per_block  * device_description.threads_per_warp());

  // Occupancy is the tightest of the four constraints.
  return std::min({ device_description.blocks_per_core_limit(),
          reg_limit, smem_limit, thread_limit });
}
    214 
    215 uint64 CalculateRegisterLimitForTargetOccupancy(
    216     const DeviceDescription &device_description, uint64 shared_memory_per_block,
    217     const ThreadDim &thread_dims, uint64 target_blocks_per_core) {
    218   // Linear search from maximum number of registers down until the target
    219   // blocks per SM is found.
    220   // TODO(meheff): Compute this using a closed form solution.
    221   int reg_step = device_description.register_alloc_granularity() /
    222       device_description.threads_per_warp();
    223   for (int r = device_description.registers_per_thread_limit(); r > 0;
    224        r = RoundDown(r - 1, reg_step)) {
    225     uint64 occupancy = CalculateOccupancy(
    226         device_description, r, shared_memory_per_block, thread_dims);
    227     if (occupancy >= target_blocks_per_core) {
    228       return r;
    229     }
    230   }
    231   return 0;
    232 }
    233 
    234 
    235 }  // namespace gputools
    236 }  // namespace perftools
    237