1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/stream_executor/device_description.h" 17 18 #include <algorithm> 19 20 #include "tensorflow/stream_executor/lib/human_readable.h" 21 #include "tensorflow/stream_executor/lib/mathutil.h" 22 #include "tensorflow/stream_executor/lib/strcat.h" 23 24 namespace perftools { 25 namespace gputools { 26 27 static const uint64 kUninitializedUint64 = -1ULL; 28 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>"; 29 30 DeviceDescription::DeviceDescription() 31 : device_vendor_(kUndefinedString), 32 platform_version_(kUndefinedString), 33 driver_version_(kUndefinedString), 34 runtime_version_(kUndefinedString), 35 pci_bus_id_(kUndefinedString), 36 name_(kUndefinedString), 37 thread_dim_limit_(kUninitializedUint64, kUninitializedUint64, 38 kUninitializedUint64), 39 block_dim_limit_(kUninitializedUint64, kUninitializedUint64, 40 kUninitializedUint64), 41 blocks_per_core_limit_(kUninitializedUint64), 42 threads_per_core_limit_(kUninitializedUint64), 43 threads_per_block_limit_(kUninitializedUint64), 44 threads_per_warp_(kUninitializedUint64), 45 registers_per_core_limit_(kUninitializedUint64), 46 registers_per_block_limit_(kUninitializedUint64), 47 registers_per_thread_limit_(kUninitializedUint64), 48 warp_alloc_granularity_(1), 49 register_alloc_granularity_(1), 50 shared_memory_alloc_granularity_(1), 51 device_address_bits_(kUninitializedUint64), 52 device_memory_size_(kUninitializedUint64), 53 shared_memory_per_core_(kUninitializedUint64), 54 shared_memory_per_block_(kUninitializedUint64), 55 clock_rate_ghz_(-1.0), 56 cuda_compute_capability_major_(-1), 57 cuda_compute_capability_minor_(-1), 58 numa_node_(-1), 59 core_count_(-1), 60 ecc_enabled_(false) {} 61 62 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const { 63 std::unique_ptr<std::map<string, string>> owned_result{ 64 new std::map<string, string>}; 65 std::map<string, string> &result = *owned_result; 66 result["Device Vendor"] = device_vendor(); 67 result["Platform Version"] = platform_version(); 68 result["Driver Version"] = driver_version(); 69 result["Runtime Version"] = runtime_version(); 70 result["PCI bus ID"] = pci_bus_id_; 71 result["Device Name"] = name_; 72 73 const ThreadDim &thread_dim = thread_dim_limit(); 74 result["ThreadDim Limit"] = 75 port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z); 76 const BlockDim &block_dim = block_dim_limit(); 77 result["BlockDim Limit"] = 78 port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z); 79 80 result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit()); 81 result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit()); 82 result["Registers Per Block Limit"] = 83 port::StrCat(registers_per_block_limit()); 84 85 result["Device Address Bits"] = port::StrCat(device_address_bits()); 86 result["Device Memory Size"] = 87 port::HumanReadableNumBytes::ToString(device_memory_size()); 88 89 result["Shared Memory Per Core"] = 90 port::HumanReadableNumBytes::ToString(shared_memory_per_core_); 91 result["Shared Memory Per Block"] = 92 port::HumanReadableNumBytes::ToString(shared_memory_per_block_); 93 94 result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz()); 95 96 result["CUDA Compute Capability"] = port::StrCat( 97 cuda_compute_capability_major_, ".", cuda_compute_capability_minor_); 98 99 result["NUMA Node"] = port::StrCat(numa_node()); 100 result["Core Count"] = port::StrCat(core_count()); 101 result["ECC Enabled"] = port::StrCat(ecc_enabled()); 102 return owned_result; 103 } 104 105 namespace internal { 106 107 DeviceDescriptionBuilder::DeviceDescriptionBuilder() 108 : device_description_(new DeviceDescription) {} 109 110 } // namespace internal 111 112 bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const { 113 *major = cuda_compute_capability_major_; 114 *minor = cuda_compute_capability_minor_; 115 return cuda_compute_capability_major_ != 0; 116 } 117 118 bool ThreadDimOk(const DeviceDescription &device_description, 119 const ThreadDim &thread_dim) { 120 auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z; 121 auto threads_per_block_limit = device_description.threads_per_block_limit(); 122 if (total_threads > threads_per_block_limit) { 123 VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads 124 << " vs limit " << threads_per_block_limit; 125 return false; 126 } 127 128 const auto &limit = device_description.thread_dim_limit(); 129 bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y && 130 thread_dim.z <= limit.z; 131 if (!ok) { 132 VLOG(2) << "thread dim " << thread_dim.ToString() 133 << " exceeds limit contraints of " << limit.ToString(); 134 } 135 return ok; 136 } 137 138 uint64 DivideCeil(uint64 x, uint64 y) { 139 return port::MathUtil::CeilOfRatio(x, y); 140 } 141 142 void CalculateDimensionality(const DeviceDescription &device_description, 143 uint64 element_count, uint64 *threads_per_block, 144 uint64 *block_count) { 145 *threads_per_block = device_description.threads_per_block_limit(); 146 *block_count = DivideCeil(element_count, *threads_per_block); 147 if (*block_count == 1) { 148 CHECK_LE(element_count, *threads_per_block); 149 *threads_per_block = element_count; 150 } 151 } 152 153 // Round value up to a multiple of n. 154 static uint64 RoundUp(uint64 value, uint64 n) { 155 return port::MathUtil::CeilOfRatio(value, n) * n; 156 } 157 158 // Round value down to a multiple of n. 159 static uint64 RoundDown(uint64 value, uint64 n) { 160 return port::MathUtil::FloorOfRatio(value, n) * n; 161 } 162 163 uint64 CalculateOccupancy(const DeviceDescription &device_description, 164 uint64 registers_per_thread, 165 uint64 shared_memory_per_block, 166 const ThreadDim &thread_dims) { 167 // Don't try to compute occupancy if necessary values are not initialized. 168 uint64 required_fields[] = { device_description.registers_per_thread_limit(), 169 device_description.threads_per_warp(), 170 device_description.warp_alloc_granularity(), 171 device_description.register_alloc_granularity(), 172 device_description.registers_per_block_limit(), 173 device_description.shared_memory_per_core(), 174 device_description.blocks_per_core_limit() }; 175 for (auto value : required_fields) { 176 if (value == kUninitializedUint64) { 177 return 0; 178 } 179 } 180 181 if (registers_per_thread > device_description.registers_per_thread_limit()) { 182 return 0; 183 } 184 185 uint64 warps_per_block = 186 port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z, 187 device_description.threads_per_warp()); 188 189 // Warp resources are allocated at a particular granularity. This value is 190 // the effective number of warps for resource allocation purposes. 191 uint64 alloc_warps_per_block = 192 RoundUp(warps_per_block, device_description.warp_alloc_granularity()); 193 194 uint64 alloc_regs_per_warp = 195 RoundUp(device_description.threads_per_warp() * registers_per_thread, 196 device_description.register_alloc_granularity()); 197 uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp; 198 uint64 reg_limit = 199 device_description.registers_per_block_limit() / regs_per_block; 200 201 uint64 alloc_smem_per_block = RoundUp( 202 shared_memory_per_block, 203 device_description.shared_memory_alloc_granularity()); 204 uint64 smem_limit = alloc_smem_per_block > 0 ? 205 device_description.shared_memory_per_core() / alloc_smem_per_block : 206 device_description.blocks_per_core_limit(); 207 208 uint64 thread_limit = device_description.threads_per_core_limit() 209 / (warps_per_block * device_description.threads_per_warp()); 210 211 return std::min({ device_description.blocks_per_core_limit(), 212 reg_limit, smem_limit, thread_limit }); 213 } 214 215 uint64 CalculateRegisterLimitForTargetOccupancy( 216 const DeviceDescription &device_description, uint64 shared_memory_per_block, 217 const ThreadDim &thread_dims, uint64 target_blocks_per_core) { 218 // Linear search from maximum number of registers down until the target 219 // blocks per SM is found. 220 // TODO(meheff): Compute this using a closed form solution. 221 int reg_step = device_description.register_alloc_granularity() / 222 device_description.threads_per_warp(); 223 for (int r = device_description.registers_per_thread_limit(); r > 0; 224 r = RoundDown(r - 1, reg_step)) { 225 uint64 occupancy = CalculateOccupancy( 226 device_description, r, shared_memory_per_block, thread_dims); 227 if (occupancy >= target_blocks_per_core) { 228 return r; 229 } 230 } 231 return 0; 232 } 233 234 235 } // namespace gputools 236 } // namespace perftools 237