1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA 17 // device and platform properties. Also contains convenience functions for 18 // checking/calculating launch dimensionality based on device properties. 19 20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 22 23 #include <map> 24 #include <memory> 25 #include "tensorflow/stream_executor/platform/port.h" 26 27 #include "tensorflow/stream_executor/launch_dim.h" 28 #include "tensorflow/stream_executor/platform/port.h" 29 30 namespace perftools { 31 namespace gputools { 32 namespace internal { 33 class DeviceDescriptionBuilder; 34 } // namespace internal 35 36 // Data that describes the execution target of the StreamExecutor, in terms of 37 // important logical parameters. These include dimensionality limits and 38 // physical parameters of interest, such as number of cores present on the 39 // device. 40 // 41 // Thread-safe: immutable post-initialization. 42 class DeviceDescription { 43 public: 44 // Returns the platform being run on; this value is primarily intended for 45 // printing, and comes out something like "OpenCL 1.2" or "Compute Capability 46 // 3.5". 47 const string &platform_version() const { return platform_version_; } 48 49 // Returns the driver version interfacing with the underlying platform. Vendor 50 // dependent format. 51 const string &driver_version() const { return driver_version_; } 52 53 // Return the runtime version, if one is provided by the underlying platform. 54 // Vendor dependent format / usefulness. 55 const string &runtime_version() const { return runtime_version_; } 56 57 // Returns the name that the device reports. Vendor dependent. 58 const string &name() const { return name_; } 59 60 // Returns the PCI bus identifier for this device, of the form 61 // [domain]:[bus]:[device].[function] 62 const string &pci_bus_id() const { return pci_bus_id_; } 63 64 // Returns the NUMA node associated with this device, for use in 65 // determining socket locality. If the NUMA node could not be determined, -1 66 // is returned. 67 int numa_node() const { return numa_node_; } 68 69 // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device 70 // or an AMD Compute Unit. 71 int core_count() const { return core_count_; } 72 73 // Returns the limit on the thread dimensionality values in each of the 74 // respective dimensions. These limits affect what constitutes a legitimate 75 // kernel launch request. 76 const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; } 77 78 // Returns the limit on the block dimensionality values in each of the 79 // respective dimensions. These limits may affect what constitutes a 80 // legitimate kernel launch request. 81 const BlockDim &block_dim_limit() const { return block_dim_limit_; } 82 83 // Returns the limit on the number of simultaneously resident blocks 84 // on a multiprocessor. 85 uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; } 86 87 // Returns the limit on the total number of threads that can be launched in a 88 // single block; i.e. the limit on x * y * z dimensions of a ThreadDim. 89 // This limit affects what constitutes a legitimate kernel launch request. 90 const uint64 &threads_per_block_limit() const { 91 return threads_per_block_limit_; 92 } 93 94 // Returns the limit on the total number of threads that can be simultaneously 95 // launched on a given multiprocessor. 96 const uint64 &threads_per_core_limit() const { 97 return threads_per_core_limit_; 98 } 99 100 // Returns the number of threads per warp/wavefront. 101 const uint64 &threads_per_warp() const { return threads_per_warp_; } 102 103 // Returns the limit on the total number of registers per core. 104 const uint64 ®isters_per_core_limit() const { 105 return registers_per_core_limit_; 106 } 107 108 // Returns the limit on the total number of registers that can be 109 // simultaneously used by a block. 110 const uint64 ®isters_per_block_limit() const { 111 return registers_per_block_limit_; 112 } 113 114 // Returns the limit on the total number of registers that can be 115 // allocated to a thread. 116 const uint64 ®isters_per_thread_limit() const { 117 return registers_per_thread_limit_; 118 } 119 120 // Returns the granularity at which warps are allocated resources. 121 const uint64 &warp_alloc_granularity() const { 122 return warp_alloc_granularity_; 123 } 124 125 // Returns the granularity at which registers are allocated to warps. 126 const uint64 ®ister_alloc_granularity() const { 127 return register_alloc_granularity_; 128 } 129 130 // Returns the granularity at which shared memory is allocated to warps. 131 const uint64 &shared_memory_alloc_granularity() const { 132 return shared_memory_alloc_granularity_; 133 } 134 135 // Returns the number of address bits available to kernel code running on the 136 // platform. This affects things like the maximum allocation size and perhaps 137 // types used in kernel code such as size_t. 138 const uint64 &device_address_bits() const { return device_address_bits_; } 139 140 // Returns the device memory size in bytes. 141 uint64 device_memory_size() const { return device_memory_size_; } 142 143 // Returns the device's core clock rate in GHz. 144 float clock_rate_ghz() const { return clock_rate_ghz_; } 145 146 // Returns whether ECC is enabled. 147 bool ecc_enabled() const { return ecc_enabled_; } 148 149 // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced 150 // Micro Devices, Inc.", or "GenuineIntel". 151 const string &device_vendor() const { return device_vendor_; } 152 153 // Returns the CUDA compute capability if we're running on the CUDA platform. 154 // If a CUDA compute capability is not available, the major version will be 155 // zero, and the return value will be false. 156 bool cuda_compute_capability(int *major, int *minor) const; 157 158 // Returns the maximum amount of shared memory present on a single core 159 // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL 160 // devices). Note that some devices, such as NVIDIA's have a configurable 161 // partitioning between shared memory and L1 cache. 162 uint64 shared_memory_per_core() const { return shared_memory_per_core_; } 163 164 // Returns the maximum amount of shared memory available for a single block. 165 uint64 shared_memory_per_block() const { return shared_memory_per_block_; } 166 167 // TODO(leary): resident blocks per core will be useful. 168 169 // Convenience typedef for the string-based DeviceDescription mapping. 170 typedef std::map<string, string> Map; 171 172 // Returns a mapping from readable names to readable values that describe the 173 // device. This is useful for things like printing. 174 std::unique_ptr<Map> ToMap() const; 175 176 // For string values that are not available via the underlying platform, this 177 // value will be provided. 178 static const char *kUndefinedString; 179 180 private: 181 friend class internal::DeviceDescriptionBuilder; 182 183 DeviceDescription(); 184 185 // For description of the following members, see the corresponding accessor 186 // above. 187 // 188 // N.B. If another field is added, update ToMap() above. 189 string device_vendor_; 190 string platform_version_; 191 string driver_version_; 192 string runtime_version_; 193 string pci_bus_id_; 194 string name_; 195 196 ThreadDim thread_dim_limit_; 197 BlockDim block_dim_limit_; 198 199 uint64 blocks_per_core_limit_; 200 201 uint64 threads_per_core_limit_; 202 uint64 threads_per_block_limit_; 203 uint64 threads_per_warp_; 204 205 uint64 registers_per_core_limit_; 206 uint64 registers_per_block_limit_; 207 uint64 registers_per_thread_limit_; 208 209 uint64 warp_alloc_granularity_; 210 uint64 register_alloc_granularity_; 211 uint64 shared_memory_alloc_granularity_; 212 213 uint64 device_address_bits_; 214 uint64 device_memory_size_; 215 216 // Shared memory limits on a given device. 217 uint64 shared_memory_per_core_; 218 uint64 shared_memory_per_block_; 219 220 float clock_rate_ghz_; 221 222 // CUDA "CC" major value, -1 if not available. 223 int cuda_compute_capability_major_; 224 int cuda_compute_capability_minor_; 225 226 int numa_node_; 227 int core_count_; 228 bool ecc_enabled_; 229 230 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription); 231 }; 232 233 namespace internal { 234 235 // Helper class the builds a device description, given that it has a large 236 // number of fields that would be easily confused in constructor form. 237 class DeviceDescriptionBuilder { 238 public: 239 DeviceDescriptionBuilder(); 240 241 // For descriptions of the following fields, see comments on the corresponding 242 // DeviceDescription::* accessors above. 243 244 void set_device_vendor(const string &value) { 245 device_description_->device_vendor_ = value; 246 } 247 void set_platform_version(const string &value) { 248 device_description_->platform_version_ = value; 249 } 250 void set_driver_version(const string &value) { 251 device_description_->driver_version_ = value; 252 } 253 void set_runtime_version(const string &value) { 254 device_description_->runtime_version_ = value; 255 } 256 void set_pci_bus_id(const string &value) { 257 device_description_->pci_bus_id_ = value; 258 } 259 void set_name(const string &value) { device_description_->name_ = value; } 260 261 void set_thread_dim_limit(const ThreadDim &value) { 262 device_description_->thread_dim_limit_ = value; 263 } 264 void set_block_dim_limit(const BlockDim &value) { 265 device_description_->block_dim_limit_ = value; 266 } 267 268 void set_blocks_per_core_limit(uint64 value) { 269 device_description_->blocks_per_core_limit_ = value; 270 } 271 272 void set_threads_per_core_limit(uint64 value) { 273 device_description_->threads_per_core_limit_ = value; 274 } 275 void set_threads_per_block_limit(uint64 value) { 276 device_description_->threads_per_block_limit_ = value; 277 } 278 void set_threads_per_warp(uint64 value) { 279 device_description_->threads_per_warp_ = value; 280 } 281 282 void set_registers_per_core_limit(uint64 value) { 283 device_description_->registers_per_core_limit_ = value; 284 } 285 void set_registers_per_block_limit(uint64 value) { 286 device_description_->registers_per_block_limit_ = value; 287 } 288 void set_registers_per_thread_limit(uint64 value) { 289 device_description_->registers_per_thread_limit_ = value; 290 } 291 292 void set_warp_alloc_granularity(uint64 value) { 293 device_description_->warp_alloc_granularity_ = value; 294 } 295 void set_register_alloc_granularity(uint64 value) { 296 device_description_->register_alloc_granularity_ = value; 297 } 298 void set_shared_memory_alloc_granularity(uint64 value) { 299 device_description_->shared_memory_alloc_granularity_ = value; 300 } 301 302 void set_device_address_bits(uint64 value) { 303 device_description_->device_address_bits_ = value; 304 } 305 void set_device_memory_size(uint64 value) { 306 device_description_->device_memory_size_ = value; 307 } 308 309 void set_shared_memory_per_core(int64 value) { 310 device_description_->shared_memory_per_core_ = value; 311 } 312 void set_shared_memory_per_block(int64 value) { 313 device_description_->shared_memory_per_block_ = value; 314 } 315 316 void set_clock_rate_ghz(float value) { 317 device_description_->clock_rate_ghz_ = value; 318 } 319 320 void set_cuda_compute_capability(int major, int minor) { 321 device_description_->cuda_compute_capability_major_ = major; 322 device_description_->cuda_compute_capability_minor_ = minor; 323 } 324 325 void set_numa_node(int value) { device_description_->numa_node_ = value; } 326 void set_core_count(int value) { device_description_->core_count_ = value; } 327 void set_ecc_enabled(bool value) { 328 device_description_->ecc_enabled_ = value; 329 } 330 331 // Returns a built DeviceDescription with ownership transferred to the 332 // caller. There are currently no restrictions on which fields must be set in 333 // order to build the descriptor. 334 // 335 // Once the description is built, this builder object should be discarded. 336 std::unique_ptr<DeviceDescription> Build() { 337 return std::move(device_description_); 338 } 339 340 private: 341 std::unique_ptr<DeviceDescription> device_description_; 342 343 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); 344 }; 345 346 } // namespace internal 347 348 // Returns whether the given thread_dim is acceptable given the limits described 349 // in device_description. For detailed reasons for failing the predicate, enable 350 // VLOG(2) for this module. 351 bool ThreadDimOk(const DeviceDescription &device_description, 352 const ThreadDim &thread_dim); 353 354 // [deprecated] Use MathUtil::CeilOfRatio directly instead. 355 // 356 // Equivalent to ceil(double(element_count) / threads_per_block). 357 uint64 DivideCeil(uint64 x, uint64 y); 358 359 // Calculate the number of threads/blocks required to process element_count 360 // elements. Note that you can still end up with more threads than 361 // element_count due to rounding, so kernels often start with an "is this 362 // thread id in the element_count range?" test. 363 void CalculateDimensionality(const DeviceDescription &device_description, 364 uint64 element_count, uint64 *threads_per_block, 365 uint64 *block_count); 366 367 // Compute and return maximum blocks per core (occupancy) based on the 368 // device description, some kernel characteristics and the number of threads per 369 // block. If unable to compute occupancy, zero is returned. 370 uint64 CalculateOccupancy(const DeviceDescription &device_description, 371 uint64 registers_per_thread, 372 uint64 shared_memory_per_block, 373 const ThreadDim &thread_dims); 374 375 // Compute and return the maximum number of registers per thread which 376 // achieves the target occupancy. If the target is not possible then 377 // zero is returned. 378 uint64 CalculateRegisterLimitForTargetOccupancy( 379 const DeviceDescription &device_description, uint64 shared_memory_per_block, 380 const ThreadDim &thread_dims, uint64 target_blocks_per_core); 381 382 } // namespace gputools 383 } // namespace perftools 384 385 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 386