Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
     17 // device and platform properties. Also contains convenience functions for
     18 // checking/calculating launch dimensionality based on device properties.
     19 
     20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
     21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
     22 
     23 #include <map>
     24 #include <memory>
     25 #include "tensorflow/stream_executor/platform/port.h"
     26 
     27 #include "tensorflow/stream_executor/launch_dim.h"
     28 #include "tensorflow/stream_executor/platform/port.h"
     29 
     30 namespace perftools {
     31 namespace gputools {
     32 namespace internal {
        // Forward declaration so that DeviceDescription (below) can name the builder
        // as a friend before the builder class itself is defined.
     33 class DeviceDescriptionBuilder;
     34 }  // namespace internal
     35 
     36 // Data that describes the execution target of the StreamExecutor, in terms of
     37 // important logical parameters. These include dimensionality limits and
     38 // physical parameters of interest, such as number of cores present on the
     39 // device.
     40 //
     41 // Thread-safe: immutable post-initialization.
     42 class DeviceDescription {
     43  public:
     44   // Returns the platform being run on; this value is primarily intended for
     45   // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
     46   // 3.5".
     47   const string &platform_version() const { return platform_version_; }
     48
     49   // Returns the driver version interfacing with the underlying platform. Vendor
     50   // dependent format.
     51   const string &driver_version() const { return driver_version_; }
     52
     53   // Returns the runtime version, if one is provided by the underlying platform.
     54   // Vendor dependent format / usefulness.
     55   const string &runtime_version() const { return runtime_version_; }
     56
     57   // Returns the name that the device reports. Vendor dependent.
     58   const string &name() const { return name_; }
     59
     60   // Returns the PCI bus identifier for this device, of the form
     61   // [domain]:[bus]:[device].[function]
     62   const string &pci_bus_id() const { return pci_bus_id_; }
     63
     64   // Returns the NUMA node associated with this device, for use in
     65   // determining socket locality. If the NUMA node could not be determined, -1
     66   // is returned.
     67   int numa_node() const { return numa_node_; }
     68
     69   // Returns the number of cores (traditional notion of core; i.e. an SM on an
     70   // NVIDIA device or an AMD Compute Unit).
     71   int core_count() const { return core_count_; }
     72
     73   // Returns the limit on the thread dimensionality values in each of the
     74   // respective dimensions. These limits affect what constitutes a legitimate
     75   // kernel launch request.
     76   const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
     77
     78   // Returns the limit on the block dimensionality values in each of the
     79   // respective dimensions. These limits may affect what constitutes a
     80   // legitimate kernel launch request.
     81   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
     82
     83   // Returns the limit on the number of simultaneously resident blocks
     84   // on a multiprocessor.
     85   uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
     86
     87   // Returns the limit on the total number of threads that can be launched in a
     88   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
     89   // This limit affects what constitutes a legitimate kernel launch request.
     90   const uint64 &threads_per_block_limit() const {
     91     return threads_per_block_limit_;
     92   }
     93
     94   // Returns the limit on the total number of threads that can be simultaneously
     95   // launched on a given multiprocessor.
     96   const uint64 &threads_per_core_limit() const {
     97     return threads_per_core_limit_;
     98   }
     99
    100   // Returns the number of threads per warp/wavefront.
    101   const uint64 &threads_per_warp() const { return threads_per_warp_; }
    102
    103   // Returns the limit on the total number of registers per core.
    104   const uint64 &registers_per_core_limit() const {
    105     return registers_per_core_limit_;
    106   }
    107
    108   // Returns the limit on the total number of registers that can be
    109   // simultaneously used by a block.
    110   const uint64 &registers_per_block_limit() const {
    111     return registers_per_block_limit_;
    112   }
    113
    114   // Returns the limit on the total number of registers that can be
    115   // allocated to a thread.
    116   const uint64 &registers_per_thread_limit() const {
    117     return registers_per_thread_limit_;
    118   }
    119
    120   // Returns the granularity at which warps are allocated resources.
    121   const uint64 &warp_alloc_granularity() const {
    122     return warp_alloc_granularity_;
    123   }
    124
    125   // Returns the granularity at which registers are allocated to warps.
    126   const uint64 &register_alloc_granularity() const {
    127     return register_alloc_granularity_;
    128   }
    129
    130   // Returns the granularity at which shared memory is allocated to warps.
    131   const uint64 &shared_memory_alloc_granularity() const {
    132     return shared_memory_alloc_granularity_;
    133   }
    134
    135   // Returns the number of address bits available to kernel code running on the
    136   // platform. This affects things like the maximum allocation size and perhaps
    137   // types used in kernel code such as size_t.
    138   const uint64 &device_address_bits() const { return device_address_bits_; }
    139
    140   // Returns the device memory size in bytes.
    141   uint64 device_memory_size() const { return device_memory_size_; }
    142
    143   // Returns the device's core clock rate in GHz.
    144   float clock_rate_ghz() const { return clock_rate_ghz_; }
    145
    146   // Returns whether ECC is enabled.
    147   bool ecc_enabled() const { return ecc_enabled_; }
    148
    149   // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
    150   // Micro Devices, Inc.", or "GenuineIntel".
    151   const string &device_vendor() const { return device_vendor_; }
    152
    153   // Returns the CUDA compute capability if we're running on the CUDA platform.
    154   // If a CUDA compute capability is not available, the major version will be
    155   // zero, and the return value will be false.
          //
          // major and minor are output parameters.
          // NOTE(review): the comment on cuda_compute_capability_major_ below says
          // -1 (not zero) marks "not available"; the definition is not visible in
          // this header, so confirm which sentinel is actually used.
    156   bool cuda_compute_capability(int *major, int *minor) const;
    157
    158   // Returns the maximum amount of shared memory present on a single core
    159   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
    160   // devices). Note that some devices, such as NVIDIA's, have a configurable
    161   // partitioning between shared memory and L1 cache.
    162   uint64 shared_memory_per_core() const { return shared_memory_per_core_; }
    163
    164   // Returns the maximum amount of shared memory available for a single block.
    165   uint64 shared_memory_per_block() const { return shared_memory_per_block_; }
    166
    167   // TODO(leary): resident blocks per core will be useful.
          // NOTE(review): blocks_per_core_limit() above may already cover this;
          // confirm whether the TODO is stale.
    168
    169   // Convenience typedef for the string-based DeviceDescription mapping.
    170   typedef std::map<string, string> Map;
    171
    172   // Returns a mapping from readable names to readable values that describe the
    173   // device. This is useful for things like printing. The returned map is owned
          // by the caller.
    174   std::unique_ptr<Map> ToMap() const;
    175
    176   // For string values that are not available via the underlying platform, this
    177   // value will be provided.
    178   static const char *kUndefinedString;
    179
    180  private:
    181   friend class internal::DeviceDescriptionBuilder;
    182
          // Private: outside code cannot construct a DeviceDescription directly; the
          // befriended internal::DeviceDescriptionBuilder is the intended route.
    183   DeviceDescription();
    184
    185   // For description of the following members, see the corresponding accessor
    186   // above.
    187   //
    188   // N.B. If another field is added, update ToMap() above.
    189   string device_vendor_;
    190   string platform_version_;
    191   string driver_version_;
    192   string runtime_version_;
    193   string pci_bus_id_;
    194   string name_;
    195
    196   ThreadDim thread_dim_limit_;
    197   BlockDim block_dim_limit_;
    198
    199   uint64 blocks_per_core_limit_;
    200
    201   uint64 threads_per_core_limit_;
    202   uint64 threads_per_block_limit_;
    203   uint64 threads_per_warp_;
    204
    205   uint64 registers_per_core_limit_;
    206   uint64 registers_per_block_limit_;
    207   uint64 registers_per_thread_limit_;
    208
    209   uint64 warp_alloc_granularity_;
    210   uint64 register_alloc_granularity_;
    211   uint64 shared_memory_alloc_granularity_;
    212
    213   uint64 device_address_bits_;
    214   uint64 device_memory_size_;
    215
    216   // Shared memory limits on a given device.
    217   uint64 shared_memory_per_core_;
    218   uint64 shared_memory_per_block_;
    219
    220   float clock_rate_ghz_;
    221
    222   // CUDA "CC" major value, -1 if not available.
          // NOTE(review): cuda_compute_capability() above documents zero as the
          // "not available" major value; confirm against the definition.
    223   int cuda_compute_capability_major_;
    224   int cuda_compute_capability_minor_;
    225
    226   int numa_node_;
    227   int core_count_;
    228   bool ecc_enabled_;
    229
          // Macro defined outside this file; judging by its name it presumably
          // disables copy construction and copy assignment for this class.
    230   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
    231 };
    232 
    233 namespace internal {
    234 
    235 // Helper class that builds a device description, given that it has a large
    236 // number of fields that would be easily confused in constructor form.
    237 class DeviceDescriptionBuilder {
    238  public:
          // Defined outside this header. Every setter below dereferences
          // device_description_ unconditionally, so the constructor presumably
          // allocates the DeviceDescription being built (confirm in the definition).
    239   DeviceDescriptionBuilder();
    240
    241   // For descriptions of the following fields, see comments on the corresponding
    242   // DeviceDescription::* accessors above.
          //
          // Each setter writes straight into the private member of the description
          // under construction (access is granted by the friend declaration in
          // DeviceDescription).
    243
    244   void set_device_vendor(const string &value) {
    245     device_description_->device_vendor_ = value;
    246   }
    247   void set_platform_version(const string &value) {
    248     device_description_->platform_version_ = value;
    249   }
    250   void set_driver_version(const string &value) {
    251     device_description_->driver_version_ = value;
    252   }
    253   void set_runtime_version(const string &value) {
    254     device_description_->runtime_version_ = value;
    255   }
    256   void set_pci_bus_id(const string &value) {
    257     device_description_->pci_bus_id_ = value;
    258   }
    259   void set_name(const string &value) { device_description_->name_ = value; }
    260
    261   void set_thread_dim_limit(const ThreadDim &value) {
    262     device_description_->thread_dim_limit_ = value;
    263   }
    264   void set_block_dim_limit(const BlockDim &value) {
    265     device_description_->block_dim_limit_ = value;
    266   }
    267
    268   void set_blocks_per_core_limit(uint64 value) {
    269     device_description_->blocks_per_core_limit_ = value;
    270   }
    271
    272   void set_threads_per_core_limit(uint64 value) {
    273     device_description_->threads_per_core_limit_ = value;
    274   }
    275   void set_threads_per_block_limit(uint64 value) {
    276     device_description_->threads_per_block_limit_ = value;
    277   }
    278   void set_threads_per_warp(uint64 value) {
    279     device_description_->threads_per_warp_ = value;
    280   }
    281
    282   void set_registers_per_core_limit(uint64 value) {
    283     device_description_->registers_per_core_limit_ = value;
    284   }
    285   void set_registers_per_block_limit(uint64 value) {
    286     device_description_->registers_per_block_limit_ = value;
    287   }
    288   void set_registers_per_thread_limit(uint64 value) {
    289     device_description_->registers_per_thread_limit_ = value;
    290   }
    291
    292   void set_warp_alloc_granularity(uint64 value) {
    293     device_description_->warp_alloc_granularity_ = value;
    294   }
    295   void set_register_alloc_granularity(uint64 value) {
    296     device_description_->register_alloc_granularity_ = value;
    297   }
    298   void set_shared_memory_alloc_granularity(uint64 value) {
    299     device_description_->shared_memory_alloc_granularity_ = value;
    300   }
    301
    302   void set_device_address_bits(uint64 value) {
    303     device_description_->device_address_bits_ = value;
    304   }
    305   void set_device_memory_size(uint64 value) {
    306     device_description_->device_memory_size_ = value;
    307   }
    308
          // NOTE(review): these two setters take a signed int64 while the members
          // they assign to (and every other size setter here) are uint64, so a
          // negative argument is implicitly converted to a large unsigned value.
          // Consider switching the parameters to uint64 once callers are checked.
    309   void set_shared_memory_per_core(int64 value) {
    310     device_description_->shared_memory_per_core_ = value;
    311   }
    312   void set_shared_memory_per_block(int64 value) {
    313     device_description_->shared_memory_per_block_ = value;
    314   }
    315
    316   void set_clock_rate_ghz(float value) {
    317     device_description_->clock_rate_ghz_ = value;
    318   }
    319
    320   void set_cuda_compute_capability(int major, int minor) {
    321     device_description_->cuda_compute_capability_major_ = major;
    322     device_description_->cuda_compute_capability_minor_ = minor;
    323   }
    324
    325   void set_numa_node(int value) { device_description_->numa_node_ = value; }
    326   void set_core_count(int value) { device_description_->core_count_ = value; }
    327   void set_ecc_enabled(bool value) {
    328     device_description_->ecc_enabled_ = value;
    329   }
    330
    331   // Returns a built DeviceDescription with ownership transferred to the
    332   // caller. There are currently no restrictions on which fields must be set in
    333   // order to build the descriptor.
    334   //
    335   // Once the description is built, this builder object should be discarded.
          // Build() moves the description out, leaving device_description_ null, so
          // any later setter call (or second Build()) would operate on a null pointer.
    336   std::unique_ptr<DeviceDescription> Build() {
            // std::move is required: device_description_ is a member, not a local,
            // and std::unique_ptr cannot be copied.
    337     return std::move(device_description_);
    338   }
    339
    340  private:
          // The description under construction; null after Build() has been called.
    341   std::unique_ptr<DeviceDescription> device_description_;
    342
    343   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
    344 };
    345 
    346 }  // namespace internal
    347 
    348 // Returns whether the given thread_dim is acceptable given the limits described
    349 // in device_description. For detailed reasons for failing the predicate, enable
    350 // VLOG(2) for this module.
        // NOTE(review): presumably checked against thread_dim_limit() and
        // threads_per_block_limit(); the definition is not visible in this header.
    351 bool ThreadDimOk(const DeviceDescription &device_description,
    352                  const ThreadDim &thread_dim);
    353 
    354 // [deprecated] Use MathUtil::CeilOfRatio directly instead.
    355 //
    356 // Equivalent to ceil(double(x) / y).
        // NOTE(review): behavior for y == 0 is not visible from this declaration.
    357 uint64 DivideCeil(uint64 x, uint64 y);
    358 
    359 // Calculate the number of threads/blocks required to process element_count
    360 // elements. Note that you can still end up with more threads than
    361 // element_count due to rounding, so kernels often start with an "is this
    362 // thread id in the element_count range?" test.
        //
        // threads_per_block and block_count are output parameters.
        // NOTE(review): presumably both must be non-null; confirm in the definition.
    363 void CalculateDimensionality(const DeviceDescription &device_description,
    364                              uint64 element_count, uint64 *threads_per_block,
    365                              uint64 *block_count);
    366 
    367 // Compute and return maximum blocks per core (occupancy) based on the
    368 // device description, some kernel characteristics and the number of threads per
    369 // block.  If unable to compute occupancy, zero is returned.
        //
        // The kernel characteristics are registers_per_thread and
        // shared_memory_per_block; thread_dims supplies the threads per block.
        // NOTE(review): shared_memory_per_block is presumably in bytes; confirm.
    370 uint64 CalculateOccupancy(const DeviceDescription &device_description,
    371                           uint64 registers_per_thread,
    372                           uint64 shared_memory_per_block,
    373                           const ThreadDim &thread_dims);
    374 
    375 // Compute and return the maximum number of registers per thread which
    376 // achieves the target occupancy.  If the target is not possible then
    377 // zero is returned.
        //
        // target_blocks_per_core is the target occupancy.
        // NOTE(review): presumably expressed in the same blocks-per-core units
        // that CalculateOccupancy returns; confirm in the definition.
    378 uint64 CalculateRegisterLimitForTargetOccupancy(
    379     const DeviceDescription &device_description, uint64 shared_memory_per_block,
    380     const ThreadDim &thread_dims, uint64 target_blocks_per_core);
    381 
    382 }  // namespace gputools
    383 }  // namespace perftools
    384 
    385 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
    386