Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
     17 // device and platform properties. Also contains convenience functions for
     18 // checking/calculating launch dimensionality based on device properties.
     19 
     20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
     21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
     22 
     23 #include <map>
     24 #include <memory>
     25 #include "absl/base/macros.h"
     26 #include "tensorflow/stream_executor/launch_dim.h"
     27 #include "tensorflow/stream_executor/platform/port.h"
     28 
     29 namespace stream_executor {
     30 namespace internal {
     31 class DeviceDescriptionBuilder;
     32 }  // namespace internal
     33 
     34 // Data that describes the execution target of the StreamExecutor, in terms of
     35 // important logical parameters. These include dimensionality limits and
     36 // physical parameters of interest, such as number of cores present on the
     37 // device.
     38 // Constructor is private; internal::DeviceDescriptionBuilder is a friend.
     39 // Thread-safe: immutable post-initialization.
     40 class DeviceDescription {
     41  public:
     42   // Returns the platform being run on; this value is primarily intended for
     43   // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
     44   // 3.5".
     45   const string &platform_version() const { return platform_version_; }
     46 
     47   // Returns the driver version interfacing with the underlying platform. Vendor
     48   // dependent format.
     49   const string &driver_version() const { return driver_version_; }
     50 
     51   // Returns the runtime version, if one is provided by the underlying platform.
     52   // Vendor dependent format / usefulness.
     53   const string &runtime_version() const { return runtime_version_; }
     54 
     55   // Returns the name that the device reports. Vendor dependent.
     56   const string &name() const { return name_; }
     57 
     58   // Returns the PCI bus identifier for this device, of the form
     59   // [domain]:[bus]:[device].[function]
     60   const string &pci_bus_id() const { return pci_bus_id_; }
     61 
     62   // Returns the NUMA node associated with this device, for use in
     63   // determining socket locality. If the NUMA node could not be determined, -1
     64   // is returned.
     65   int numa_node() const { return numa_node_; }
     66 
     67   // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device
     68   // or an AMD Compute Unit).
     69   int core_count() const { return core_count_; }
     70 
     71   // Returns the limit on the thread dimensionality values in each of the
     72   // respective dimensions. These limits affect what constitutes a legitimate
     73   // kernel launch request.
     74   const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
     75 
     76   // Returns the limit on the block dimensionality values in each of the
     77   // respective dimensions. These limits may affect what constitutes a
     78   // legitimate kernel launch request.
     79   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
     80 
     81   // Returns the maximum number of simultaneously resident blocks
     82   // on a multiprocessor.
     83   int64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
     84 
     85   // Returns the limit on the total number of threads that can be launched in a
     86   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
     87   // This limit affects what constitutes a legitimate kernel launch request.
     88   const int64 &threads_per_block_limit() const {
     89     return threads_per_block_limit_;
     90   }
     91 
     92   // Returns the limit on the total number of threads that can be simultaneously
     93   // launched on a given multiprocessor.
     94   const int64 &threads_per_core_limit() const {
     95     return threads_per_core_limit_;
     96   }
     97 
     98   // Returns the number of threads per warp/wavefront.
     99   const int64 &threads_per_warp() const { return threads_per_warp_; }
    100 
    101   // Returns the limit on the total number of registers per core.
    102   const int64 &registers_per_core_limit() const {
    103     return registers_per_core_limit_;
    104   }
    105 
    106   // Returns the limit on the total number of registers that can be
    107   // simultaneously used by a block.
    108   const int64 &registers_per_block_limit() const {
    109     return registers_per_block_limit_;
    110   }
    111 
    112   // Returns the number of address bits available to kernel code running on the
    113   // platform. This affects things like the maximum allocation size and perhaps
    114   // types used in kernel code such as size_t.
    115   const int64 &device_address_bits() const { return device_address_bits_; }
    116 
    117   // Returns the device memory size in bytes.
    118   int64 device_memory_size() const { return device_memory_size_; }
    119 
    120   // Returns the device's memory bandwidth in bytes/sec.  (This is for
    121   // reads/writes to/from the device's own memory, not for transfers between the
    122   // host and device.)
    123   int64 memory_bandwidth() const { return memory_bandwidth_; }
    124 
    125   // Returns the device's core clock rate in GHz.
    126   float clock_rate_ghz() const { return clock_rate_ghz_; }
    127 
    128   // Returns whether ECC is enabled.
    129   bool ecc_enabled() const { return ecc_enabled_; }
    130 
    131   // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
    132   // Micro Devices, Inc.", or "GenuineIntel".
    133   const string &device_vendor() const { return device_vendor_; }
    134 
    135   // Returns the CUDA compute capability if we're running on the CUDA platform.
    136   // If unavailable, the major version will be zero and the result is false.
    137   // NOTE(review): the private field comment says -1 if unavailable; confirm.
    138   bool cuda_compute_capability(int *major, int *minor) const;
    139 
    140   // Returns the AMDGPU ISA version if we're running on the ROCm platform.
    141   // If the information is not available, the version is not modified,
    142   // and the return value will be false.
    143   bool rocm_amdgpu_isa_version(int *version) const;
    144 
    145   // Returns the maximum amount of shared memory present on a single core
    146   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
    147   // devices). Note that some devices, such as NVIDIA's, have a configurable
    148   // partitioning between shared memory and L1 cache.
    149   int64 shared_memory_per_core() const { return shared_memory_per_core_; }
    150 
    151   // Returns the maximum amount of shared memory available for a single block.
    152   int64 shared_memory_per_block() const { return shared_memory_per_block_; }
    153 
    154   // TODO(leary): resident blocks per core will be useful.
    155 
    156   // Convenience typedef for the string-based DeviceDescription mapping.
    157   typedef std::map<string, string> Map;
    158 
    159   // Returns a mapping from readable names to readable values that describe the
    160   // device. This is useful for things like printing.
    161   std::unique_ptr<Map> ToMap() const;
    162 
    163   // For string values that are not available via the underlying platform, this
    164   // value will be provided.
    165   static const char *kUndefinedString;
    166 
    167  private:
    168   friend class internal::DeviceDescriptionBuilder;
    169 
    170   DeviceDescription();
    171 
    172   // For description of the following members, see the corresponding accessor
    173   // above.
    174   //
    175   // N.B. If another field is added, update ToMap() above.
    176   string device_vendor_;
    177   string platform_version_;
    178   string driver_version_;
    179   string runtime_version_;
    180   string pci_bus_id_;
    181   string name_;
    182 
    183   ThreadDim thread_dim_limit_;
    184   BlockDim block_dim_limit_;
    185 
    186   int64 blocks_per_core_limit_;
    187 
    188   int64 threads_per_core_limit_;
    189   int64 threads_per_block_limit_;
    190   int64 threads_per_warp_;
    191 
    192   int64 registers_per_core_limit_;
    193   int64 registers_per_block_limit_;
    194 
    195   int64 device_address_bits_;
    196   int64 device_memory_size_;
    197   int64 memory_bandwidth_;
    198 
    199   // Shared memory limits on a given device.
    200   int64 shared_memory_per_core_;
    201   int64 shared_memory_per_block_;
    202 
    203   float clock_rate_ghz_;
    204 
    205   // CUDA "CC" major value, -1 if not available.
    206   int cuda_compute_capability_major_;
    207   int cuda_compute_capability_minor_;
    208 
    209   // ROCM AMDGPU ISA version, 0 if not available.
    210   int rocm_amdgpu_isa_version_;
    211 
    212   int numa_node_;
    213   int core_count_;
    214   bool ecc_enabled_;
    215 
    216   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
    217 };
    218 
    219 namespace internal {
    220 
    221 // Helper class that builds a device description, given that it has a large
    222 // number of fields that would be easily confused in constructor form.
    223 class DeviceDescriptionBuilder {
    224  public:
    225   DeviceDescriptionBuilder();
    226 
    227   // For descriptions of the following fields, see comments on the corresponding
    228   // DeviceDescription::* accessors above.
    229 
    230   void set_device_vendor(const string &value) {
    231     device_description_->device_vendor_ = value;
    232   }
    233   void set_platform_version(const string &value) {
    234     device_description_->platform_version_ = value;
    235   }
    236   void set_driver_version(const string &value) {
    237     device_description_->driver_version_ = value;
    238   }
    239   void set_runtime_version(const string &value) {
    240     device_description_->runtime_version_ = value;
    241   }
    242   void set_pci_bus_id(const string &value) {
    243     device_description_->pci_bus_id_ = value;
    244   }
    245   void set_name(const string &value) { device_description_->name_ = value; }
    246 
    247   void set_thread_dim_limit(const ThreadDim &value) {
    248     device_description_->thread_dim_limit_ = value;
    249   }
    250   void set_block_dim_limit(const BlockDim &value) {
    251     device_description_->block_dim_limit_ = value;
    252   }
    253 
    254   void set_blocks_per_core_limit(int64 value) {
    255     device_description_->blocks_per_core_limit_ = value;
    256   }
    257 
    258   void set_threads_per_core_limit(int64 value) {
    259     device_description_->threads_per_core_limit_ = value;
    260   }
    261   void set_threads_per_block_limit(int64 value) {
    262     device_description_->threads_per_block_limit_ = value;
    263   }
    264   void set_threads_per_warp(int64 value) {
    265     device_description_->threads_per_warp_ = value;
    266   }
    267 
    268   void set_registers_per_core_limit(int64 value) {
    269     device_description_->registers_per_core_limit_ = value;
    270   }
    271   void set_registers_per_block_limit(int64 value) {
    272     device_description_->registers_per_block_limit_ = value;
    273   }
    274 
    275   void set_device_address_bits(int64 value) {
    276     device_description_->device_address_bits_ = value;
    277   }
    278   void set_device_memory_size(int64 value) {
    279     device_description_->device_memory_size_ = value;
    280   }
    281   void set_memory_bandwidth(int64 value) {
    282     device_description_->memory_bandwidth_ = value;
    283   }
    284 
    285   void set_shared_memory_per_core(int64 value) {
    286     device_description_->shared_memory_per_core_ = value;
    287   }
    288   void set_shared_memory_per_block(int64 value) {
    289     device_description_->shared_memory_per_block_ = value;
    290   }
    291 
    292   void set_clock_rate_ghz(float value) {
    293     device_description_->clock_rate_ghz_ = value;
    294   }
    295 
    296   void set_cuda_compute_capability(int major, int minor) {
    297     device_description_->cuda_compute_capability_major_ = major;
    298     device_description_->cuda_compute_capability_minor_ = minor;
    299   }
    300 
    301   void set_rocm_amdgpu_isa_version(int version) {
    302     device_description_->rocm_amdgpu_isa_version_ = version;
    303   }
    304 
    305   void set_numa_node(int value) { device_description_->numa_node_ = value; }
    306   void set_core_count(int value) { device_description_->core_count_ = value; }
    307   void set_ecc_enabled(bool value) {
    308     device_description_->ecc_enabled_ = value;
    309   }
    310 
    311   // Returns a built DeviceDescription with ownership transferred to the
    312   // caller. There are currently no restrictions on which fields must be set in
    313   // order to build the descriptor.
    314   // Once built, discard this builder: Build() moves device_description_ out,
    315   // so any later set_* call would dereference a null unique_ptr.
    316   std::unique_ptr<DeviceDescription> Build() {
    317     return std::move(device_description_);
    318   }
    319 
    320  private:
    321   std::unique_ptr<DeviceDescription> device_description_;
    322 
    323   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
    324 };
    325 
    326 }  // namespace internal
    327 
    328 // Returns true if thread_dim is acceptable given the limits described in
    329 // device_description (declaration only; the definition is not in this file).
    330 // To see detailed reasons for a failed check, enable VLOG(2) for this module.
    331 bool ThreadDimOk(const DeviceDescription &device_description,
    332                  const ThreadDim &thread_dim);
    333 
    334 // Equivalent to ceil(double(x) / y).
    335 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.")
    336 int64 DivideCeil(int64 x, int64 y);
    337 
    338 // Calculates the number of threads/blocks required to process element_count
    339 // elements; results presumably go to *threads_per_block and *block_count
    340 // (TODO confirm). Rounding can still yield more threads than element_count,
    341 // so kernels often start with an "is this thread id in range?" test.
    342 void CalculateDimensionality(const DeviceDescription &device_description,
    343                              int64 element_count, int64 *threads_per_block,
    344                              int64 *block_count);
    345 
    346 }  // namespace stream_executor
    347 
    348 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
    349