/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

// Temporarily pull stream_executor into perftools::gputools while we migrate
// code to the new namespace.  TODO(b/77980417): Remove this once we've
// completed the migration.
using namespace stream_executor;  // NOLINT[build/namespaces]

}  // namespace gputools
}  // namespace perftools

namespace stream_executor {

class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
//
// This is effectively a two-tuple of a pointer and size; however, note that the
// pointer may not be to the virtual address itself -- in OpenCL the pointer is
// to a cl_mem handle that describes the device allocation. Therefore,
// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
// referenced directly, so use it with caution.
//
// Thread-compatible.
class DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized device memory
  // region. An opaque pointer may be provided -- see header for details on the
  // opacity of that pointer.
  explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0,
                            bool is_sub_buffer = false)
      : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {}

  // Returns whether the backing memory is the null pointer.
  // Convenience `== nullptr` and `!= nullptr` operators are also provided.
  bool is_null() const { return opaque_ == nullptr; }
  bool operator==(std::nullptr_t other) const { return is_null(); }
  bool operator!=(std::nullptr_t other) const { return !is_null(); }

  // Provides a partial order between device memory values.
  //
  // This operator is provided so that this object can be used as a key in an
  // ordered map.
  bool operator<(const DeviceMemoryBase &other) const {
    return opaque() < other.opaque();
  }

  // Returns the size, in bytes, for the backing memory.
  uint64 size() const { return size_; }

  // Warning: note that the pointer returned is not necessarily directly to
  // device virtual address space, but is platform-dependent.
  void *opaque() { return opaque_; }
  const void *opaque() const { return opaque_; }

  // Returns true if this is an offset into another primary allocation.
  bool is_sub_buffer() const { return is_sub_buffer_; }

  // Returns whether the two DeviceMemoryBase segments are identical (both in
  // their opaque pointer and size).
  bool IsSameAs(const DeviceMemoryBase &other) const {
    return opaque() == other.opaque() && size() == other.size();
  }

 protected:
  friend class StreamExecutor;

  // Resets the internal values of the opaque pointer and number of bytes in the
  // memory region, just as in the constructor.
  void Reset(void *opaque, uint64 bytes) {
    opaque_ = opaque;
    size_ = bytes;
  }

 private:
  void *opaque_;  // Platform-dependent value representing allocated memory.
  uint64 size_;   // Size in bytes of this allocation.
  bool is_sub_buffer_;  // Is this a primary allocation or a sub-buffer?
};
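
// Example (illustrative only; `platform_ptr` is a hypothetical
// platform-specific handle such as a CUDA device pointer or an OpenCL cl_mem):
//
//   DeviceMemoryBase mem(platform_ptr, /*size=*/1024);
//   if (mem != nullptr) {
//     void *handle = mem.opaque();  // Platform-dependent; may not be directly
//                                   // dereferenceable (see class comment).
//     uint64 bytes = mem.size();    // 1024
//   }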

// Typed wrapper around "void *"-like DeviceMemoryBase.
//
// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
// that represents one or more integers in Device memory.
//
// Thread-compatible.
template <typename ElemT>
class DeviceMemory final : public DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized memory region.
  DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
  DeviceMemory(std::nullptr_t) : DeviceMemory() {}

  // Typed device memory regions may be constructed from untyped device memory
  // regions; this effectively amounts to a cast from a void*.
  explicit DeviceMemory(const DeviceMemoryBase &other)
      : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
                         other.size(), other.is_sub_buffer()) {}

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / sizeof(ElemT); }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }

  // Create a typed area of DeviceMemory with a given opaque pointer and the
  // quantity of bytes in the allocation. This function is broken out to
  // distinguish bytes from an element count.
  static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
    return DeviceMemory<ElemT>(opaque, bytes);
  }

  // Resets the DeviceMemory data, in MakeFromByteSize fashion.
  // This simply clobbers the prior values.
  void ResetFromByteSize(void *opaque, uint64 bytes) {
    // TODO(leary) when NVCC is eliminated we can add this check (and the
    // logging include it requires).
    // CHECK_EQ(0, bytes % sizeof(ElemT));
    DeviceMemoryBase::Reset(opaque, bytes);
  }

  // ------------------------------------------------------------

 protected:
  // This constructor is solely used from derived classes; it is made protected
  // because it accepts a byte-size instead of an element count, which could
  // potentially be misused given the ElementCount() nature of this interface.
  //
  // In order to specify the desire to use byte size instead of element count
  // explicitly, use MakeFromByteSize.
  DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
};
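
// Example (illustrative only; `opaque_ptr` stands for a platform handle to an
// existing 1024-byte allocation obtained through a StreamExecutor):
//
//   DeviceMemoryBase raw(opaque_ptr, /*size=*/1024);
//   DeviceMemory<float> typed(raw);    // View the same bytes as floats.
//   uint64 n = typed.ElementCount();   // 1024 / sizeof(float) == 256
//
//   // When starting from a byte count, be explicit about the units:
//   DeviceMemory<float> from_bytes =
//       DeviceMemory<float>::MakeFromByteSize(opaque_ptr, /*bytes=*/1024);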

// A class to encapsulate the type and size of a dynamic shared memory
// buffer. Because the buffer exists solely on the device and is not copyable
// to the host, memory objects of this type do not maintain buffer pointers
// on the host.
template <typename ElemT>
class SharedDeviceMemory final : public DeviceMemoryBase {
 public:
  explicit SharedDeviceMemory(uint64 elem_count)
      : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}

  static constexpr size_t kElemSize = sizeof(ElemT);

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / kElemSize; }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }
};
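
// Example (illustrative only): describing 256 floats of dynamic shared memory.
// Per the class comment above, no host-side pointer is held -- the object only
// records the element type and total size.
//
//   SharedDeviceMemory<float> shared_floats(/*elem_count=*/256);
//   uint64 bytes = shared_floats.size();          // 256 * sizeof(float)
//   uint64 count = shared_floats.ElementCount();  // 256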

// Similar to the typed DeviceMemory, but is the unique owner of its
// memory, if any. ScopedDeviceMemory is thread-compatible. It is also
// movable and uncopyable to represent unique ownership.
template <typename ElemT>
class ScopedDeviceMemory {
 public:
  // Default construction initializes the internal state to nullptr.  This
  // mirrors the std::unique_ptr<> functionality, where default construction
  // produces a nullptr unique_ptr, which can be assigned later.
  ScopedDeviceMemory();

  // Parameters:
  //  parent: Executor used to deallocate memory when this instance goes
  //          out of scope.
  //  value: Already-allocated device memory value for this scoped mechanism to
  //         deallocate. This memory must have been allocated by parent.
  ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value);

  // Constructor overload that places a literal array into device memory.
  ScopedDeviceMemory(StreamExecutor *parent,
                     std::initializer_list<ElemT> values);

  // Moves ownership of the memory from other to the constructed
  // object.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept
      : ScopedDeviceMemory(other.parent_, other.Release()) {}

  // Releases the memory that was provided in the constructor, through the
  // "parent" StreamExecutor.
  ~ScopedDeviceMemory();

  // Moves ownership of the memory from other to this object.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) {
    Reset(other.Release());
    parent_ = other.parent_;
    return *this;
  }

  // Returns the memory that backs this scoped allocation converted to
  // DeviceMemory<T> apparent type. This is useful for cases where the
  // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't
  // allow copying, for scoped-object-lifetime reasons.
  const DeviceMemory<ElemT> &cref() const { return wrapped_; }

  // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable
  // operations. The value returned should not be used outside the scope of this
  // ScopedDeviceMemory object's lifetime.
  DeviceMemory<ElemT> *ptr() { return &wrapped_; }
  const DeviceMemory<ElemT> *ptr() const { return &wrapped_; }

  // Smart-pointer-like operators for the wrapped DeviceMemory.
  // This reference must not be used outside the lifetime of this
  // ScopedDeviceMemory.
  const DeviceMemory<ElemT> &operator*() const { return cref(); }
  DeviceMemory<ElemT> *operator->() { return ptr(); }
  const DeviceMemory<ElemT> *operator->() const { return ptr(); }
  bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); }
  bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); }

  // Analogous to std::unique_ptr::reset, frees the existing memory held in
  // this scoped memory container and replaces it with updated. Ownership
  // of updated is transferred to this object.
  void Reset(DeviceMemory<ElemT> updated);
  void Reset(std::nullptr_t);

  // Analogous to std::unique_ptr::release, releases ownership of the held
  // memory and transfers it to the caller.
  //
  // Postcondition: *this == nullptr
  DeviceMemory<ElemT> Release() {
    auto tmp = wrapped_;
    wrapped_.ResetFromByteSize(nullptr, 0);
    return tmp;
  }

 private:
  DeviceMemory<ElemT> wrapped_;  // Value we wrap with scoped-release.
  StreamExecutor *parent_;       // See constructor.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
};
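
// Example (illustrative only; assumes `executor` is a valid StreamExecutor*
// and `Consume` is a hypothetical function taking const DeviceMemory<int>&):
//
//   {
//     ScopedDeviceMemory<int> scoped(executor, {1, 2, 3});
//     Consume(scoped.cref());             // Const-ref view, no copy made.
//     uint64 n = scoped->ElementCount();  // Smart-pointer-like access: 3.
//   }  // Unless Release()d, the destructor deallocates through `executor`.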

// Host-side representation of packed-and-aligned vector datatypes on the device
// side. Since these can appear in device kernel signatures, we support passing
// them as arguments in kernel launch signatures.

struct Float2 {
  float x, y;
};

struct Float4 {
  Float2 xz, yw;
};

struct Double2 {
  double x, y;
};

static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");
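
// Example (illustrative only): populating a packed pair on the host so that
// the value can later be forwarded in a kernel launch signature.
//
//   Double2 alpha;
//   alpha.x = 1.0;
//   alpha.y = 0.0;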

}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_