/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include <initializer_list>

#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
//
// This is effectively a two-tuple of a pointer and size; however, note that the
// pointer may not be to the virtual address itself -- in OpenCL the pointer is
// to a cl_mem handle that describes the device allocation. Therefore,
// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
// referenced directly, so use it with caution.
//
// Thread-compatible.
class DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized device memory
  // region. An opaque pointer may be provided -- see header for details on the
  // opacity of that pointer.
  explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0,
                            bool is_sub_buffer = false)
      : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {}

  // Returns whether the backing memory is the null pointer.
  // A `== nullptr` convenience method is also provided.
  bool is_null() const { return opaque_ == nullptr; }
  bool operator==(std::nullptr_t other) const { return is_null(); }
  bool operator!=(std::nullptr_t other) const { return !is_null(); }

  // Provides a partial order between device memory values.
  //
  // This operator is provided so that this object can be used as a key in an
  // ordered map.
  bool operator<(const DeviceMemoryBase &other) const {
    return opaque() < other.opaque();
  }

  // Returns the size, in bytes, for the backing memory.
  uint64 size() const { return size_; }

  // Warning: the value returned is not necessarily a pointer into device
  // virtual address space; what it refers to is platform-dependent.
  void *opaque() { return opaque_; }
  const void *opaque() const { return opaque_; }

  // Returns true if this is an offset into another primary allocation.
  bool is_sub_buffer() const { return is_sub_buffer_; }

  // Returns whether the two DeviceMemoryBase segments are identical (both in
  // their opaque pointer and size).
  bool IsSameAs(const DeviceMemoryBase &other) const {
    return opaque() == other.opaque() && size() == other.size();
  }

 protected:
  friend class StreamExecutor;

  // Resets the internal values of the opaque pointer and number of bytes in the
  // memory region, just as in the constructor.
  void Reset(void *opaque, uint64 bytes) {
    opaque_ = opaque;
    size_ = bytes;
  }

 private:
  void *opaque_;  // Platform-dependent value representing allocated memory.
  uint64 size_;   // Size in bytes of this allocation.
  bool is_sub_buffer_;  // Is this a primary allocation or a sub-buffer?
};
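
// Illustrative sketch (not part of the API): constructing and querying an
// untyped region. The `platform_handle` value below is a placeholder for
// whatever the underlying platform's allocator returns; it is an assumption
// for the example, not something defined by this header.
//
//   void *platform_handle = ...;  // e.g. a device pointer or cl_mem handle
//   DeviceMemoryBase raw(platform_handle, /*size=*/256);
//   if (raw != nullptr) {
//     uint64 bytes = raw.size();    // 256
//     void *opaque = raw.opaque();  // platform-dependent; may not be directly
//                                   // dereferenceable (see class comment)
//   }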

// Typed wrapper around "void *"-like DeviceMemoryBase.
//
// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase
// that represents one or more integers in Device memory.
//
// Thread-compatible.
template <typename ElemT>
class DeviceMemory final : public DeviceMemoryBase {
 public:
  // Default constructor instantiates a null-pointed, zero-sized memory region.
  DeviceMemory() : DeviceMemoryBase(nullptr, 0) {}
  DeviceMemory(std::nullptr_t) : DeviceMemory() {}

  // Typed device memory regions may be constructed from untyped device memory
  // regions; this effectively amounts to a cast from a void*.
  explicit DeviceMemory(const DeviceMemoryBase &other)
      : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(),
                         other.size(), other.is_sub_buffer()) {}

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / sizeof(ElemT); }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }

  // Create a typed area of DeviceMemory with a given opaque pointer and the
  // quantity of bytes in the allocation. This function is broken out to
  // distinguish bytes from an element count.
  static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) {
    return DeviceMemory<ElemT>(opaque, bytes);
  }

  // Resets the DeviceMemory data, in MakeFromByteSize fashion.
  // This simply clobbers the prior values.
  void ResetFromByteSize(void *opaque, uint64 bytes) {
    // TODO(leary) when NVCC is eliminated we can add this check (and the
    // logging include it requires).
    // CHECK_EQ(0, bytes % sizeof(ElemT));
    DeviceMemoryBase::Reset(opaque, bytes);
  }

  // ------------------------------------------------------------

 protected:
  // This constructor is solely used by derived classes; it is made protected
  // because it accepts a byte size instead of an element count, which could
  // potentially be misused given the ElementCount() nature of this interface.
  //
  // To explicitly indicate that a byte size (rather than an element count) is
  // intended, use MakeFromByteSize.
  DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {}
};
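
// Illustrative sketch (not part of the API): viewing untyped memory as a typed
// region. `raw` is assumed to be a DeviceMemoryBase describing a 1024-byte
// allocation obtained elsewhere.
//
//   DeviceMemory<float> typed(raw);
//   uint64 elems = typed.ElementCount();  // 1024 / sizeof(float) == 256
//
//   // When only an opaque pointer and a byte count are at hand, make the
//   // byte-size interpretation explicit:
//   DeviceMemory<float> also_typed =
//       DeviceMemory<float>::MakeFromByteSize(raw.opaque(), raw.size());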

// A class to encapsulate the type and size of a dynamic shared memory
// buffer. Because the buffer exists solely on the device and is not copyable
// to the host, memory objects of this type do not maintain buffer pointers
// on the host.
template <typename ElemT>
class SharedDeviceMemory final : public DeviceMemoryBase {
 public:
  explicit SharedDeviceMemory(uint64 elem_count)
      : DeviceMemoryBase(nullptr, elem_count * kElemSize) {}

  static constexpr size_t kElemSize = sizeof(ElemT);

  // Returns the number of elements of type ElemT that constitute this
  // allocation.
  uint64 ElementCount() const { return size() / kElemSize; }

  // Returns whether this is a single-element allocation.
  bool IsScalar() const { return ElementCount() == 1; }
};
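
// Illustrative sketch (not part of the API): describing a dynamic shared
// memory buffer of 128 floats. No host-side pointer is held; only the element
// type and count (and hence the byte size) are meaningful.
//
//   SharedDeviceMemory<float> shmem(/*elem_count=*/128);
//   uint64 elems = shmem.ElementCount();  // 128
//   uint64 bytes = shmem.size();          // 128 * sizeof(float)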

// Similar to the typed DeviceMemory, but is the unique owner of its
// memory, if any. ScopedDeviceMemory is thread-compatible. It is also
// movable and uncopyable to represent unique ownership.
template <typename ElemT>
class ScopedDeviceMemory {
 public:
  // Default construction initializes the internal state to nullptr.  This
  // mirrors the std::unique_ptr<> functionality, where default construction
  // produces a nullptr unique_ptr, which can be assigned later.
  ScopedDeviceMemory();

  // Parameters:
  //  parent: Executor used to deallocate memory when this instance goes
  //          out of scope.
  //  value: Already-allocated device memory value for this scoped mechanism to
  //         deallocate. This memory must have been allocated by parent.
  ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value);

  // Constructor overload that places a literal array into device memory.
  ScopedDeviceMemory(StreamExecutor *parent,
                     std::initializer_list<ElemT> values);

  // Moves ownership of the memory from other to the constructed
  // object.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept
      : ScopedDeviceMemory(other.parent_, other.Release()) {}

  // Releases the memory that was provided in the constructor, through the
  // "parent" StreamExecutor.
  ~ScopedDeviceMemory();

  // Moves ownership of the memory from other to this object.
  //
  // Postcondition: other == nullptr.
  ScopedDeviceMemory &operator=(ScopedDeviceMemory &&other) {
    Reset(other.Release());
    parent_ = other.parent_;
    return *this;
  }

  // Returns the memory that backs this scoped allocation converted to
  // DeviceMemory<T> apparent type. This is useful for cases where the
  // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't
  // allow copying, for scoped-object-lifetime reasons.
  const DeviceMemory<ElemT> &cref() const { return wrapped_; }

  // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable
  // operations. The value returned should not be used outside the scope of this
  // ScopedDeviceMemory object's lifetime.
  DeviceMemory<ElemT> *ptr() { return &wrapped_; }
  const DeviceMemory<ElemT> *ptr() const { return &wrapped_; }

  // Smart-pointer-like operators for the wrapped DeviceMemory.
  // This reference must not be used outside the lifetime of this
  // ScopedDeviceMemory.
  const DeviceMemory<ElemT> &operator*() const { return cref(); }
  DeviceMemory<ElemT> *operator->() { return ptr(); }
  const DeviceMemory<ElemT> *operator->() const { return ptr(); }
  bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); }
  bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); }

  // Analogous to std::unique_ptr::reset, frees the existing memory held in
  // this scoped memory container and replaces it with updated. Ownership
  // of updated is transferred to this object.
  void Reset(DeviceMemory<ElemT> updated);
  void Reset(std::nullptr_t);

  // Analogous to std::unique_ptr::release, releases ownership of the held
  // memory and transfers it to the caller.
  //
  // Postcondition: *this == nullptr
  DeviceMemory<ElemT> Release() {
    auto tmp = wrapped_;
    wrapped_.ResetFromByteSize(nullptr, 0);
    return tmp;
  }

 private:
  DeviceMemory<ElemT> wrapped_;  // Value we wrap with scoped-release.
  StreamExecutor *parent_;       // See constructor.

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
};
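
// Illustrative sketch (not part of the API): unique ownership with automatic
// deallocation. `executor` is assumed to be a live StreamExecutor* and `value`
// a DeviceMemoryBase that was allocated by it, as the constructor requires.
//
//   ScopedDeviceMemory<float> owned(executor, value);
//   const DeviceMemory<float> &mem = *owned;  // borrow; copying is disallowed
//   ScopedDeviceMemory<float> moved(std::move(owned));
//   // Postcondition: owned == nullptr; the memory is freed through `executor`
//   // when `moved` goes out of scope.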

// Host-side representations of packed-and-aligned vector datatypes on the
// device side. Since these can appear in device kernel signatures, we support
// passing values of these types as kernel launch arguments.

struct Float2 {
  float x, y;
};

struct Float4 {
  Float2 xz, yw;
};

struct Double2 {
  double x, y;
};

static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");
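
// Illustrative sketch (not part of the API): building a host-side value to
// pass where a device kernel expects a packed two-float vector. The
// static_asserts above guarantee the host layout matches the packed device
// layout.
//
//   Float2 alpha;
//   alpha.x = 1.0f;
//   alpha.y = 0.5f;
//   // `alpha` can now appear as a kernel launch argument of type Float2.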

}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_